koichi12 committed on
Commit
6f8c8ab
·
verified ·
1 Parent(s): edb79de

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitattributes +1 -0
  2. .venv/lib/python3.11/site-packages/numpy/ma/tests/__pycache__/test_extras.cpython-311.pyc +3 -0
  3. .venv/lib/python3.11/site-packages/ray/dashboard/__init__.py +0 -0
  4. .venv/lib/python3.11/site-packages/ray/dashboard/agent.py +465 -0
  5. .venv/lib/python3.11/site-packages/ray/dashboard/consts.py +91 -0
  6. .venv/lib/python3.11/site-packages/ray/dashboard/dashboard.py +275 -0
  7. .venv/lib/python3.11/site-packages/ray/dashboard/dashboard_metrics.py +123 -0
  8. .venv/lib/python3.11/site-packages/ray/dashboard/datacenter.py +285 -0
  9. .venv/lib/python3.11/site-packages/ray/dashboard/head.py +351 -0
  10. .venv/lib/python3.11/site-packages/ray/dashboard/http_server_agent.py +83 -0
  11. .venv/lib/python3.11/site-packages/ray/dashboard/http_server_head.py +289 -0
  12. .venv/lib/python3.11/site-packages/ray/dashboard/k8s_utils.py +111 -0
  13. .venv/lib/python3.11/site-packages/ray/dashboard/memory_utils.py +524 -0
  14. .venv/lib/python3.11/site-packages/ray/dashboard/modules/__init__.py +0 -0
  15. .venv/lib/python3.11/site-packages/ray/dashboard/modules/dashboard_sdk.py +418 -0
  16. .venv/lib/python3.11/site-packages/ray/dashboard/modules/data/__pycache__/__init__.cpython-311.pyc +0 -0
  17. .venv/lib/python3.11/site-packages/ray/dashboard/modules/data/__pycache__/data_head.cpython-311.pyc +0 -0
  18. .venv/lib/python3.11/site-packages/ray/dashboard/modules/data/data_head.py +167 -0
  19. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__init__.py +0 -0
  20. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/cli.cpython-311.pyc +0 -0
  21. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/cli_utils.cpython-311.pyc +0 -0
  22. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_agent.cpython-311.pyc +0 -0
  23. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_head.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_manager.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_supervisor.cpython-311.pyc +0 -0
  26. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/pydantic_models.cpython-311.pyc +0 -0
  27. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/cli.py +521 -0
  28. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/cli_utils.py +56 -0
  29. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/common.py +538 -0
  30. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_agent.py +211 -0
  31. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_head.py +587 -0
  32. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_log_storage_client.py +61 -0
  33. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_manager.py +640 -0
  34. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_supervisor.py +477 -0
  35. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/pydantic_models.py +110 -0
  36. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/sdk.py +492 -0
  37. .venv/lib/python3.11/site-packages/ray/dashboard/modules/job/utils.py +304 -0
  38. .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__init__.py +0 -0
  39. .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__pycache__/log_utils.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_agent.py +404 -0
  41. .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_consts.py +8 -0
  42. .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_manager.py +481 -0
  43. .venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_utils.py +9 -0
  44. .venv/lib/python3.11/site-packages/ray/dashboard/modules/node/__pycache__/node_head.cpython-311.pyc +0 -0
  45. .venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__init__.py +0 -0
  46. .venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/__init__.cpython-311.pyc +0 -0
  47. .venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/sdk.cpython-311.pyc +0 -0
  48. .venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/serve_agent.cpython-311.pyc +0 -0
  49. .venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/serve_head.cpython-311.pyc +0 -0
  50. .venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/serve_rest_api_impl.cpython-311.pyc +0 -0
.gitattributes CHANGED
@@ -154,3 +154,4 @@ tuning-competition-baseline/.venv/lib/python3.11/site-packages/torch/_inductor/_
154
  .venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json filter=lfs diff=lfs merge=lfs -text
155
  .venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
156
  .venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
 
 
154
  .venv/lib/python3.11/site-packages/mistral_common/data/tekken_240911.json filter=lfs diff=lfs merge=lfs -text
155
  .venv/lib/python3.11/site-packages/ray/data/__pycache__/dataset.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
156
  .venv/lib/python3.11/site-packages/ray/data/__pycache__/read_api.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
157
+ .venv/lib/python3.11/site-packages/numpy/ma/tests/__pycache__/test_extras.cpython-311.pyc filter=lfs diff=lfs merge=lfs -text
.venv/lib/python3.11/site-packages/numpy/ma/tests/__pycache__/test_extras.cpython-311.pyc ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7d5a3dfa5e053841216226fbf83943d5cd4c680ae8ea252c2354cd124c900752
3
+ size 143846
.venv/lib/python3.11/site-packages/ray/dashboard/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/dashboard/agent.py ADDED
@@ -0,0 +1,465 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import asyncio
3
+ import json
4
+ import logging
5
+ import logging.handlers
6
+ import os
7
+ import pathlib
8
+ import signal
9
+ import sys
10
+
11
+ import ray
12
+ import ray._private.ray_constants as ray_constants
13
+ import ray._private.services
14
+ import ray._private.utils
15
+ import ray.dashboard.consts as dashboard_consts
16
+ import ray.dashboard.utils as dashboard_utils
17
+ from ray._private.gcs_utils import GcsAioClient
18
+ from ray._private.process_watcher import create_check_raylet_task
19
+ from ray._private.ray_constants import AGENT_GRPC_MAX_MESSAGE_LENGTH
20
+ from ray._private.ray_logging import configure_log_file, setup_component_logger
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ class DashboardAgent:
26
+ def __init__(
27
+ self,
28
+ node_ip_address,
29
+ dashboard_agent_port,
30
+ gcs_address,
31
+ cluster_id_hex,
32
+ minimal,
33
+ metrics_export_port=None,
34
+ node_manager_port=None,
35
+ listen_port=ray_constants.DEFAULT_DASHBOARD_AGENT_LISTEN_PORT,
36
+ disable_metrics_collection: bool = False,
37
+ *, # the following are required kwargs
38
+ object_store_name: str,
39
+ raylet_name: str,
40
+ log_dir: str,
41
+ temp_dir: str,
42
+ session_dir: str,
43
+ logging_params: dict,
44
+ agent_id: int,
45
+ session_name: str,
46
+ ):
47
+ """Initialize the DashboardAgent object."""
48
+ # Public attributes are accessible for all agent modules.
49
+ self.ip = node_ip_address
50
+ self.minimal = minimal
51
+
52
+ assert gcs_address is not None
53
+ self.gcs_address = gcs_address
54
+ self.cluster_id_hex = cluster_id_hex
55
+
56
+ self.temp_dir = temp_dir
57
+ self.session_dir = session_dir
58
+ self.log_dir = log_dir
59
+ self.dashboard_agent_port = dashboard_agent_port
60
+ self.metrics_export_port = metrics_export_port
61
+ self.node_manager_port = node_manager_port
62
+ self.listen_port = listen_port
63
+ self.object_store_name = object_store_name
64
+ self.raylet_name = raylet_name
65
+ self.logging_params = logging_params
66
+ self.node_id = os.environ["RAY_NODE_ID"]
67
+ self.metrics_collection_disabled = disable_metrics_collection
68
+ self.agent_id = agent_id
69
+ self.session_name = session_name
70
+
71
+ # grpc server is None in minimal.
72
+ self.server = None
73
+ # http_server is None in minimal.
74
+ self.http_server = None
75
+
76
+ # Used by the agent and sub-modules.
77
+ self.gcs_aio_client = GcsAioClient(
78
+ address=self.gcs_address,
79
+ nums_reconnect_retry=ray._config.gcs_rpc_server_reconnect_timeout_s(),
80
+ cluster_id=self.cluster_id_hex,
81
+ )
82
+
83
+ if not self.minimal:
84
+ self._init_non_minimal()
85
+
86
+ def _init_non_minimal(self):
87
+ from ray._private.gcs_pubsub import GcsAioPublisher
88
+ from ray.dashboard.http_server_agent import HttpServerAgent
89
+
90
+ self.aio_publisher = GcsAioPublisher(address=self.gcs_address)
91
+
92
+ try:
93
+ from grpc import aio as aiogrpc
94
+ except ImportError:
95
+ from grpc.experimental import aio as aiogrpc
96
+
97
+ # We would want to suppress deprecation warnings from aiogrpc library
98
+ # with the usage of asyncio.get_event_loop() in python version >=3.10
99
+ # This could be removed once https://github.com/grpc/grpc/issues/32526
100
+ # is released, and we use a version of grpcio newer than that.
101
+ if sys.version_info.major >= 3 and sys.version_info.minor >= 10:
102
+ import warnings
103
+
104
+ with warnings.catch_warnings():
105
+ warnings.simplefilter("ignore", category=DeprecationWarning)
106
+ aiogrpc.init_grpc_aio()
107
+ else:
108
+ aiogrpc.init_grpc_aio()
109
+
110
+ self.server = aiogrpc.server(
111
+ options=(
112
+ ("grpc.so_reuseport", 0),
113
+ (
114
+ "grpc.max_send_message_length",
115
+ AGENT_GRPC_MAX_MESSAGE_LENGTH,
116
+ ), # noqa
117
+ (
118
+ "grpc.max_receive_message_length",
119
+ AGENT_GRPC_MAX_MESSAGE_LENGTH,
120
+ ),
121
+ ) # noqa
122
+ )
123
+ grpc_ip = "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0"
124
+ try:
125
+ self.grpc_port = ray._private.tls_utils.add_port_to_grpc_server(
126
+ self.server, f"{grpc_ip}:{self.dashboard_agent_port}"
127
+ )
128
+ except Exception:
129
+ # TODO(SongGuyang): Catch the exception here because there is
130
+ # a port conflict issue caused by the static port. We should
131
+ # remove this after we find better port resolution.
132
+ logger.exception(
133
+ "Failed to add port to grpc server. Agent will stay alive but "
134
+ "disable the grpc service."
135
+ )
136
+ self.server = None
137
+ self.grpc_port = None
138
+ else:
139
+ logger.info("Dashboard agent grpc address: %s:%s", grpc_ip, self.grpc_port)
140
+
141
+ # If the agent is not minimal it should start the http server
142
+ # to communicate with the dashboard in a head node.
143
+ # Http server is not started in the minimal version because
144
+ # it requires additional dependencies that are not
145
+ # included in the minimal ray package.
146
+ self.http_server = HttpServerAgent(self.ip, self.listen_port)
147
+
148
+ def _load_modules(self):
149
+ """Load dashboard agent modules."""
150
+ modules = []
151
+ agent_cls_list = dashboard_utils.get_all_modules(
152
+ dashboard_utils.DashboardAgentModule
153
+ )
154
+ for cls in agent_cls_list:
155
+ logger.info(
156
+ "Loading %s: %s", dashboard_utils.DashboardAgentModule.__name__, cls
157
+ )
158
+ c = cls(self)
159
+ modules.append(c)
160
+ logger.info("Loaded %d modules.", len(modules))
161
+ return modules
162
+
163
+ @property
164
+ def http_session(self):
165
+ assert (
166
+ self.http_server
167
+ ), "Accessing unsupported API (HttpServerAgent) in a minimal ray."
168
+ return self.http_server.http_session
169
+
170
+ @property
171
+ def publisher(self):
172
+ assert (
173
+ self.aio_publisher
174
+ ), "Accessing unsupported API (GcsAioPublisher) in a minimal ray."
175
+ return self.aio_publisher
176
+
177
+ def get_node_id(self) -> str:
178
+ return self.node_id
179
+
180
+ async def run(self):
181
+ # Start a grpc asyncio server.
182
+ if self.server:
183
+ await self.server.start()
184
+
185
+ modules = self._load_modules()
186
+
187
+ if self.http_server:
188
+ try:
189
+ await self.http_server.start(modules)
190
+ except Exception:
191
+ # TODO(SongGuyang): Catch the exception here because there is
192
+ # a port conflict issue caused by the static port. We should
193
+ # remove this after we find better port resolution.
194
+ logger.exception(
195
+ "Failed to start http server. Agent will stay alive but "
196
+ "disable the http service."
197
+ )
198
+
199
+ # Writes agent address to kv.
200
+ # DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX: <node_id> -> (ip, http_port, grpc_port)
201
+ # DASHBOARD_AGENT_ADDR_IP_PREFIX: <ip> -> (node_id, http_port, grpc_port)
202
+ # -1 should indicate that http server is not started.
203
+ http_port = -1 if not self.http_server else self.http_server.http_port
204
+ grpc_port = -1 if not self.server else self.grpc_port
205
+ put_by_node_id = self.gcs_aio_client.internal_kv_put(
206
+ f"{dashboard_consts.DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX}{self.node_id}".encode(),
207
+ json.dumps([self.ip, http_port, grpc_port]).encode(),
208
+ True,
209
+ namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
210
+ )
211
+ put_by_ip = self.gcs_aio_client.internal_kv_put(
212
+ f"{dashboard_consts.DASHBOARD_AGENT_ADDR_IP_PREFIX}{self.ip}".encode(),
213
+ json.dumps([self.node_id, http_port, grpc_port]).encode(),
214
+ True,
215
+ namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
216
+ )
217
+
218
+ await asyncio.gather(put_by_node_id, put_by_ip)
219
+
220
+ tasks = [m.run(self.server) for m in modules]
221
+
222
+ if sys.platform not in ["win32", "cygwin"]:
223
+
224
+ def callback(msg):
225
+ logger.info(
226
+ f"Terminated Raylet: ip={self.ip}, node_id={self.node_id}. {msg}"
227
+ )
228
+
229
+ check_parent_task = create_check_raylet_task(
230
+ self.log_dir, self.gcs_address, callback, loop
231
+ )
232
+ tasks.append(check_parent_task)
233
+
234
+ if self.server:
235
+ tasks.append(self.server.wait_for_termination())
236
+ else:
237
+
238
+ async def wait_forever():
239
+ while True:
240
+ await asyncio.sleep(3600)
241
+
242
+ tasks.append(wait_forever())
243
+
244
+ await asyncio.gather(*tasks)
245
+
246
+ if self.http_server:
247
+ await self.http_server.cleanup()
248
+
249
+
250
+ def open_capture_files(log_dir):
251
+ filename = f"agent-{args.agent_id}"
252
+ return (
253
+ ray._private.utils.open_log(pathlib.Path(log_dir) / f"{filename}.out"),
254
+ ray._private.utils.open_log(pathlib.Path(log_dir) / f"{filename}.err"),
255
+ )
256
+
257
+
258
+ if __name__ == "__main__":
259
+ parser = argparse.ArgumentParser(description="Dashboard agent.")
260
+ parser.add_argument(
261
+ "--node-ip-address",
262
+ required=True,
263
+ type=str,
264
+ help="the IP address of this node.",
265
+ )
266
+ parser.add_argument(
267
+ "--gcs-address", required=True, type=str, help="The address (ip:port) of GCS."
268
+ )
269
+ parser.add_argument(
270
+ "--cluster-id-hex",
271
+ required=True,
272
+ type=str,
273
+ help="The cluster id in hex.",
274
+ )
275
+ parser.add_argument(
276
+ "--metrics-export-port",
277
+ required=True,
278
+ type=int,
279
+ help="The port to expose metrics through Prometheus.",
280
+ )
281
+ parser.add_argument(
282
+ "--dashboard-agent-port",
283
+ required=True,
284
+ type=int,
285
+ help="The port on which the dashboard agent will receive GRPCs.",
286
+ )
287
+ parser.add_argument(
288
+ "--node-manager-port",
289
+ required=True,
290
+ type=int,
291
+ help="The port to use for starting the node manager",
292
+ )
293
+ parser.add_argument(
294
+ "--object-store-name",
295
+ required=True,
296
+ type=str,
297
+ default=None,
298
+ help="The socket name of the plasma store",
299
+ )
300
+ parser.add_argument(
301
+ "--listen-port",
302
+ required=False,
303
+ type=int,
304
+ default=ray_constants.DEFAULT_DASHBOARD_AGENT_LISTEN_PORT,
305
+ help="Port for HTTP server to listen on",
306
+ )
307
+ parser.add_argument(
308
+ "--raylet-name",
309
+ required=True,
310
+ type=str,
311
+ default=None,
312
+ help="The socket path of the raylet process",
313
+ )
314
+ parser.add_argument(
315
+ "--logging-level",
316
+ required=False,
317
+ type=lambda s: logging.getLevelName(s.upper()),
318
+ default=ray_constants.LOGGER_LEVEL,
319
+ choices=ray_constants.LOGGER_LEVEL_CHOICES,
320
+ help=ray_constants.LOGGER_LEVEL_HELP,
321
+ )
322
+ parser.add_argument(
323
+ "--logging-format",
324
+ required=False,
325
+ type=str,
326
+ default=ray_constants.LOGGER_FORMAT,
327
+ help=ray_constants.LOGGER_FORMAT_HELP,
328
+ )
329
+ parser.add_argument(
330
+ "--logging-filename",
331
+ required=False,
332
+ type=str,
333
+ default=dashboard_consts.DASHBOARD_AGENT_LOG_FILENAME,
334
+ help="Specify the name of log file, "
335
+ 'log to stdout if set empty, default is "{}".'.format(
336
+ dashboard_consts.DASHBOARD_AGENT_LOG_FILENAME
337
+ ),
338
+ )
339
+ parser.add_argument(
340
+ "--logging-rotate-bytes",
341
+ required=False,
342
+ type=int,
343
+ default=ray_constants.LOGGING_ROTATE_BYTES,
344
+ help="Specify the max bytes for rotating "
345
+ "log file, default is {} bytes.".format(ray_constants.LOGGING_ROTATE_BYTES),
346
+ )
347
+ parser.add_argument(
348
+ "--logging-rotate-backup-count",
349
+ required=False,
350
+ type=int,
351
+ default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
352
+ help="Specify the backup count of rotated log file, default is {}.".format(
353
+ ray_constants.LOGGING_ROTATE_BACKUP_COUNT
354
+ ),
355
+ )
356
+ parser.add_argument(
357
+ "--log-dir",
358
+ required=True,
359
+ type=str,
360
+ default=None,
361
+ help="Specify the path of log directory.",
362
+ )
363
+ parser.add_argument(
364
+ "--temp-dir",
365
+ required=True,
366
+ type=str,
367
+ default=None,
368
+ help="Specify the path of the temporary directory use by Ray process.",
369
+ )
370
+ parser.add_argument(
371
+ "--session-dir",
372
+ required=True,
373
+ type=str,
374
+ default=None,
375
+ help="Specify the path of this session.",
376
+ )
377
+
378
+ parser.add_argument(
379
+ "--minimal",
380
+ action="store_true",
381
+ help=(
382
+ "Minimal agent only contains a subset of features that don't "
383
+ "require additional dependencies installed when ray is installed "
384
+ "by `pip install 'ray[default]'`."
385
+ ),
386
+ )
387
+ parser.add_argument(
388
+ "--disable-metrics-collection",
389
+ action="store_true",
390
+ help=("If this arg is set, metrics report won't be enabled from the agent."),
391
+ )
392
+ parser.add_argument(
393
+ "--agent-id",
394
+ required=True,
395
+ type=int,
396
+ help="ID to report when registering with raylet",
397
+ default=os.getpid(),
398
+ )
399
+ parser.add_argument(
400
+ "--session-name",
401
+ required=False,
402
+ type=str,
403
+ default=None,
404
+ help="The session name (cluster id) of this cluster.",
405
+ )
406
+
407
+ args = parser.parse_args()
408
+
409
+ try:
410
+ logging_params = dict(
411
+ logging_level=args.logging_level,
412
+ logging_format=args.logging_format,
413
+ log_dir=args.log_dir,
414
+ filename=args.logging_filename,
415
+ max_bytes=args.logging_rotate_bytes,
416
+ backup_count=args.logging_rotate_backup_count,
417
+ )
418
+ logger = setup_component_logger(**logging_params)
419
+
420
+ # Initialize event loop, see Dashboard init code for caveat
421
+ # w.r.t grpc server init in the DashboardAgent initializer.
422
+ loop = ray._private.utils.get_or_create_event_loop()
423
+
424
+ # Setup stdout/stderr redirect files
425
+ out_file, err_file = open_capture_files(args.log_dir)
426
+ configure_log_file(out_file, err_file)
427
+
428
+ agent = DashboardAgent(
429
+ args.node_ip_address,
430
+ args.dashboard_agent_port,
431
+ args.gcs_address,
432
+ args.cluster_id_hex,
433
+ args.minimal,
434
+ temp_dir=args.temp_dir,
435
+ session_dir=args.session_dir,
436
+ log_dir=args.log_dir,
437
+ metrics_export_port=args.metrics_export_port,
438
+ node_manager_port=args.node_manager_port,
439
+ listen_port=args.listen_port,
440
+ object_store_name=args.object_store_name,
441
+ raylet_name=args.raylet_name,
442
+ logging_params=logging_params,
443
+ disable_metrics_collection=args.disable_metrics_collection,
444
+ agent_id=args.agent_id,
445
+ session_name=args.session_name,
446
+ )
447
+
448
+ def sigterm_handler():
449
+ logger.warning("Exiting with SIGTERM immediately...")
450
+ # Exit with a non-zero code: exit code 0 would be considered an expected shutdown.
451
+ os._exit(signal.SIGTERM)
452
+
453
+ if sys.platform != "win32":
454
+ # TODO(rickyyx): we currently do not have any logic for actual
455
+ # graceful termination in the agent. Most of the underlying
456
+ # async tasks run by the agent head don't handle CancelledError.
457
+ # So a truly graceful shutdown is not trivial w/o much refactoring.
458
+ # Re-open the issue: https://github.com/ray-project/ray/issues/25518
459
+ # if a truly graceful shutdown is required.
460
+ loop.add_signal_handler(signal.SIGTERM, sigterm_handler)
461
+
462
+ loop.run_until_complete(agent.run())
463
+ except Exception:
464
+ logger.exception("Agent is working abnormally. It will exit immediately.")
465
+ exit(1)
.venv/lib/python3.11/site-packages/ray/dashboard/consts.py ADDED
@@ -0,0 +1,91 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+ from ray._private.ray_constants import env_bool, env_integer
4
+
5
+ DASHBOARD_LOG_FILENAME = "dashboard.log"
6
+ DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX = "DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX:"
7
+ DASHBOARD_AGENT_ADDR_IP_PREFIX = "DASHBOARD_AGENT_ADDR_IP_PREFIX:"
8
+ DASHBOARD_AGENT_LOG_FILENAME = "dashboard_agent.log"
9
+ DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S_ENV_NAME = (
10
+ "RAY_DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S" # noqa
11
+ )
12
+ DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S = env_integer(
13
+ DASHBOARD_AGENT_CHECK_PARENT_INTERVAL_S_ENV_NAME, 0.4
14
+ )
15
+ # The maximum time that parent can be considered
16
+ # as dead before agent kills itself.
17
+ _PARENT_DEATH_THREASHOLD = 5
18
+ RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME = "RAY_STATE_SERVER_MAX_HTTP_REQUEST"
19
+ # Default number of in-progress requests to the state api server.
20
+ RAY_STATE_SERVER_MAX_HTTP_REQUEST = env_integer(
21
+ RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME, 100
22
+ )
23
+ # Maximum number of in-progress requests that may be configured.
24
+ RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED = 1000
25
+
26
+ RAY_DASHBOARD_STATS_PURGING_INTERVAL = env_integer(
27
+ "RAY_DASHBOARD_STATS_PURGING_INTERVAL", 60 * 10
28
+ )
29
+ RAY_DASHBOARD_STATS_UPDATING_INTERVAL = env_integer(
30
+ "RAY_DASHBOARD_STATS_UPDATING_INTERVAL", 15
31
+ )
32
+ DASHBOARD_RPC_ADDRESS = "dashboard_rpc"
33
+ DASHBOARD_RPC_PORT = env_integer("RAY_DASHBOARD_RPC_PORT", 0)
34
+ GCS_SERVER_ADDRESS = "GcsServerAddress"
35
+ # GCS check alive
36
+ GCS_CHECK_ALIVE_INTERVAL_SECONDS = env_integer("GCS_CHECK_ALIVE_INTERVAL_SECONDS", 5)
37
+ GCS_RPC_TIMEOUT_SECONDS = env_integer("RAY_DASHBOARD_GCS_RPC_TIMEOUT_SECONDS", 60)
38
+ # aiohttp_cache
39
+ AIOHTTP_CACHE_TTL_SECONDS = 2
40
+ AIOHTTP_CACHE_MAX_SIZE = 128
41
+ AIOHTTP_CACHE_DISABLE_ENVIRONMENT_KEY = "RAY_DASHBOARD_NO_CACHE"
42
+ # Default value for datacenter (the default value in protobuf)
43
+ DEFAULT_LANGUAGE = "PYTHON"
44
+ DEFAULT_JOB_ID = "ffff"
45
+ # Hook that is invoked on the dashboard `/api/component_activities` endpoint.
46
+ # Environment variable stored here should be a callable that does not
47
+ # take any arguments and should return a dictionary mapping
48
+ # activity component type (str) to
49
+ # ray.dashboard.modules.snapshot.snapshot_head.RayActivityResponse.
50
+ # Example: "your.module.ray_cluster_activity_hook".
51
+ RAY_CLUSTER_ACTIVITY_HOOK = "RAY_CLUSTER_ACTIVITY_HOOK"
52
+
53
+ # The number of candidate agents
54
+ CANDIDATE_AGENT_NUMBER = max(env_integer("CANDIDATE_AGENT_NUMBER", 1), 1)
55
+ # When the head receives a JobSubmitRequest, there may be no agent available,
56
+ # so we need to wait for agents on other nodes to start.
57
+ WAIT_AVAILABLE_AGENT_TIMEOUT = 10
58
+ TRY_TO_GET_AGENT_INFO_INTERVAL_SECONDS = 0.5
59
+ RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES_ENV_VAR = "RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES"
60
+ RAY_STREAM_RUNTIME_ENV_LOG_TO_JOB_DRIVER_LOG_ENV_VAR = (
61
+ "RAY_STREAM_RUNTIME_ENV_LOG_TO_JOB_DRIVER_LOG"
62
+ )
63
+
64
+ # The max time to wait for the JobSupervisor to start before failing the job.
65
+ DEFAULT_JOB_START_TIMEOUT_SECONDS = 60 * 15
66
+ RAY_JOB_START_TIMEOUT_SECONDS_ENV_VAR = "RAY_JOB_START_TIMEOUT_SECONDS"
67
+ # Port that dashboard prometheus metrics will be exported to
68
+ DASHBOARD_METRIC_PORT = env_integer("DASHBOARD_METRIC_PORT", 44227)
69
+
70
+ NODE_TAG_KEYS = ["ip", "Version", "SessionName", "IsHeadNode"]
71
+ GPU_TAG_KEYS = NODE_TAG_KEYS + ["GpuDeviceName", "GpuIndex"]
72
+ CLUSTER_TAG_KEYS = ["node_type", "Version", "SessionName"]
73
+ COMPONENT_METRICS_TAG_KEYS = ["ip", "pid", "Version", "Component", "SessionName"]
74
+
75
+ # Dashboard metrics are tracked separately at the dashboard. TODO(sang): Support GCS.
76
+ AVAILABLE_COMPONENT_NAMES_FOR_METRICS = {
77
+ "workers",
78
+ "raylet",
79
+ "agent",
80
+ "dashboard",
81
+ "gcs",
82
+ }
83
+ METRICS_INPUT_ROOT = os.path.join(
84
+ os.path.dirname(__file__), "modules", "metrics", "export"
85
+ )
86
+ PROMETHEUS_CONFIG_INPUT_PATH = os.path.join(
87
+ METRICS_INPUT_ROOT, "prometheus", "prometheus.yml"
88
+ )
89
+ PARENT_HEALTH_CHECK_BY_PIPE = env_bool(
90
+ "RAY_enable_pipe_based_agent_to_parent_health_check", False
91
+ )
.venv/lib/python3.11/site-packages/ray/dashboard/dashboard.py ADDED
@@ -0,0 +1,275 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import logging
3
+ import logging.handlers
4
+ import os
5
+ import platform
6
+ import signal
7
+ import sys
8
+ import traceback
9
+ from typing import Optional, Set
10
+
11
+ import ray._private.ray_constants as ray_constants
12
+ import ray._private.services
13
+ import ray._private.utils
14
+ import ray.dashboard.consts as dashboard_consts
15
+ import ray.dashboard.head as dashboard_head
16
+ import ray.dashboard.utils as dashboard_utils
17
+ from ray._private.ray_logging import setup_component_logger
18
+
19
+ # Logger for this module. It should be configured at the entry point
20
+ # into the program using Ray. Ray provides a default configuration at
21
+ # entry/init points.
22
+ logger = logging.getLogger(__name__)
23
+
24
+
25
+ class Dashboard:
26
+ """A dashboard process for monitoring Ray nodes.
27
+
28
+ This dashboard is made up of a REST API which collates data published by
29
+ Reporter processes on nodes into a json structure, and a webserver
30
+ which polls said API for display purposes.
31
+
32
+ Args:
33
+ host: Host address of dashboard aiohttp server.
34
+ port: Port number of dashboard aiohttp server.
35
+ port_retries: The retry times to select a valid port.
36
+ gcs_address: GCS address of the cluster.
37
+ cluster_id_hex: Cluster ID hex string.
38
+ grpc_port: Port used to listen for gRPC on.
39
+ node_ip_address: The IP address of the dashboard.
40
+ serve_frontend: If configured, frontend HTML
41
+ is not served from the dashboard.
42
+ log_dir: Log directory of dashboard.
43
+ """
44
+
45
+ def __init__(
46
+ self,
47
+ host: str,
48
+ port: int,
49
+ port_retries: int,
50
+ gcs_address: str,
51
+ cluster_id_hex: str,
52
+ grpc_port: int,
53
+ node_ip_address: str,
54
+ log_dir: str = None,
55
+ temp_dir: str = None,
56
+ session_dir: str = None,
57
+ minimal: bool = False,
58
+ serve_frontend: bool = True,
59
+ modules_to_load: Optional[Set[str]] = None,
60
+ ):
61
+ self.dashboard_head = dashboard_head.DashboardHead(
62
+ http_host=host,
63
+ http_port=port,
64
+ http_port_retries=port_retries,
65
+ gcs_address=gcs_address,
66
+ cluster_id_hex=cluster_id_hex,
67
+ node_ip_address=node_ip_address,
68
+ grpc_port=grpc_port,
69
+ log_dir=log_dir,
70
+ temp_dir=temp_dir,
71
+ session_dir=session_dir,
72
+ minimal=minimal,
73
+ serve_frontend=serve_frontend,
74
+ modules_to_load=modules_to_load,
75
+ )
76
+
77
+ async def run(self):
78
+ await self.dashboard_head.run()
79
+
80
+
81
+ if __name__ == "__main__":
82
+ parser = argparse.ArgumentParser(description="Ray dashboard.")
83
+ parser.add_argument(
84
+ "--host", required=True, type=str, help="The host to use for the HTTP server."
85
+ )
86
+ parser.add_argument(
87
+ "--port", required=True, type=int, help="The port to use for the HTTP server."
88
+ )
89
+ parser.add_argument(
90
+ "--port-retries",
91
+ required=False,
92
+ type=int,
93
+ default=0,
94
+ help="The retry times to select a valid port.",
95
+ )
96
+ parser.add_argument(
97
+ "--gcs-address", required=True, type=str, help="The address (ip:port) of GCS."
98
+ )
99
+ parser.add_argument(
100
+ "--cluster-id-hex", required=True, type=str, help="The cluster ID in hex."
101
+ )
102
+ parser.add_argument(
103
+ "--grpc-port",
104
+ required=False,
105
+ type=int,
106
+ default=dashboard_consts.DASHBOARD_RPC_PORT,
107
+ help="The port for the dashboard to listen for gRPC on.",
108
+ )
109
+ parser.add_argument(
110
+ "--node-ip-address",
111
+ required=True,
112
+ type=str,
113
+ help="The IP address of the node where this is running.",
114
+ )
115
+ parser.add_argument(
116
+ "--logging-level",
117
+ required=False,
118
+ type=lambda s: logging.getLevelName(s.upper()),
119
+ default=ray_constants.LOGGER_LEVEL,
120
+ choices=ray_constants.LOGGER_LEVEL_CHOICES,
121
+ help=ray_constants.LOGGER_LEVEL_HELP,
122
+ )
123
+ parser.add_argument(
124
+ "--logging-format",
125
+ required=False,
126
+ type=str,
127
+ default=ray_constants.LOGGER_FORMAT,
128
+ help=ray_constants.LOGGER_FORMAT_HELP,
129
+ )
130
+ parser.add_argument(
131
+ "--logging-filename",
132
+ required=False,
133
+ type=str,
134
+ default=dashboard_consts.DASHBOARD_LOG_FILENAME,
135
+ help="Specify the name of log file, "
136
+ 'log to stdout if set empty, default is "{}"'.format(
137
+ dashboard_consts.DASHBOARD_LOG_FILENAME
138
+ ),
139
+ )
140
+ parser.add_argument(
141
+ "--logging-rotate-bytes",
142
+ required=False,
143
+ type=int,
144
+ default=ray_constants.LOGGING_ROTATE_BYTES,
145
+ help="Specify the max bytes for rotating "
146
+ "log file, default is {} bytes.".format(ray_constants.LOGGING_ROTATE_BYTES),
147
+ )
148
+ parser.add_argument(
149
+ "--logging-rotate-backup-count",
150
+ required=False,
151
+ type=int,
152
+ default=ray_constants.LOGGING_ROTATE_BACKUP_COUNT,
153
+ help="Specify the backup count of rotated log file, default is {}.".format(
154
+ ray_constants.LOGGING_ROTATE_BACKUP_COUNT
155
+ ),
156
+ )
157
+ parser.add_argument(
158
+ "--log-dir",
159
+ required=True,
160
+ type=str,
161
+ default=None,
162
+ help="Specify the path of log directory.",
163
+ )
164
+ parser.add_argument(
165
+ "--temp-dir",
166
+ required=True,
167
+ type=str,
168
+ default=None,
169
+ help="Specify the path of the temporary directory use by Ray process.",
170
+ )
171
+ parser.add_argument(
172
+ "--session-dir",
173
+ required=True,
174
+ type=str,
175
+ default=None,
176
+ help="Specify the path of the session directory of the cluster.",
177
+ )
178
+ parser.add_argument(
179
+ "--minimal",
180
+ action="store_true",
181
+ help=(
182
+ "Minimal dashboard only contains a subset of features that don't "
183
+ "require additional dependencies installed when ray is installed "
184
+ "by `pip install ray[default]`."
185
+ ),
186
+ )
187
+ parser.add_argument(
188
+ "--modules-to-load",
189
+ required=False,
190
+ default=None,
191
+ help=(
192
+ "Specify the list of module names in [module_1],[module_2] format."
193
+ "E.g., JobHead,StateHead... "
194
+ "If nothing is specified, all modules are loaded."
195
+ ),
196
+ )
197
+ parser.add_argument(
198
+ "--disable-frontend",
199
+ action="store_true",
200
+ help=("If configured, frontend html is not served from the server."),
201
+ )
202
+
203
+ args = parser.parse_args()
204
+
205
+ try:
206
+ setup_component_logger(
207
+ logging_level=args.logging_level,
208
+ logging_format=args.logging_format,
209
+ log_dir=args.log_dir,
210
+ filename=args.logging_filename,
211
+ max_bytes=args.logging_rotate_bytes,
212
+ backup_count=args.logging_rotate_backup_count,
213
+ )
214
+
215
+ if args.modules_to_load:
216
+ modules_to_load = set(args.modules_to_load.strip(" ,").split(","))
217
+ else:
218
+ # None == default.
219
+ modules_to_load = None
220
+
221
+ # NOTE: Creating and attaching the event loop to the main OS thread must be done
222
+ # before initializing Dashboard, which will initialize the grpc aio server,
223
+ # which assumes a working event loop. Ref:
224
+ # https://github.com/grpc/grpc/blob/master/src/python/grpcio/grpc/_cython/_cygrpc/aio/common.pyx.pxi#L174-L188
225
+ loop = ray._private.utils.get_or_create_event_loop()
226
+ dashboard = Dashboard(
227
+ host=args.host,
228
+ port=args.port,
229
+ port_retries=args.port_retries,
230
+ gcs_address=args.gcs_address,
231
+ cluster_id_hex=args.cluster_id_hex,
232
+ grpc_port=args.grpc_port,
233
+ node_ip_address=args.node_ip_address,
234
+ log_dir=args.log_dir,
235
+ temp_dir=args.temp_dir,
236
+ session_dir=args.session_dir,
237
+ minimal=args.minimal,
238
+ serve_frontend=(not args.disable_frontend),
239
+ modules_to_load=modules_to_load,
240
+ )
241
+
242
+ def sigterm_handler():
243
+ logger.warning("Exiting with SIGTERM immediately...")
244
+ os._exit(signal.SIGTERM)
245
+
246
+ if sys.platform != "win32":
247
+ # TODO(rickyyx): we currently do not have any logic for actual
248
+ # graceful termination in the dashboard. Most of the underlying
249
+ # async tasks run by the dashboard head doesn't handle CancelledError.
250
+ # So a truly graceful shutdown is not trivial w/o much refactoring.
251
+ # Re-open the issue: https://github.com/ray-project/ray/issues/25518
252
+ # if a truly graceful shutdown is required.
253
+ loop.add_signal_handler(signal.SIGTERM, sigterm_handler)
254
+
255
+ loop.run_until_complete(dashboard.run())
256
+ except Exception as e:
257
+ traceback_str = ray._private.utils.format_error_message(traceback.format_exc())
258
+ message = (
259
+ f"The dashboard on node {platform.uname()[1]} "
260
+ f"failed with the following "
261
+ f"error:\n{traceback_str}"
262
+ )
263
+ if isinstance(e, dashboard_utils.FrontendNotFoundError):
264
+ logger.warning(message)
265
+ else:
266
+ logger.error(message)
267
+ raise e
268
+
269
+ # Something went wrong, so push an error to all drivers.
270
+ gcs_publisher = ray._raylet.GcsPublisher(address=args.gcs_address)
271
+ ray._private.utils.publish_error_to_driver(
272
+ ray_constants.DASHBOARD_DIED_ERROR,
273
+ message,
274
+ gcs_publisher=gcs_publisher,
275
+ )
.venv/lib/python3.11/site-packages/ray/dashboard/dashboard_metrics.py ADDED
@@ -0,0 +1,123 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Optional
2
+
3
+ from ray.dashboard.consts import COMPONENT_METRICS_TAG_KEYS
4
+
5
+
6
class NullMetric:
    """No-op stand-in for a metric when prometheus_client cannot be imported.

    Each method accepts arbitrary positional and keyword arguments, does
    nothing and returns ``None``, so callers can record values without first
    checking whether real metrics are available.
    """

    def set(self, *args, **kwargs):
        """Accept and discard a ``set`` call."""
        return None

    def observe(self, *args, **kwargs):
        """Accept and discard an ``observe`` call."""
        return None

    def inc(self, *args, **kwargs):
        """Accept and discard an ``inc`` call."""
        return None
17
+
18
+
19
try:
    from prometheus_client import CollectorRegistry, Counter, Gauge, Histogram

    # The metrics in this class should be kept in sync with
    # python/ray/tests/test_metrics_agent.py
    class DashboardPrometheusMetrics:
        """Prometheus metrics of the dashboard, all under the "ray" namespace."""

        def __init__(self, registry: Optional[CollectorRegistry] = None):
            """Create every dashboard metric and attach it to a registry.

            Args:
                registry: Registry the metrics are registered with. When it is
                    omitted (or falsy), a new
                    ``CollectorRegistry(auto_describe=True)`` is created.
            """
            self.registry: CollectorRegistry = registry or CollectorRegistry(
                auto_describe=True
            )

            # Keyword arguments shared by every metric below.
            shared = {"namespace": "ray", "registry": self.registry}
            # Label names used by the event-loop and per-component gauges.
            component_labels = tuple(COMPONENT_METRICS_TAG_KEYS)

            # Buckets used for API duration, in seconds:
            #   5ms, 10ms, 25ms, 50ms, 75ms,
            #   100ms, 250ms, 500ms, 750ms,
            #   1s, 2.5s, 5s, 7.5s, 10s,
            #   20s, 40s, 60s
            histogram_buckets_s = [
                0.005,
                0.01,
                0.025,
                0.05,
                0.075,
                0.1,
                0.25,
                0.5,
                0.75,
                1,
                2.5,
                5,
                7.5,
                10,
                20,
                40,
                60,
            ]

            self.metrics_request_duration = Histogram(
                "dashboard_api_requests_duration_seconds",
                "Total duration in seconds per endpoint",
                labelnames=(
                    "endpoint",
                    "http_status",
                    "Version",
                    "SessionName",
                    "Component",
                ),
                unit="seconds",
                buckets=histogram_buckets_s,
                **shared,
            )
            self.metrics_request_count = Counter(
                "dashboard_api_requests_count",
                "Total requests count per endpoint",
                labelnames=(
                    "method",
                    "endpoint",
                    "http_status",
                    "Version",
                    "SessionName",
                    "Component",
                ),
                unit="requests",
                **shared,
            )
            self.metrics_event_loop_tasks = Gauge(
                "dashboard_event_loop_tasks",
                "Number of tasks currently pending in the event loop's queue.",
                labelnames=component_labels,
                unit="tasks",
                **shared,
            )
            self.metrics_event_loop_lag = Gauge(
                "dashboard_event_loop_lag",
                "Event loop lag in seconds.",
                labelnames=component_labels,
                unit="seconds",
                **shared,
            )
            self.metrics_dashboard_cpu = Gauge(
                "component_cpu",
                "Dashboard CPU percentage usage.",
                labelnames=component_labels,
                unit="percentage",
                **shared,
            )
            self.metrics_dashboard_mem_uss = Gauge(
                "component_uss",
                "USS usage of all components on the node.",
                labelnames=component_labels,
                unit="mb",
                **shared,
            )
            self.metrics_dashboard_mem_rss = Gauge(
                "component_rss",
                "RSS usage of all components on the node.",
                labelnames=component_labels,
                unit="mb",
                **shared,
            )

except ImportError:

    class DashboardPrometheusMetrics:
        """Fallback used when prometheus_client is not importable."""

        def __getattr__(self, attr):
            # Any attribute that normal lookup does not find resolves to a
            # fresh no-op metric, so metric call sites keep working.
            return NullMetric()
.venv/lib/python3.11/site-packages/ray/dashboard/datacenter.py ADDED
@@ -0,0 +1,285 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ from typing import Any, List, Optional
3
+
4
+ import ray.dashboard.consts as dashboard_consts
5
+ from ray._private.utils import (
6
+ get_or_create_event_loop,
7
+ parse_pg_formatted_resources_to_original,
8
+ )
9
+ from ray.dashboard.utils import (
10
+ Dict,
11
+ MutableNotificationDict,
12
+ async_loop_forever,
13
+ compose_state_message,
14
+ )
15
+
16
+ logger = logging.getLogger(__name__)
17
+
18
+
19
+ # NOT thread safe. Every assignment must be on the main event loop thread.
20
class DataSource:
    """Class-level tables holding the cluster state cached by the dashboard.

    NOT thread safe: every assignment must be on the main event loop thread.
    """

    # {node id hex(str): node stats(dict of GetNodeStatsReply
    #                               in node_manager.proto)}
    node_stats = Dict()
    # {node id hex(str): node physical stats(dict from reporter_agent.py)}
    node_physical_stats = Dict()
    # {actor id hex(str): actor table data(dict of ActorTableData
    #                                      in gcs.proto)}
    actors = MutableNotificationDict()
    # {node id hex(str): dashboard agent (node ip, http port, grpc port)}
    # The three-element shape is what DataOrganizer.get_agent_infos unpacks.
    agents = Dict()
    # {node id hex(str): gcs node info(dict of GcsNodeInfo in gcs.proto)}
    nodes = Dict()
    # {node id hex(str): worker list}
    node_workers = Dict()
    # {node id hex(str): {actor id hex(str): actor table data}}
    node_actors = MutableNotificationDict()
    # {worker id(str): core worker stats}
    core_worker_stats = Dict()
40
+
41
+
42
class DataOrganizer:
    """Builds and serves aggregated views over the tables in ``DataSource``.

    Every method is a class or static method; the only state kept on this
    class itself is ``head_node_ip``.
    """

    # IP of the head node; DashboardHead.__init__ assigns it.
    head_node_ip = None

    @staticmethod
    @async_loop_forever(dashboard_consts.RAY_DASHBOARD_STATS_PURGING_INTERVAL)
    async def purge():
        """Drop cached stats that belong to nodes which are not ALIVE.

        Removes entries from ``DataSource.node_stats`` and
        ``DataSource.node_physical_stats`` whose node id is not the id of a
        node in ``DataSource.nodes`` with ``state == "ALIVE"``.
        """
        # These data sources are maintained by DashboardHead,
        # we do not need to purge them:
        #   * agents
        #   * nodes
        alive_nodes = {
            node_id
            for node_id, node_info in DataSource.nodes.items()
            if node_info["state"] == "ALIVE"
        }
        # `keys() - set` yields a new set, so popping while looping is safe.
        for key in DataSource.node_stats.keys() - alive_nodes:
            DataSource.node_stats.pop(key)

        for key in DataSource.node_physical_stats.keys() - alive_nodes:
            DataSource.node_physical_stats.pop(key)

    @classmethod
    @async_loop_forever(dashboard_consts.RAY_DASHBOARD_STATS_UPDATING_INTERVAL)
    async def organize(cls, thread_pool_executor):
        """Rebuild the per-node worker view from the raw node stats.

        Reads ``DataSource.node_physical_stats`` and ``DataSource.node_stats``
        and replaces the contents of ``DataSource.node_workers`` and
        ``DataSource.core_worker_stats``.

        This method is not really async, but DataSource is not thread safe so
        we need to make sure it's on the main event loop thread. To avoid
        blocking the main event loop, we yield after each node processed.

        Args:
            thread_pool_executor: Executor on which the per-node merge
                (``_extract_workers_for_node``) is run.
        """
        loop = get_or_create_event_loop()

        node_workers = {}
        core_worker_stats = {}

        # NOTE: We copy keys of the `DataSource.nodes` to make sure
        #       it doesn't change during the iteration (since it's being
        #       updated from another async task)
        for node_id in list(DataSource.nodes.keys()):
            node_physical_stats = DataSource.node_physical_stats.get(node_id, {})
            node_stats = DataSource.node_stats.get(node_id, {})
            # Offloads the blocking operation to a thread pool executor. This
            # also yields to the event loop.
            workers = await loop.run_in_executor(
                thread_pool_executor,
                cls._extract_workers_for_node,
                node_physical_stats,
                node_stats,
            )

            for worker in workers:
                for stats in worker.get("coreWorkerStats", []):
                    worker_id = stats["workerId"]
                    core_worker_stats[worker_id] = stats

            node_workers[node_id] = workers

        DataSource.node_workers.reset(node_workers)
        DataSource.core_worker_stats.reset(core_worker_stats)

    @classmethod
    def _extract_workers_for_node(cls, node_physical_stats, node_stats):
        """Merge core worker stats into the physical worker entries of a node.

        Args:
            node_physical_stats: Physical stats of one node; its ``"workers"``
                list (if any) provides the worker entries.
            node_stats: Node stats of the same node; its ``"coreWorkersStats"``
                list (if any) provides the stats, matched to workers by pid.

        Returns:
            A list of worker dicts (copies of the physical entries), each
            extended with ``coreWorkerStats``, ``language`` and ``jobId``.
        """
        workers = []
        # Merge coreWorkerStats (node stats) to workers (node physical stats)
        pid_to_worker_stats = {}
        pid_to_language = {}
        pid_to_job_id = {}

        for core_worker_stats in node_stats.get("coreWorkersStats", []):
            pid = core_worker_stats["pid"]

            pid_to_worker_stats[pid] = core_worker_stats
            pid_to_language[pid] = core_worker_stats["language"]
            pid_to_job_id[pid] = core_worker_stats["jobId"]

        for worker in node_physical_stats.get("workers", []):
            # Copy so the entry stored in DataSource is not mutated.
            worker = dict(worker)
            pid = worker["pid"]

            core_worker_stats = pid_to_worker_stats.get(pid)
            # Empty list means core worker stats is not available.
            worker["coreWorkerStats"] = [core_worker_stats] if core_worker_stats else []
            worker["language"] = pid_to_language.get(
                pid, dashboard_consts.DEFAULT_LANGUAGE
            )
            worker["jobId"] = pid_to_job_id.get(pid, dashboard_consts.DEFAULT_JOB_ID)

            workers.append(worker)

        return workers

    @classmethod
    async def get_node_info(cls, node_id, get_summary=False):
        """Assemble the merged info dict of a single node.

        Args:
            node_id: Hex id of the node to look up in ``DataSource``.
            get_summary: When true, the ``workers`` entry of the physical
                stats is dropped and neither actors nor workers are attached.

        Returns:
            A copy of the node's physical stats with the node stats, object
            store usage, GCS node info and a ``stateMessage`` merged in under
            the ``"raylet"`` key, plus ``actors`` and ``workers`` when
            ``get_summary`` is false.
        """
        # Shallow copies: the pops/updates below must not touch DataSource.
        node_physical_stats = dict(DataSource.node_physical_stats.get(node_id, {}))
        node_stats = dict(DataSource.node_stats.get(node_id, {}))
        node = DataSource.nodes.get(node_id, {})

        if get_summary:
            node_physical_stats.pop("workers", None)
            # NOTE(review): this pops "workersStats" while the other branch
            # pops "coreWorkersStats" -- confirm the key name is intended.
            node_stats.pop("workersStats", None)
        else:
            node_stats.pop("coreWorkersStats", None)
        store_stats = node_stats.get("storeStats", {})
        used = int(store_stats.get("objectStoreBytesUsed", 0))
        # objectStoreBytesAvail == total in the object_manager.cc definition.
        total = int(store_stats.get("objectStoreBytesAvail", 0))
        ray_stats = {
            "object_store_used_memory": used,
            "object_store_available_memory": total - used,
        }

        node_info = node_physical_stats
        # Merge node stats to node physical stats under raylet
        node_info["raylet"] = node_stats
        node_info["raylet"].update(ray_stats)

        # Merge GcsNodeInfo to node physical stats
        node_info["raylet"].update(node)
        death_info = node.get("deathInfo", {})
        node_info["raylet"]["stateMessage"] = compose_state_message(
            death_info.get("reason", None), death_info.get("reasonMessage", None)
        )

        if not get_summary:
            actor_table_entries = DataSource.node_actors.get(node_id, {})

            # Merge actors to node physical stats
            node_info["actors"] = {
                actor_id: await DataOrganizer._get_actor_info(actor_table_entry)
                for actor_id, actor_table_entry in actor_table_entries.items()
            }

            # Update workers to node physical stats
            node_info["workers"] = DataSource.node_workers.get(node_id, [])

        return node_info

    @classmethod
    async def get_all_node_summary(cls):
        """Return the summary info (see ``get_node_info``) of every known node."""
        return [
            # NOTE: We're intentionally awaiting in a loop to avoid excessive
            #       concurrency spinning up excessive # of tasks for large
            #       clusters
            await DataOrganizer.get_node_info(node_id, get_summary=True)
            for node_id in DataSource.nodes.keys()
        ]

    @classmethod
    async def get_agent_infos(
        cls, target_node_ids: Optional[List[str]] = None
    ) -> Dict[str, Dict[str, Any]]:
        """Fetch info (IP, HTTP/gRPC ports, HTTP address) of agents by node.

        Args:
            target_node_ids: Target node ids to fetch agent info for. If
                omitted (or empty), info for all known agents is returned.

        Returns:
            A mapping of node id to a dict with the keys ``ipAddress``,
            ``httpPort``, ``grpcPort`` and ``httpAddress``. If any requested
            node id has no agent entry, a warning is logged and an empty
            dict is returned.
        """

        # Return all available agent infos in case no target node-ids were provided
        target_node_ids = target_node_ids or DataSource.agents.keys()

        missing_node_ids = [
            node_id for node_id in target_node_ids if node_id not in DataSource.agents
        ]
        if missing_node_ids:
            logger.warning(
                f"Agent info was not found for {missing_node_ids}"
                f" (having agent infos for {list(DataSource.agents.keys())})"
            )
            return {}

        def _create_agent_info(node_id: str):
            (node_ip, http_port, grpc_port) = DataSource.agents[node_id]

            return dict(
                ipAddress=node_ip,
                # A falsy port is reported as -1.
                httpPort=int(http_port or -1),
                grpcPort=int(grpc_port or -1),
                httpAddress=f"{node_ip}:{http_port}",
            )

        return {node_id: _create_agent_info(node_id) for node_id in target_node_ids}

    @classmethod
    async def get_actor_infos(cls, actor_ids: Optional[List[str]] = None):
        """Return enriched actor info keyed by actor id.

        Args:
            actor_ids: Actor ids to look up. When ``None``, all actors in
                ``DataSource.actors`` are returned.

        Returns:
            A dict of actor id to the result of ``_get_actor_info``; the value
            is ``None`` for a requested id that is not in ``DataSource.actors``.
        """
        target_actor_table_entries: dict[str, Optional[dict]]
        if actor_ids is not None:
            target_actor_table_entries = {
                actor_id: DataSource.actors.get(actor_id) for actor_id in actor_ids
            }
        else:
            target_actor_table_entries = DataSource.actors

        return {
            actor_id: await DataOrganizer._get_actor_info(actor_table_entry)
            for actor_id, actor_table_entry in target_actor_table_entries.items()
        }

    @staticmethod
    async def _get_actor_info(actor):
        """Return a copy of an actor table entry enriched with runtime stats.

        Args:
            actor: Actor table entry, or ``None``.

        Returns:
            ``None`` if ``actor`` is ``None``. Otherwise a new dict holding the
            actor data merged with its core worker stats, plus
            ``actorConstructor``, ``gpus``, ``processStats``, ``mem`` and a
            ``requiredResources`` value passed through
            ``parse_pg_formatted_resources_to_original``.
        """
        if actor is None:
            return None

        # Work on a copy so the entry stored in DataSource stays untouched.
        actor = dict(actor)
        worker_id = actor["address"]["workerId"]
        core_worker_stats = DataSource.core_worker_stats.get(worker_id, {})
        actor_constructor = core_worker_stats.get(
            "actorTitle", "Unknown actor constructor"
        )
        actor["actorConstructor"] = actor_constructor
        actor.update(core_worker_stats)

        # TODO(fyrestone): remove this, give a link from actor
        # info to worker info in front-end.
        node_id = actor["address"]["rayletId"]
        pid = core_worker_stats.get("pid")
        node_physical_stats = DataSource.node_physical_stats.get(node_id, {})
        actor_process_stats = None
        actor_process_gpu_stats = []
        if pid:
            for process_stats in node_physical_stats.get("workers", []):
                if process_stats["pid"] == pid:
                    actor_process_stats = process_stats
                    break

            for gpu_stats in node_physical_stats.get("gpus", []):
                # gpu_stats.get("processesPids") can be None, an empty list or
                # a list of dictionaries.
                for process in gpu_stats.get("processesPids") or []:
                    if process["pid"] == pid:
                        actor_process_gpu_stats.append(gpu_stats)
                        break

        actor["gpus"] = actor_process_gpu_stats
        actor["processStats"] = actor_process_stats
        actor["mem"] = node_physical_stats.get("mem", [])

        required_resources = parse_pg_formatted_resources_to_original(
            actor["requiredResources"]
        )
        actor["requiredResources"] = required_resources

        return actor
.venv/lib/python3.11/site-packages/ray/dashboard/head.py ADDED
@@ -0,0 +1,351 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import logging
3
+ from concurrent.futures import ThreadPoolExecutor
4
+ from pathlib import Path
5
+ from typing import Optional, Set
6
+
7
+ import ray.dashboard.consts as dashboard_consts
8
+ import ray.dashboard.utils as dashboard_utils
9
+ import ray.experimental.internal_kv as internal_kv
10
+ from ray._private import ray_constants
11
+ from ray._private.gcs_utils import GcsAioClient
12
+ from ray._private.ray_constants import env_integer
13
+ from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag
14
+ from ray._raylet import GcsClient
15
+ from ray.dashboard.consts import DASHBOARD_METRIC_PORT
16
+ from ray.dashboard.dashboard_metrics import DashboardPrometheusMetrics
17
+ from ray.dashboard.datacenter import DataOrganizer
18
+ from ray.dashboard.utils import (
19
+ DashboardHeadModule,
20
+ DashboardHeadModuleConfig,
21
+ async_loop_forever,
22
+ )
23
+
24
+ try:
25
+ import prometheus_client
26
+ except ImportError:
27
+ prometheus_client = None
28
+
29
+
30
+ logger = logging.getLogger(__name__)
31
+
32
+ GRPC_CHANNEL_OPTIONS = (
33
+ *ray_constants.GLOBAL_GRPC_OPTIONS,
34
+ ("grpc.max_send_message_length", ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
35
+ ("grpc.max_receive_message_length", ray_constants.GRPC_CPP_MAX_MESSAGE_SIZE),
36
+ )
37
+
38
+ # NOTE: Executor in this head is intentionally constrained to just 1 thread by
39
+ # default to limit its concurrency, therefore reducing potential for
40
+ # GIL contention
41
+ RAY_DASHBOARD_DASHBOARD_HEAD_TPE_MAX_WORKERS = env_integer(
42
+ "RAY_DASHBOARD_DASHBOARD_HEAD_TPE_MAX_WORKERS", 1
43
+ )
44
+
45
+
46
def initialize_grpc_port_and_server(grpc_ip, grpc_port):
    """Create an asyncio gRPC server and add a port for ``grpc_ip:grpc_port``.

    Args:
        grpc_ip: IP address the server should listen on.
        grpc_port: Port the server should listen on.

    Returns:
        A ``(server, port)`` tuple, where ``port`` is whatever
        ``ray._private.tls_utils.add_port_to_grpc_server`` returned.
    """
    # Older grpc releases expose the asyncio API under `grpc.experimental`.
    try:
        from grpc import aio as aiogrpc
    except ImportError:
        from grpc.experimental import aio as aiogrpc

    import ray._private.tls_utils

    aiogrpc.init_grpc_aio()

    # NOTE(review): so_reuseport is switched off, presumably so that binding
    # an already-used port fails instead of sharing it -- confirm.
    server_options = (("grpc.so_reuseport", 0),)
    grpc_server = aiogrpc.server(options=server_options)

    bind_address = f"{grpc_ip}:{grpc_port}"
    bound_port = ray._private.tls_utils.add_port_to_grpc_server(
        grpc_server, bind_address
    )

    return grpc_server, bound_port
63
+
64
+
65
class DashboardHead:
    """The dashboard head process: loads head modules and serves HTTP/gRPC."""

    def __init__(
        self,
        http_host: str,
        http_port: int,
        http_port_retries: int,
        gcs_address: str,
        cluster_id_hex: str,
        node_ip_address: str,
        grpc_port: int,
        log_dir: str,
        temp_dir: str,
        session_dir: str,
        minimal: bool,
        serve_frontend: bool,
        modules_to_load: Optional[Set[str]] = None,
    ):
        """
        Args:
            http_host: The host address for the Http server.
            http_port: The port for the Http server.
            http_port_retries: The maximum retry to bind ports for the Http server.
            gcs_address: The GCS address in the {address}:{port} format.
            cluster_id_hex: The cluster ID in hex.
            node_ip_address: The IP address of the node where this is running.
            grpc_port: The port used to listen for gRPC on.
            log_dir: The log directory. E.g., /tmp/session_latest/logs.
            temp_dir: The temp directory. E.g., /tmp.
            session_dir: The session directory. E.g., tmp/session_latest.
            minimal: Whether or not it will load the minimal modules.
            serve_frontend: If configured, frontend HTML is
                served from the dashboard.
            modules_to_load: A set of module name in string to load.
                By default (None), it loads all available modules.
                Note that available modules could be changed depending on
                minimal flags.
        """
        self.minimal = minimal
        self.serve_frontend = serve_frontend
        # If it is the minimal mode, we shouldn't serve frontend.
        if self.minimal:
            self.serve_frontend = False
        # Public attributes are accessible for all head modules.
        # Workaround for issue: https://github.com/ray-project/ray/issues/7084
        self.http_host = "127.0.0.1" if http_host == "localhost" else http_host
        self.http_port = http_port
        self.http_port_retries = http_port_retries
        self._modules_to_load = modules_to_load
        self._modules_loaded = False
        # Stays None in minimal mode; otherwise assigned in run().
        self.metrics = None

        self._executor = ThreadPoolExecutor(
            max_workers=RAY_DASHBOARD_DASHBOARD_HEAD_TPE_MAX_WORKERS,
            thread_name_prefix="dashboard_head_executor",
        )

        assert gcs_address is not None
        self.gcs_address = gcs_address
        self.cluster_id_hex = cluster_id_hex
        self.log_dir = log_dir
        self.temp_dir = temp_dir
        self.session_dir = session_dir
        # The session name is the last path component of the session dir.
        self.session_name = Path(session_dir).name
        self.gcs_error_subscriber = None
        self.gcs_log_subscriber = None
        self.ip = node_ip_address
        DataOrganizer.head_node_ip = self.ip

        if self.minimal:
            # No gRPC server in minimal mode.
            self.server, self.grpc_port = None, None
        else:
            # Listen on loopback only when the node itself is on loopback;
            # otherwise listen on all interfaces.
            grpc_ip = "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0"
            self.server, self.grpc_port = initialize_grpc_port_and_server(
                grpc_ip, grpc_port
            )
            logger.info("Dashboard head grpc address: %s:%s", grpc_ip, self.grpc_port)
        # If the dashboard is started as non-minimal version, http server should
        # be configured to expose APIs.
        self.http_server = None

    async def _configure_http_server(self, modules):
        """Create the head HTTP server and run it with the given modules.

        Args:
            modules: The loaded head module instances passed to the server.
        """
        # Imported lazily, only when an HTTP server is actually configured.
        from ray.dashboard.http_server_head import HttpServerDashboardHead

        self.http_server = HttpServerDashboardHead(
            self.ip,
            self.http_host,
            self.http_port,
            self.http_port_retries,
            self.gcs_address,
            self.session_name,
            self.metrics,
        )
        await self.http_server.run(modules)

    @property
    def http_session(self):
        """The HTTP session of the HTTP server, or None while still starting up."""
        if not self._modules_loaded and not self.http_server:
            # When the dashboard is still starting up, this property gets
            # called as part of the method_route_table_factory magic. In
            # this case, the property is not actually used but the magic
            # method calls every property to look for a route to add to
            # the global route table. It should be okay for http_server
            # to still be None at this point.
            return None
        assert self.http_server, "Accessing unsupported API in a minimal ray."
        return self.http_server.http_session

    @async_loop_forever(dashboard_consts.GCS_CHECK_ALIVE_INTERVAL_SECONDS)
    async def _gcs_check_alive(self):
        """Ping GCS once; a failure is logged as a warning and not raised."""
        try:
            # If gcs is permanently dead, gcs client will exit the process
            # (see gcs_rpc_client.h)
            await self.gcs_aio_client.check_alive(node_ips=[], timeout=None)
        except Exception:
            logger.warning("Failed to check gcs aliveness, will retry", exc_info=True)

    def _load_modules(self, modules_to_load: Optional[Set[str]] = None):
        """Load dashboard head modules.

        Args:
            modules_to_load: A set of module (class) names to load. By default
                (None), it loads all modules.

        Returns:
            The list of instantiated head modules.

        Raises:
            AssertionError: If the set of loaded module names differs from the
                requested set (e.g. a requested module does not exist).
        """
        modules = []
        head_cls_list = dashboard_utils.get_all_modules(DashboardHeadModule)

        config = DashboardHeadModuleConfig(
            minimal=self.minimal,
            cluster_id_hex=self.cluster_id_hex,
            session_name=self.session_name,
            gcs_address=self.gcs_address,
            log_dir=self.log_dir,
            temp_dir=self.temp_dir,
            session_dir=self.session_dir,
            ip=self.ip,
            http_host=self.http_host,
            http_port=self.http_port,
            metrics=self.metrics,
        )

        # Select modules to load.
        modules_to_load = modules_to_load or {m.__name__ for m in head_cls_list}
        logger.info("Modules to load: %s", modules_to_load)

        for cls in head_cls_list:
            logger.info("Loading %s: %s", DashboardHeadModule.__name__, cls)
            if cls.__name__ in modules_to_load:
                c = cls(config)
                modules.append(c)

        # Verify modules are loaded as expected.
        loaded_modules = {type(m).__name__ for m in modules}
        if loaded_modules != modules_to_load:
            # Raised explicitly (rather than `assert False`) so the check is
            # not stripped when Python runs with -O.
            raise AssertionError(
                "Actual loaded modules, {}, doesn't match the requested modules "
                "to load, {}".format(loaded_modules, modules_to_load)
            )

        self._modules_loaded = True
        logger.info("Loaded %d modules. %s", len(modules), modules)
        return modules

    async def _setup_metrics(self, gcs_aio_client):
        """Create the dashboard metrics and start the metrics export server.

        The metrics address (``{ip}:{DASHBOARD_METRIC_PORT}``) is written to
        the GCS internal KV. If ``prometheus_client`` is importable, its HTTP
        server is started on ``DASHBOARD_METRIC_PORT``; a failure to start it
        is logged and not raised.

        Args:
            gcs_aio_client: Async GCS client used for the internal KV write.

        Returns:
            The ``DashboardPrometheusMetrics`` instance.
        """
        metrics = DashboardPrometheusMetrics()

        # Setup prometheus metrics export server
        assert internal_kv._internal_kv_initialized()
        assert gcs_aio_client is not None
        address = f"{self.ip}:{DASHBOARD_METRIC_PORT}"
        await gcs_aio_client.internal_kv_put(
            "DashboardMetricsAddress".encode(), address.encode(), True, namespace=None
        )
        if prometheus_client:
            try:
                logger.info(
                    "Starting dashboard metrics server on port {}".format(
                        DASHBOARD_METRIC_PORT
                    )
                )
                # Only pin the bind address when the node is on loopback.
                kwargs = {"addr": "127.0.0.1"} if self.ip == "127.0.0.1" else {}
                prometheus_client.start_http_server(
                    port=DASHBOARD_METRIC_PORT,
                    registry=metrics.registry,
                    **kwargs,
                )
            except Exception:
                logger.exception(
                    "An exception occurred while starting the metrics server."
                )
        else:
            logger.warning(
                "`prometheus_client` not found, so metrics will not be exported."
            )

        return metrics

    async def run(self):
        """Start the dashboard head and run until its tasks finish.

        Connects to GCS, sets up metrics (non-minimal mode only), starts the
        gRPC server if there is one, loads the head modules, optionally starts
        the HTTP server, publishes the dashboard HTTP and RPC addresses to the
        GCS internal KV and then awaits all long-running tasks together.
        """
        gcs_address = self.gcs_address

        # Dashboard will handle connection failure automatically
        self.gcs_client = GcsClient(
            address=gcs_address, nums_reconnect_retry=0, cluster_id=self.cluster_id_hex
        )
        self.gcs_aio_client = GcsAioClient(
            address=gcs_address, nums_reconnect_retry=0, cluster_id=self.cluster_id_hex
        )
        internal_kv._initialize_internal_kv(self.gcs_client)

        if not self.minimal:
            self.metrics = await self._setup_metrics(self.gcs_aio_client)

        try:
            assert internal_kv._internal_kv_initialized()
            # Note: We always record the usage, but it is not reported
            # if the usage stats is disabled.
            record_extra_usage_tag(TagKey.DASHBOARD_USED, "False")
        except Exception as e:
            logger.warning(
                "Failed to record the dashboard usage. "
                "This error message is harmless and can be ignored. "
                f"Error: {e}"
            )

        # Start a grpc asyncio server.
        if self.server:
            await self.server.start()

        async def _async_notify():
            """Notify signals from queue."""
            while True:
                co = await dashboard_utils.NotifyQueue.get()
                try:
                    await co
                except Exception:
                    logger.exception(f"Error notifying coroutine {co}")

        modules = self._load_modules(self._modules_to_load)

        http_host, http_port = self.http_host, self.http_port
        if self.serve_frontend:
            logger.info("Initialize the http server.")
            await self._configure_http_server(modules)
            # Use the address the server reports, not the requested one.
            http_host, http_port = self.http_server.get_address()
            logger.info(f"http server initialized at {http_host}:{http_port}")
        else:
            logger.info("http server disabled.")

        # We need to expose dashboard's node's ip for other worker nodes
        # if it's listening to all interfaces.
        dashboard_http_host = (
            self.ip
            if self.http_host != ray_constants.DEFAULT_DASHBOARD_IP
            else http_host
        )
        # This synchronous code inside an async context is not great.
        # It is however acceptable, because this only gets run once
        # during initialization and therefore cannot block the event loop.
        # This could be done better in the future, including
        # removing the polling on the Ray side, by communicating the
        # server address to Ray via stdin / stdout or a pipe.
        self.gcs_client.internal_kv_put(
            ray_constants.DASHBOARD_ADDRESS.encode(),
            f"{dashboard_http_host}:{http_port}".encode(),
            True,
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        )
        self.gcs_client.internal_kv_put(
            dashboard_consts.DASHBOARD_RPC_ADDRESS.encode(),
            f"{self.ip}:{self.grpc_port}".encode(),
            True,
            namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
        )

        # Freeze signal after all modules loaded.
        dashboard_utils.SignalManager.freeze()
        concurrent_tasks = [
            self._gcs_check_alive(),
            _async_notify(),
            DataOrganizer.purge(),
            DataOrganizer.organize(self._executor),
        ]
        for m in modules:
            concurrent_tasks.append(m.run(self.server))
        if self.server:
            concurrent_tasks.append(self.server.wait_for_termination())
        await asyncio.gather(*concurrent_tasks)

        if self.http_server:
            await self.http_server.cleanup()
.venv/lib/python3.11/site-packages/ray/dashboard/http_server_agent.py ADDED
@@ -0,0 +1,83 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ from packaging.version import Version
4
+
5
+ import ray.dashboard.optional_utils as dashboard_optional_utils
6
+ from ray._private.utils import get_or_create_event_loop
7
+ from ray.dashboard.optional_deps import aiohttp, aiohttp_cors, hdrs
8
+
9
+ logger = logging.getLogger(__name__)
10
+ routes = dashboard_optional_utils.DashboardAgentRouteTable
11
+
12
+
13
class HttpServerAgent:
    """HTTP server that exposes the routes bound to dashboard agent modules.

    `start` builds the aiohttp application, enables CORS on every route and
    binds the listening socket; `cleanup` releases the runner and the shared
    client session.
    """

    def __init__(self, ip, listen_port):
        """Store the listen configuration; nothing is started here.

        Args:
            ip: IP of this node. When it is "127.0.0.1" the server binds to
                loopback only, otherwise to all interfaces.
            listen_port: TCP port to listen on.
        """
        self.ip = ip
        self.listen_port = listen_port
        # The attributes below are populated by `start`.
        self.http_host = None
        self.http_port = None
        self.http_session = None
        self.runner = None

    async def start(self, modules):
        """Bind module routes, start the HTTP site and log registered routes.

        Args:
            modules: Iterable of module instances whose decorator-style
                routes should be bound to the agent route table.

        Raises:
            OSError: Re-raised when the TCP site cannot be started.
        """
        # Create a http session for all modules.
        # aiohttp<4.0.0 uses a 'loop' variable, aiohttp>=4.0.0 doesn't anymore
        if Version(aiohttp.__version__) < Version("4.0.0"):
            self.http_session = aiohttp.ClientSession(loop=get_or_create_event_loop())
        else:
            self.http_session = aiohttp.ClientSession()

        # Bind routes for every module so that each module
        # can use decorator-style routes.
        for c in modules:
            dashboard_optional_utils.DashboardAgentRouteTable.bind(c)

        app = aiohttp.web.Application()
        app.add_routes(routes=routes.bound_routes())

        # Enable CORS on all routes.
        cors = aiohttp_cors.setup(
            app,
            defaults={
                "*": aiohttp_cors.ResourceOptions(
                    allow_credentials=True,
                    expose_headers="*",
                    allow_methods="*",
                    allow_headers=("Content-Type", "X-Header"),
                )
            },
        )
        # Snapshot the routes first: cors.add() is applied to each of them.
        for route in list(app.router.routes()):
            cors.add(route)

        self.runner = aiohttp.web.AppRunner(app)
        await self.runner.setup()
        try:
            site = aiohttp.web.TCPSite(
                self.runner,
                "127.0.0.1" if self.ip == "127.0.0.1" else "0.0.0.0",
                self.listen_port,
            )
            await site.start()
        except OSError:
            logger.error(
                f"Agent port #{self.listen_port} already in use. "
                "Failed to start agent. "
                f"Ensure port #{self.listen_port} is available, and then try again."
            )
            # Bare raise keeps the original traceback intact.
            raise
        # Read back the address actually bound by the first listening socket.
        self.http_host, self.http_port, *_ = site._server.sockets[0].getsockname()
        logger.info(
            "Dashboard agent http address: %s:%s", self.http_host, self.http_port
        )

        # Dump registered http routes.
        dump_routes = [r for r in app.router.routes() if r.method != hdrs.METH_HEAD]
        for r in dump_routes:
            logger.info(r)
        logger.info("Registered %s routes.", len(dump_routes))

    async def cleanup(self):
        """Release the runner and the client session.

        Safe to call even if `start` was never invoked or failed part-way:
        components that were not created are skipped, and the client session
        is closed even when the runner cleanup raises.
        """
        try:
            if self.runner is not None:
                await self.runner.cleanup()
        finally:
            if self.http_session is not None:
                await self.http_session.close()
.venv/lib/python3.11/site-packages/ray/dashboard/http_server_head.py ADDED
@@ -0,0 +1,289 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import errno
3
+ import ipaddress
4
+ import logging
5
+ import os
6
+ import pathlib
7
+ import sys
8
+ import time
9
+ from math import floor
10
+
11
+ from packaging.version import Version
12
+
13
+ import ray
14
+ import ray.dashboard.optional_utils as dashboard_optional_utils
15
+ import ray.dashboard.timezone_utils as timezone_utils
16
+ import ray.dashboard.utils as dashboard_utils
17
+ from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag
18
+ from ray._private.utils import get_or_create_event_loop
19
+ from ray.dashboard.dashboard_metrics import DashboardPrometheusMetrics
20
+
21
+ # All third-party dependencies that are not included in the minimal Ray
22
+ # installation must be included in this file. This allows us to determine if
23
+ # the agent has the necessary dependencies to be started.
24
+ from ray.dashboard.optional_deps import aiohttp, hdrs
25
+
26
+ # Logger for this module. It should be configured at the entry point
27
+ # into the program using Ray. Ray provides a default configuration at
28
+ # entry/init points.
29
+ logger = logging.getLogger(__name__)
30
+ routes = dashboard_optional_utils.DashboardHeadRouteTable
31
+
32
+ # Env var that enables follow_symlinks for serving UI static files.
33
+ # This is an advanced setting that should only be used with special Ray installations
34
+ # where the dashboard build files are symlinked to a different directory.
35
+ # This is not recommended for most users and can pose a security risk.
36
+ # Please reference the aiohttp docs here:
37
+ # https://docs.aiohttp.org/en/stable/web_reference.html#aiohttp.web.UrlDispatcher.add_static
38
+ ENV_VAR_FOLLOW_SYMLINKS = "RAY_DASHBOARD_BUILD_FOLLOW_SYMLINKS"
39
+ FOLLOW_SYMLINKS_ENABLED = os.environ.get(ENV_VAR_FOLLOW_SYMLINKS) == "1"
40
+ if FOLLOW_SYMLINKS_ENABLED:
41
+ logger.warning(
42
+ "Enabling RAY_DASHBOARD_BUILD_FOLLOW_SYMLINKS is not recommended as it "
43
+ "allows symlinks to directories outside the dashboard build folder. "
44
+ "You may accidentally expose files on your system outside of the "
45
+ "build directory."
46
+ )
47
+
48
+
49
def setup_static_dir():
    """Register the dashboard UI's static directory and return the build dir.

    The build directory is `client/build` next to this file; its `static`
    sub-directory is served under the `/static` route.

    Returns:
        The absolute path of the dashboard build directory.

    Raises:
        dashboard_utils.FrontendNotFoundError: If the build directory does
            not exist (e.g. a source install without a built frontend).
    """
    build_dir = os.path.join(
        os.path.dirname(os.path.abspath(__file__)), "client", "build"
    )
    # Name of the package directory, used only to print the build hint.
    module_name = os.path.basename(os.path.dirname(__file__))
    if not os.path.isdir(build_dir):
        raise dashboard_utils.FrontendNotFoundError(
            errno.ENOENT,
            "Dashboard build directory not found. If installing "
            "from source, please follow the additional steps "
            "required to build the dashboard "
            f"(cd python/ray/{module_name}/client "
            "&& npm ci "
            "&& npm run build)",
            build_dir,
        )

    static_dir = os.path.join(build_dir, "static")
    routes.static("/static", static_dir, follow_symlinks=FOLLOW_SYMLINKS_ENABLED)
    return build_dir
69
+
70
+
71
class HttpServerDashboardHead:
    """HTTP server of the dashboard head.

    Serves the frontend entry points (`/`, `/favicon.ico`, `/timezone`) plus
    every route bound by the dashboard modules, behind a chain of middlewares
    (metrics, path sanitising, browser POST/PUT blocking, static caching).
    """

    def __init__(
        self,
        ip: str,
        http_host: str,
        http_port: int,
        http_port_retries: int,
        gcs_address: str,
        session_name: str,
        metrics: DashboardPrometheusMetrics,
    ):
        """Set up static routes and the shared HTTP client session.

        Args:
            ip: IP of this node; substituted for the bound host when the
                server listens on an unspecified address.
            http_host: Host to bind the HTTP server to.
            http_port: First port to try.
            http_port_retries: Number of additional consecutive ports to try
                when binding fails.
            gcs_address: "host:port" string; only the host part is kept.
            session_name: Value reported as the `SessionName` metric label.
            metrics: Holder of the request duration/count metrics.

        Raises:
            dashboard_utils.FrontendNotFoundError: If the frontend build is
                missing on a non-Windows platform.
        """
        self.ip = ip
        self.http_host = http_host
        self.http_port = http_port
        self.http_port_retries = http_port_retries
        self.head_node_ip = gcs_address.split(":")[0]
        self.metrics = metrics
        self._session_name = session_name

        # Below attributes are filled after `run` API is invoked.
        self.runner = None

        # Setup Dashboard Routes
        try:
            build_dir = setup_static_dir()
            logger.info("Setup static dir for dashboard: %s", build_dir)
        except dashboard_utils.FrontendNotFoundError as ex:
            # Not to raise FrontendNotFoundError due to NPM incompatibilities
            # with Windows.
            # Please refer to ci.sh::build_dashboard_front_end()
            if sys.platform in ["win32", "cygwin"]:
                logger.warning(ex)
            else:
                raise
        dashboard_optional_utils.DashboardHeadRouteTable.bind(self)

        # Create a http session for all modules.
        # aiohttp<4.0.0 uses a 'loop' variable, aiohttp>=4.0.0 doesn't anymore
        if Version(aiohttp.__version__) < Version("4.0.0"):
            self.http_session = aiohttp.ClientSession(loop=get_or_create_event_loop())
        else:
            self.http_session = aiohttp.ClientSession()

    @routes.get("/")
    async def get_index(self, req) -> aiohttp.web.FileResponse:
        """Serve the frontend `index.html` with caching disabled."""
        try:
            # This API will be no-op after the first report.
            # Note: We always record the usage, but it is not reported
            # if the usage stats is disabled.
            record_extra_usage_tag(TagKey.DASHBOARD_USED, "True")
        except Exception as e:
            logger.warning(
                "Failed to record the dashboard usage. "
                "This error message is harmless and can be ignored. "
                f"Error: {e}"
            )
        resp = aiohttp.web.FileResponse(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "client/build/index.html"
            )
        )
        resp.headers["Cache-Control"] = "no-cache"
        return resp

    @routes.get("/favicon.ico")
    async def get_favicon(self, req) -> aiohttp.web.FileResponse:
        """Serve the frontend favicon from the build directory."""
        return aiohttp.web.FileResponse(
            os.path.join(
                os.path.dirname(os.path.abspath(__file__)), "client/build/favicon.ico"
            )
        )

    @routes.get("/timezone")
    async def get_timezone(self, req) -> aiohttp.web.Response:
        """Return the current timezone info as JSON, or a 500 on failure."""
        try:
            current_timezone = timezone_utils.get_current_timezone_info()
            return aiohttp.web.json_response(current_timezone)

        except Exception as e:
            logger.error(f"Error getting timezone: {e}")
            return aiohttp.web.Response(
                status=500, text="Internal Server Error:" + str(e)
            )

    def get_address(self):
        """Return `(http_host, http_port)`; both must already be set."""
        assert self.http_host and self.http_port
        return self.http_host, self.http_port

    @aiohttp.web.middleware
    async def path_clean_middleware(self, request, handler):
        """Reject `/static` and `/logs` requests that escape their directory."""
        # `pathlib.posixpath` is an undocumented re-export of the stdlib
        # module; import `posixpath` directly so this does not depend on
        # pathlib's internal layout.
        import posixpath

        if request.path.startswith("/static") or request.path.startswith("/logs"):
            parent = pathlib.PurePosixPath(
                "/logs" if request.path.startswith("/logs") else "/static"
            )

            # If the destination is not relative to the expected directory,
            # then the user is attempting path traversal, so deny the request.
            request_path = pathlib.PurePosixPath(posixpath.realpath(request.path))
            if request_path != parent and parent not in request_path.parents:
                logger.info(
                    f"Rejecting {request_path=} because it is not relative to {parent=}"
                )
                raise aiohttp.web.HTTPForbidden()
        return await handler(request)

    @aiohttp.web.middleware
    async def browsers_no_post_put_middleware(self, request, handler):
        """Answer 405 to POST/PUT requests that look like browser traffic."""
        if (
            # A best effort test for browser traffic. All common browsers
            # start with Mozilla at the time of writing.
            dashboard_optional_utils.is_browser_request(request)
            and request.method in [hdrs.METH_POST, hdrs.METH_PUT]
        ):
            return aiohttp.web.Response(
                status=405, text="Method Not Allowed for browser traffic."
            )

        return await handler(request)

    @aiohttp.web.middleware
    async def metrics_middleware(self, request, handler):
        """Record request duration and count, labelled by status class."""
        start_time = time.monotonic()

        try:
            response = await handler(request)
            status_tag = f"{floor(response.status / 100)}xx"
            return response
        except aiohttp.web.HTTPException as e:
            # aiohttp reports redirects and client errors by raising; label
            # them with their real status class instead of "5xx".
            status_tag = f"{floor(e.status / 100)}xx"
            raise
        except (Exception, asyncio.CancelledError):
            status_tag = "5xx"
            raise
        finally:
            resp_time = time.monotonic() - start_time
            try:
                self.metrics.metrics_request_duration.labels(
                    endpoint=handler.__name__,
                    http_status=status_tag,
                    Version=ray.__version__,
                    SessionName=self._session_name,
                    Component="dashboard",
                ).observe(resp_time)
                self.metrics.metrics_request_count.labels(
                    method=request.method,
                    endpoint=handler.__name__,
                    http_status=status_tag,
                    Version=ray.__version__,
                    SessionName=self._session_name,
                    Component="dashboard",
                ).inc()
            except Exception as e:
                # Metric emission is best-effort and must never fail a request.
                logger.exception(f"Error emitting api metrics: {e}")

    @aiohttp.web.middleware
    async def cache_control_static_middleware(self, request, handler):
        """Attach a one-year `Cache-Control` header to `/static` responses."""
        if request.path.startswith("/static"):
            response = await handler(request)
            response.headers["Cache-Control"] = "max-age=31536000"
            return response
        return await handler(request)

    async def run(self, modules):
        """Bind module routes and start listening, retrying on busy ports.

        Args:
            modules: Iterable of module instances whose routes are bound to
                the head route table.

        Raises:
            Exception: If no port could be bound after the initial attempt
                plus `http_port_retries` retries.
        """
        # Bind http routes of each module.
        for c in modules:
            dashboard_optional_utils.DashboardHeadRouteTable.bind(c)

        # Http server should be initialized after all modules loaded.
        # working_dir uploads for job submission can be up to 100MiB.
        app = aiohttp.web.Application(
            client_max_size=100 * 1024**2,
            middlewares=[
                self.metrics_middleware,
                self.path_clean_middleware,
                self.browsers_no_post_put_middleware,
                self.cache_control_static_middleware,
            ],
        )
        app.add_routes(routes=routes.bound_routes())

        self.runner = aiohttp.web.AppRunner(
            app,
            access_log_format=(
                "%a %t '%r' %s %b bytes %D us " "'%{Referer}i' '%{User-Agent}i'"
            ),
        )
        await self.runner.setup()
        last_ex = None
        for i in range(1 + self.http_port_retries):
            try:
                site = aiohttp.web.TCPSite(self.runner, self.http_host, self.http_port)
                await site.start()
                break
            except OSError as e:
                # Binding failed: move on to the next port number.
                last_ex = e
                self.http_port += 1
                logger.warning("Try to use port %s: %s", self.http_port, e)
        else:
            raise Exception(
                f"Failed to find a valid port for dashboard after "
                f"{self.http_port_retries} retries: {last_ex}"
            )
        # Read back the address actually bound by the first listening socket.
        self.http_host, self.http_port, *_ = site._server.sockets[0].getsockname()
        # An unspecified bind address (e.g. 0.0.0.0) is not reachable as-is,
        # so advertise this node's IP instead.
        self.http_host = (
            self.ip
            if ipaddress.ip_address(self.http_host).is_unspecified
            else self.http_host
        )
        logger.info(
            "Dashboard head http address: %s:%s", self.http_host, self.http_port
        )
        # Dump registered http routes.
        dump_routes = [r for r in app.router.routes() if r.method != hdrs.METH_HEAD]
        for r in dump_routes:
            logger.info(r)
        logger.info("Registered %s routes.", len(dump_routes))

    async def cleanup(self):
        """Release the runner and the client session created in `__init__`.

        The runner is skipped when `run` was never invoked; the session is
        closed even when the runner cleanup raises.
        """
        try:
            if self.runner is not None:
                await self.runner.cleanup()
        finally:
            await self.http_session.close()
.venv/lib/python3.11/site-packages/ray/dashboard/k8s_utils.py ADDED
@@ -0,0 +1,111 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+
3
+ from ray._private.utils import get_num_cpus
4
+
5
+ logger = logging.getLogger(__name__)
6
+
7
+ CPU_USAGE_PATH = "/sys/fs/cgroup/cpuacct/cpuacct.usage"
8
+ CPU_USAGE_PATH_V2 = "/sys/fs/cgroup/cpu.stat"
9
+ PROC_STAT_PATH = "/proc/stat"
10
+
11
+ container_num_cpus = None
12
+ host_num_cpus = None
13
+
14
+ last_cpu_usage = None
15
+ last_system_usage = None
16
+
17
+
18
def cpu_percent():
    """Estimate CPU usage percent for Ray pod managed by Kubernetes
    Operator.

    Computed by the following steps
       (1) Replicate the logic used by 'docker stats' cli command.
           See https://github.com/docker/cli/blob/c0a6b1c7b30203fbc28cd619acb901a95a80e30e/cli/command/container/stats_helpers.go#L166.
       (2) Divide by the number of CPUs available to the container, so that
           e.g. full capacity use of 2 CPUs will read as 100%,
           rather than 200%.

    Step (1) above works by
        dividing delta in cpu usage by
        delta in total host cpu usage, averaged over host's cpus.

    Since deltas are not initially available, return 0.0 on first call.
    0.0 is also returned when no host CPU time elapsed since the previous
    call, and when any error occurs while reading the counters (the error
    is logged).

    Returns:
        The usage percentage, rounded to one decimal and capped at 100.0.
    """  # noqa
    global last_system_usage
    global last_cpu_usage
    try:
        cpu_usage = _cpu_usage()
        system_usage = _system_usage()
        # Return 0.0 on first call.
        if last_system_usage is None:
            percent = 0.0
        else:
            cpu_delta = cpu_usage - last_cpu_usage
            # "System time passed." (Typically close to clock time.)
            system_delta = (system_usage - last_system_usage) / _host_num_cpus()
            if system_delta <= 0:
                # Host counters did not advance between the two samples;
                # report idle instead of dividing by zero.
                percent = 0.0
            else:
                quotient = cpu_delta / system_delta
                percent = round(quotient * 100 / get_num_cpus(), 1)
        last_system_usage = system_usage
        last_cpu_usage = cpu_usage
        # Computed percentage might be slightly above 100%.
        return min(percent, 100.0)
    except Exception:
        logger.exception("Error computing CPU usage of Ray Kubernetes pod.")
        return 0.0
57
+
58
+
59
def _cpu_usage():
    """Compute total cpu usage of the container in nanoseconds
    by reading from cpuacct in cgroups v1 or cpu.stat in cgroups v2.

    Returns:
        The cumulative CPU time in nanoseconds.

    Raises:
        FileNotFoundError: If neither cgroup file exists.
        ValueError: If the cgroups v2 file has no `usage_usec` entry or a
            value cannot be parsed as an integer.
    """
    try:
        # cgroups v1: the file holds a single integer, already in ns.
        with open(CPU_USAGE_PATH) as usage_file:
            return int(usage_file.read())
    except FileNotFoundError:
        # Fall through to the cgroups v2 location.
        pass

    # cgroups v2
    with open(CPU_USAGE_PATH_V2) as stat_file:
        cpu_stat_lines = stat_file.read().splitlines()
    for line in cpu_stat_lines:
        # e.g. "usage_usec 16089294616"
        fields = line.split()
        if len(fields) >= 2 and fields[0] == "usage_usec":
            # The value is in microseconds; convert to nanoseconds.
            return int(fields[1]) * 1000
    raise ValueError(f"No usage_usec entry found in {CPU_USAGE_PATH_V2}")
75
+
76
+
77
def _system_usage():
    """
    Computes total CPU usage of the host in nanoseconds.

    Logic taken from here:
    https://github.com/moby/moby/blob/b42ac8d370a8ef8ec720dff0ca9dfb3530ac0a6a/daemon/stats/collector_unix.go#L31

    See also the /proc/stat entry here:
    https://man7.org/linux/man-pages/man5/proc.5.html

    Returns:
        The sum of the first seven counters of the aggregate "cpu" line,
        converted from clock ticks to nanoseconds.
    """  # noqa
    # Only the first (aggregate) line is needed, so avoid reading the rest.
    with open(PROC_STAT_PATH) as stat_file:
        cpu_summary_str = stat_file.readline()
    parts = cpu_summary_str.split()
    assert parts[0] == "cpu"
    usage_data = parts[1:8]
    total_clock_ticks = sum(int(entry) for entry in usage_data)
    # 100 clock ticks per second, 10^9 ns per second
    usage_ns = total_clock_ticks * 10**7
    return usage_ns
95
+
96
+
97
def _host_num_cpus():
    """Number of physical CPUs, obtained by parsing /proc/stat.

    The value is computed once and cached in the module-level
    `host_num_cpus` variable.
    """
    global host_num_cpus
    if host_num_cpus is None:
        cpu_line_count = 0
        with open(PROC_STAT_PATH) as stat_file:
            for line in stat_file:
                fields = line.split()
                if fields and "cpu" in fields[0]:
                    cpu_line_count += 1
        # Number of lines starting with a word including 'cpu', subtracting
        # 1 for the first summary line.
        host_num_cpus = cpu_line_count - 1
    return host_num_cpus
.venv/lib/python3.11/site-packages/ray/dashboard/memory_utils.py ADDED
@@ -0,0 +1,524 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import base64
2
+ import logging
3
+ from collections import defaultdict
4
+ from enum import Enum
5
+ from typing import List
6
+
7
+ import ray
8
+ from ray._private.internal_api import node_stats
9
+ from ray._raylet import ActorID, JobID, TaskID
10
+
11
+ logger = logging.getLogger(__name__)
12
+
13
+ # These values are used to calculate if objectRefs are actor handles.
14
+ TASKID_BYTES_SIZE = TaskID.size()
15
+ ACTORID_BYTES_SIZE = ActorID.size()
16
+ JOBID_BYTES_SIZE = JobID.size()
17
+
18
+
19
def decode_object_ref_if_needed(object_ref: str) -> bytes:
    """Return the binary form of an objectRef given as text.

    An objectRef coming from a gRPC reply is Base64 encoded, but it may
    also arrive already decoded as a hex string. Base64 input is recognised
    by its trailing "=" padding; anything else is treated as hex.
    """
    looks_base64 = object_ref.endswith("=")
    if not looks_base64:
        # Already a hex string: just convert it to binary.
        return ray._private.utils.hex_to_binary(object_ref)
    # Per the original author's note, a Base64-encoded objectRef always
    # carries "=" padding, so the suffix check above is sufficient.
    return base64.standard_b64decode(object_ref)
34
+
35
+
36
class SortingType(Enum):
    """Keys by which memory table entries can be sorted."""

    PID = 1
    OBJECT_SIZE = 3
    REFERENCE_TYPE = 4
40
+
41
+
42
class GroupByType(Enum):
    """Keys by which memory table entries can be grouped."""

    NODE_ADDRESS = "node"
    STACK_TRACE = "stack_trace"
45
+
46
+
47
class ReferenceType(Enum):
    """Kinds of reference that can keep an object alive."""

    # Each value is a plain string because enum members themselves are not
    # json serializable.
    ACTOR_HANDLE = "ACTOR_HANDLE"
    PINNED_IN_MEMORY = "PINNED_IN_MEMORY"
    LOCAL_REFERENCE = "LOCAL_REFERENCE"
    USED_BY_PENDING_TASK = "USED_BY_PENDING_TASK"
    CAPTURED_IN_OBJECT = "CAPTURED_IN_OBJECT"
    UNKNOWN_STATUS = "UNKNOWN_STATUS"
55
+
56
+
57
def get_sorting_type(sort_by: str):
    """Translate string input into SortingType instance

    The lookup is case-insensitive and matches the member names of
    `SortingType`.

    Raises:
        ValueError: If `sort_by` does not name a sorting type.
    """
    try:
        return SortingType[sort_by.upper()]
    except KeyError:
        # Adjacent literals replace the former backslash continuation, which
        # leaked the source indentation into the message.
        raise ValueError(
            "The sort-by input provided is not one of "
            "PID, OBJECT_SIZE, or REFERENCE_TYPE."
        ) from None
71
+
72
+
73
def get_group_by_type(group_by: str):
    """Translate string input into GroupByType instance

    The lookup is case-insensitive and matches the member names of
    `GroupByType`.

    Raises:
        ValueError: If `group_by` does not name a grouping type.
    """
    try:
        return GroupByType[group_by.upper()]
    except KeyError:
        # Adjacent literals replace the former backslash continuation, which
        # leaked the source indentation into the message.
        raise ValueError(
            "The group-by input provided is not one of "
            "NODE_ADDRESS or STACK_TRACE."
        ) from None
85
+
86
+
87
class MemoryTableEntry:
    """One object reference held by a worker, as a row of the memory table."""

    def __init__(
        self, *, object_ref: dict, node_address: str, is_driver: bool, pid: int
    ):
        """Build an entry from one object-ref record of a worker's stats.

        Args:
            object_ref: Record describing the reference. "objectId" is
                required; every other key falls back to a default.
            node_address: Address of the node hosting the worker.
            is_driver: Whether the owning worker is a driver.
            pid: Process id of the owning worker.
        """
        # worker info
        self.is_driver = is_driver
        self.pid = pid
        self.node_address = node_address

        # object info
        self.task_status = object_ref.get("taskStatus", "?")
        if self.task_status == "NIL":
            self.task_status = "-"
        # The record's attempt number is shifted by one for display.
        self.attempt_number = int(object_ref.get("attemptNumber", 0)) + 1
        # -1 marks a size that was not recorded.
        self.object_size = int(object_ref.get("objectSize", -1))
        self.call_site = object_ref.get("callSite", "<Unknown>")
        if len(self.call_site) == 0:
            self.call_site = "disabled"
        self.object_ref = ray.ObjectRef(
            decode_object_ref_if_needed(object_ref["objectId"])
        )

        # reference info
        self.local_ref_count = int(object_ref.get("localRefCount", 0))
        self.pinned_in_memory = bool(object_ref.get("pinnedInMemory", False))
        self.submitted_task_ref_count = int(object_ref.get("submittedTaskRefCount", 0))
        self.contained_in_owned = [
            ray.ObjectRef(decode_object_ref_if_needed(contained_ref))
            for contained_ref in object_ref.get("containedInOwned", [])
        ]
        self.reference_type = self._get_reference_type()

    def is_valid(self) -> bool:
        """Return False for entries with no reference of any kind or a nil ref."""
        # If the entry doesn't have a reference type or some invalid state,
        # (e.g., no object ref presented), it is considered invalid.
        if (
            not self.pinned_in_memory
            and self.local_ref_count == 0
            and self.submitted_task_ref_count == 0
            and len(self.contained_in_owned) == 0
        ):
            return False
        elif self.object_ref.is_nil():
            return False
        else:
            return True

    def group_key(self, group_by_type: GroupByType) -> str:
        """Return the value this entry is grouped under for `group_by_type`.

        Raises:
            ValueError: If `group_by_type` is not a known grouping type.
        """
        if group_by_type == GroupByType.NODE_ADDRESS:
            return self.node_address
        elif group_by_type == GroupByType.STACK_TRACE:
            return self.call_site
        else:
            raise ValueError(f"group by type {group_by_type} is invalid.")

    def _get_reference_type(self) -> str:
        """Classify the entry; the first matching category below wins."""
        if self._is_object_ref_actor_handle():
            return ReferenceType.ACTOR_HANDLE.value
        if self.pinned_in_memory:
            return ReferenceType.PINNED_IN_MEMORY.value
        elif self.submitted_task_ref_count > 0:
            return ReferenceType.USED_BY_PENDING_TASK.value
        elif self.local_ref_count > 0:
            return ReferenceType.LOCAL_REFERENCE.value
        elif len(self.contained_in_owned) > 0:
            return ReferenceType.CAPTURED_IN_OBJECT.value
        else:
            return ReferenceType.UNKNOWN_STATUS.value

    def _is_object_ref_actor_handle(self) -> bool:
        """Return True when the ref's hex prefix marks an actor creation task."""
        object_ref_hex = self.object_ref.hex()

        # Sizes are in bytes; each byte is two hex characters.
        task_random_hex_len = (TASKID_BYTES_SIZE - ACTORID_BYTES_SIZE) * 2
        actor_random_hex_len = (ACTORID_BYTES_SIZE - JOBID_BYTES_SIZE) * 2

        # Per the original author's note: the ref starts with the task's
        # random bytes followed by the actor's random bytes. If the task
        # random bytes are all 'f' but the actor random bytes are not, the
        # ref belongs to an actor creation task, i.e. an actor handle.
        random_hex = object_ref_hex[:task_random_hex_len]
        actor_random_hex = object_ref_hex[
            task_random_hex_len : task_random_hex_len + actor_random_hex_len
        ]
        # Compare against the computed widths rather than hard-coded lengths
        # so the check stays in sync with the ID sizes.
        return (
            random_hex == "f" * task_random_hex_len
            and actor_random_hex != "f" * actor_random_hex_len
        )

    def as_dict(self):
        """Return the entry as a dict of plain, JSON-friendly values."""
        return {
            "object_ref": self.object_ref.hex(),
            "pid": self.pid,
            "node_ip_address": self.node_address,
            "object_size": self.object_size,
            "reference_type": self.reference_type,
            "call_site": self.call_site,
            "task_status": self.task_status,
            "attempt_number": self.attempt_number,
            "local_ref_count": self.local_ref_count,
            "pinned_in_memory": self.pinned_in_memory,
            "submitted_task_ref_count": self.submitted_task_ref_count,
            "contained_in_owned": [
                contained_ref.hex() for contained_ref in self.contained_in_owned
            ],
            "type": "Driver" if self.is_driver else "Worker",
        }

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        return str(self.as_dict())
201
+
202
+
203
class MemoryTable:
    """A list of memory table entries with optional sorting, grouping and
    per-table summary statistics."""

    def __init__(
        self,
        entries: List[MemoryTableEntry],
        group_by_type: GroupByType = GroupByType.NODE_ADDRESS,
        sort_by_type: SortingType = SortingType.PID,
    ):
        """Store `entries` and apply whichever of sort/group is requested.

        Args:
            entries: Entries of this table. The list is stored as-is and is
                sorted in place when a sort type is given.
            group_by_type: Grouping key, or None to skip grouping.
            sort_by_type: Sorting key, or None to skip sorting.
        """
        self.table = entries
        # Maps each group key to a MemoryTable holding that group's entries.
        self.group = {}
        self.summary = defaultdict(int)
        # NOTE YOU MUST SORT TABLE BEFORE GROUPING.
        # self._group_by(..)._sort_by(..) != self._sort_by(..)._group_by(..)
        if group_by_type and sort_by_type:
            self.setup(group_by_type, sort_by_type)
        elif group_by_type:
            self._group_by(group_by_type)
        elif sort_by_type:
            self._sort_by(sort_by_type)

    def setup(self, group_by_type: GroupByType, sort_by_type: SortingType):
        """Setup memory table.

        This will sort entries first and group them after.
        Sort order will be still kept.
        """
        # _group_by() already summarizes every group table, so only the
        # top-level summary remains to be computed here.
        self._sort_by(sort_by_type)._group_by(group_by_type)
        self.summarize()
        return self

    def insert_entry(self, entry: MemoryTableEntry):
        """Append `entry`; sorting, groups and summary are not refreshed."""
        self.table.append(entry)

    def summarize(self):
        """Recompute `self.summary` from the current entries and return self."""
        summary_key_by_type = {
            ReferenceType.LOCAL_REFERENCE.value: "total_local_ref_count",
            ReferenceType.PINNED_IN_MEMORY.value: "total_pinned_in_memory",
            ReferenceType.USED_BY_PENDING_TASK.value: "total_used_by_pending_task",
            ReferenceType.CAPTURED_IN_OBJECT.value: "total_captured_in_objects",
            ReferenceType.ACTOR_HANDLE.value: "total_actor_handles",
        }
        # Reset summary.
        summary = {"total_object_size": 0}
        summary.update((key, 0) for key in summary_key_by_type.values())

        for entry in self.table:
            # Unrecorded sizes are stored as -1 and must not be summed.
            if entry.object_size > 0:
                summary["total_object_size"] += entry.object_size
            summary_key = summary_key_by_type.get(entry.reference_type)
            if summary_key is not None:
                summary[summary_key] += 1

        self.summary = summary
        return self

    def _sort_by(self, sorting_type: SortingType):
        """Sort the entries in place by `sorting_type` and return self.

        Raises:
            ValueError: If `sorting_type` is not a known sorting type.
        """
        if sorting_type == SortingType.PID:
            self.table.sort(key=lambda entry: entry.pid)
        elif sorting_type == SortingType.OBJECT_SIZE:
            self.table.sort(key=lambda entry: entry.object_size)
        elif sorting_type == SortingType.REFERENCE_TYPE:
            self.table.sort(key=lambda entry: entry.reference_type)
        else:
            raise ValueError(f"Give sorting type: {sorting_type} is invalid.")
        return self

    def _group_by(self, group_by_type: GroupByType):
        """Group entries and summarize the result.

        NOTE: Each group is another MemoryTable.
        """
        # Build entries per group, keeping the current entry order.
        entries_by_key = defaultdict(list)
        for entry in self.table:
            entries_by_key[entry.group_key(group_by_type)].append(entry)

        # Build and summarize a table per group, replacing any prior groups.
        self.group = {
            group_key: MemoryTable(
                entries, group_by_type=None, sort_by_type=None
            ).summarize()
            for group_key, entries in entries_by_key.items()
        }
        return self

    def as_dict(self):
        """Return the top-level summary plus entries and summary per group."""
        return {
            "summary": self.summary,
            "group": {
                group_key: {
                    "entries": group_memory_table.get_entries(),
                    "summary": group_memory_table.summary,
                }
                for group_key, group_memory_table in self.group.items()
            },
        }

    def get_entries(self) -> List[dict]:
        """Return every entry of this table as a dict."""
        return [entry.as_dict() for entry in self.table]

    def __repr__(self):
        return str(self.as_dict())

    def __str__(self):
        return self.__repr__()
324
+
325
+
326
def construct_memory_table(
    workers_stats: List,
    group_by: GroupByType = GroupByType.NODE_ADDRESS,
    sort_by=SortingType.OBJECT_SIZE,
) -> MemoryTable:
    """Build a MemoryTable from per-worker stats.

    Each worker record must provide "pid" and "ipAddress"; "workerType" and
    "objectRefs" are optional. Entries that report themselves as invalid are
    dropped before the table is sorted and grouped.
    """
    valid_entries = []
    for worker in workers_stats:
        worker_pid = worker["pid"]
        worker_is_driver = worker.get("workerType") == "DRIVER"
        worker_address = worker["ipAddress"]

        candidates = (
            MemoryTableEntry(
                object_ref=ref,
                node_address=worker_address,
                is_driver=worker_is_driver,
                pid=worker_pid,
            )
            for ref in worker.get("objectRefs", [])
        )
        valid_entries.extend(entry for entry in candidates if entry.is_valid())
    return MemoryTable(valid_entries, group_by_type=group_by, sort_by_type=sort_by)
351
+
352
+
353
def track_reference_size(group):
    """Returns dictionary mapping reference type
    to memory usage for a given memory table group.

    Args:
        group: Mapping with an "entries" list; each entry is a dict with
            "object_size" and "reference_type" keys.

    Returns:
        A defaultdict(int) keyed by summary name (e.g.
        "total_local_ref_count") holding the summed object sizes. Entries
        whose reference type has no summary name are ignored.
    """
    d = defaultdict(int)
    table_name = {
        "LOCAL_REFERENCE": "total_local_ref_count",
        "PINNED_IN_MEMORY": "total_pinned_in_memory",
        "USED_BY_PENDING_TASK": "total_used_by_pending_task",
        "CAPTURED_IN_OBJECT": "total_captured_in_objects",
        "ACTOR_HANDLE": "total_actor_handles",
    }
    for entry in group["entries"]:
        summary_key = table_name.get(entry["reference_type"])
        if summary_key is None:
            # e.g. "UNKNOWN_STATUS": there is no bucket to attribute it to.
            continue
        size = entry["object_size"]
        if size == -1:
            # size not recorded
            size = 0
        d[summary_key] += size
    return d
371
+
372
+
373
def memory_summary(
    state,
    group_by="NODE_ADDRESS",
    sort_by="OBJECT_SIZE",
    line_wrap=True,
    unit="B",
    num_entries=None,
) -> str:
    """Render a text report of object references, one section per group.

    Args:
        state: Object exposing ``node_table()``. Each returned node record is
            read for "Alive", "NodeManagerAddress" and "NodeManagerPort".
        group_by: Grouping name, resolved with ``get_group_by_type``.
        sort_by: Sort-order name, resolved with ``get_sorting_type``.
        line_wrap: When true and the terminal is wider than 137 columns, call
            sites and task statuses are split across several rows.
        unit: Display unit, one of "B", "KB", "MB" or "GB" (powers of 1000).
        num_entries: Maximum number of entries shown per group; ``None``
            shows all of them.

    Returns:
        The formatted report.
    """
    # Get terminal size
    import shutil

    from ray.dashboard.modules.node.node_head import node_stats_to_dict

    size = shutil.get_terminal_size((80, 20)).columns
    line_wrap_threshold = 137
    wrap_rows = size > line_wrap_threshold and line_wrap

    # Unit conversions
    units = {"B": 10**0, "KB": 10**3, "MB": 10**6, "GB": 10**9}

    # Fetch core memory worker stats, store as a dictionary
    core_worker_stats = []
    for raylet in state.node_table():
        if not raylet["Alive"]:
            continue
        try:
            stats = node_stats_to_dict(
                node_stats(raylet["NodeManagerAddress"], raylet["NodeManagerPort"])
            )
        except RuntimeError:
            # Nodes whose stats call fails are left out of the report.
            continue
        # Check the payload before indexing into it; the check used to run
        # after the lookup, where it could no longer catch anything.
        assert type(stats) is dict and "coreWorkersStats" in stats
        core_worker_stats.extend(stats["coreWorkersStats"])

    # Build memory table with "group_by" and "sort_by" parameters
    group_by, sort_by = get_group_by_type(group_by), get_sorting_type(sort_by)
    memory_table = construct_memory_table(
        core_worker_stats, group_by, sort_by
    ).as_dict()
    assert "summary" in memory_table and "group" in memory_table

    # Build memory summary
    group_by = group_by.name.lower().replace("_", " ")
    sort_by = sort_by.name.lower().replace("_", " ")
    summary_labels = [
        "Mem Used by Objects",
        "Local References",
        "Pinned",
        "Used by task",
        "Captured in Objects",
        "Actor Handles",
    ]
    summary_string = "{:<19} {:<16} {:<12} {:<13} {:<19} {:<13}\n"

    object_ref_labels = [
        "IP Address",
        "PID",
        "Type",
        "Call Site",
        "Status",
        "Attempt",
        "Size",
        "Reference Type",
        "Object Ref",
    ]
    # Adjacent literals are used instead of backslash continuations inside a
    # single literal so that source indentation cannot leak into the output.
    object_ref_string = (
        "{:<13} | {:<8} | {:<7} | {:<9} "
        "| {:<9} | {:<8} | {:<8} | {:<14} | {:<10}\n"
    )

    if wrap_rows:
        object_ref_string = (
            "{:<15} {:<5} {:<6} {:<22} {:<14} {:<8} {:<6} "
            "{:<18} {:<56}\n"
        )

    shown = num_entries if num_entries is not None else "all"
    parts = [
        f"Grouping by {group_by}... "
        f"Sorting by {sort_by}... "
        f"Display {shown} entries per group...\n\n\n"
    ]

    for key, group in memory_table["group"].items():
        # Group summary. The per-reference-type sizes must be computed before
        # the entries' sizes are turned into display strings below.
        summary = group["summary"]
        ref_size = track_reference_size(group)
        for k, v in summary.items():
            if k == "total_object_size":
                summary[k] = str(v / units[unit]) + f" {unit}"
            else:
                summary[k] = str(v) + f", ({ref_size[k] / units[unit]} {unit})"
        parts.append(f"--- Summary for {group_by}: {key} ---\n")
        parts.append(summary_string.format(*summary_labels))
        parts.append(summary_string.format(*summary.values()) + "\n")

        # Memory table per group
        parts.append(f"--- Object references for {group_by}: {key} ---\n")
        parts.append(object_ref_string.format(*object_ref_labels))
        for n, entry in enumerate(group["entries"], start=1):
            if num_entries is not None and n > num_entries:
                break
            entry["object_size"] = (
                str(entry["object_size"] / units[unit]) + f" {unit}"
                if entry["object_size"] > -1
                else "?"
            )
            num_lines = 1
            if wrap_rows:
                call_site_length = 22
                if len(entry["call_site"]) == 0:
                    entry["call_site"] = ["disabled"]
                else:
                    entry["call_site"] = [
                        entry["call_site"][i : i + call_site_length]
                        for i in range(0, len(entry["call_site"]), call_site_length)
                    ]

                task_status_length = 12
                entry["task_status"] = [
                    entry["task_status"][i : i + task_status_length]
                    for i in range(0, len(entry["task_status"]), task_status_length)
                ]
                num_lines = max(len(entry["call_site"]), len(entry["task_status"]))
            else:
                parts.append("\n")

            # One list per column; wrapped columns are already lists, the
            # others are boxed and then padded with "" to num_lines rows.
            columns = [
                value if isinstance(value, list) else [value]
                for value in (
                    entry["node_ip_address"],
                    entry["pid"],
                    entry["type"],
                    entry["call_site"],
                    entry["task_status"],
                    entry["attempt_number"],
                    entry["object_size"],
                    entry["reference_type"],
                    entry["object_ref"],
                )
            ]
            for column in columns:
                column.extend([""] * (num_lines - len(column)))
            for i in range(num_lines):
                row = [column[i] for column in columns]
                parts.append(object_ref_string.format(*row))
            parts.append("\n")

    parts.append(
        "To record callsite information for each ObjectRef created, set "
        "env variable RAY_record_ref_creation_sites=1\n\n"
    )

    return "".join(parts)
.venv/lib/python3.11/site-packages/ray/dashboard/modules/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/dashboard/modules/dashboard_sdk.py ADDED
@@ -0,0 +1,418 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import importlib
3
+ import json
4
+ import logging
5
+ import os
6
+ import ssl
7
+ import tempfile
8
+ from pathlib import Path
9
+ from typing import Any, Dict, List, Optional, Union
10
+
11
+ import packaging.version
12
+ import yaml
13
+
14
+ import ray
15
+ from ray._private.runtime_env.packaging import (
16
+ create_package,
17
+ get_uri_for_directory,
18
+ get_uri_for_package,
19
+ )
20
+ from ray._private.runtime_env.py_modules import upload_py_modules_if_needed
21
+ from ray._private.runtime_env.working_dir import upload_working_dir_if_needed
22
+ from ray._private.utils import split_address
23
+ from ray.autoscaler._private.cli_logger import cli_logger
24
+ from ray.dashboard.modules.job.common import uri_to_http_components
25
+ from ray.util.annotations import DeveloperAPI, PublicAPI
26
+
27
+ try:
28
+ import requests
29
+ except ImportError:
30
+ requests = None
31
+
32
+
33
+ logger = logging.getLogger(__name__)
34
+ logger.setLevel(logging.INFO)
35
+
36
+ # By default, connect to local cluster.
37
+ DEFAULT_DASHBOARD_ADDRESS = "http://localhost:8265"
38
+
39
+
40
def parse_runtime_env_args(
    runtime_env: Optional[str] = None,
    runtime_env_json: Optional[str] = None,
    working_dir: Optional[str] = None,
):
    """Build a runtime_env dictionary from the CLI options.

    Args:
        runtime_env: Path to a local YAML file holding a runtime_env.
        runtime_env_json: JSON-serialized runtime_env.
        working_dir: Working directory; when given it replaces any
            "working_dir" found in the other option.

    Returns:
        The resulting runtime_env; an empty dict when no option is given.

    Raises:
        ValueError: If both ``runtime_env`` and ``runtime_env_json`` are set.
    """
    if runtime_env is not None and runtime_env_json is not None:
        raise ValueError(
            "Only one of --runtime-env and --runtime-env-json can be provided."
        )

    final_runtime_env = {}
    if runtime_env is not None:
        with open(runtime_env, "r") as f:
            final_runtime_env = yaml.safe_load(f)
    elif runtime_env_json is not None:
        final_runtime_env = json.loads(runtime_env_json)

    # An empty YAML file and the JSON literal `null` both parse to None;
    # treat them as an empty runtime_env so the lookup below cannot fail and
    # callers always get a dict for those inputs.
    if final_runtime_env is None:
        final_runtime_env = {}

    if working_dir is not None:
        if "working_dir" in final_runtime_env:
            cli_logger.warning(
                "Overriding runtime_env working_dir with --working-dir option"
            )

        final_runtime_env["working_dir"] = working_dir

    return final_runtime_env
73
+
74
+
75
@dataclasses.dataclass
class ClusterInfo:
    """Connection details produced by address parsing for SubmissionClient."""

    # Address the client sends its requests to, including the scheme.
    address: str
    # Cookies attached to every request, if any.
    cookies: Optional[Dict[str, Any]] = None
    # Metadata the client keeps as its default metadata, if any.
    metadata: Optional[Dict[str, Any]] = None
    # HTTP headers attached to every request, if any.
    headers: Optional[Dict[str, Any]] = None
81
+
82
+
83
+ # TODO (shrekris-anyscale): renaming breaks compatibility, do NOT rename
84
def get_job_submission_client_cluster_info(
    address: str,
    # For backwards compatibility
    *,
    # only used in importlib case in parse_cluster_info, but needed
    # in function signature.
    create_cluster_if_needed: Optional[bool] = False,
    cookies: Optional[Dict[str, Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
    headers: Optional[Dict[str, Any]] = None,
    _use_tls: Optional[bool] = False,
) -> ClusterInfo:
    """Bundle the connection details used by SubmissionClient.

    Args:
        address: Address without the scheme/module prefix.
        create_cluster_if_needed: Accepted for signature compatibility with
            other implementations; this implementation does not use it.
        cookies: Cookies to carry over unchanged.
        metadata: Metadata to carry over unchanged.
        headers: Headers to carry over unchanged.
        _use_tls: When true the address is prefixed with "https://",
            otherwise with "http://".

    Returns:
        ClusterInfo holding the prefixed address plus the given cookies,
        metadata and headers.
    """
    if _use_tls:
        scheme = "https"
    else:
        scheme = "http"
    full_address = f"{scheme}://{address}"
    return ClusterInfo(
        address=full_address,
        cookies=cookies,
        metadata=metadata,
        headers=headers,
    )
121
+
122
+
123
def parse_cluster_info(
    address: Optional[str] = None,
    create_cluster_if_needed: bool = False,
    cookies: Optional[Dict[str, Any]] = None,
    metadata: Optional[Dict[str, Any]] = None,
    headers: Optional[Dict[str, Any]] = None,
) -> ClusterInfo:
    """Resolve an address into the ClusterInfo a client should use.

    When no address is given, the dashboard URL of the running Ray instance
    is used if there is one, otherwise ``DEFAULT_DASHBOARD_ADDRESS``. An
    address without "://" is treated as HTTP. For "http"/"https" the local
    ``get_job_submission_client_cluster_info`` is called; for any other
    prefix the module of that name is imported and its
    ``get_job_submission_client_cluster_info`` is called instead.

    Raises:
        ValueError: For the address "auto" or a "ray://" address.
        RuntimeError: If the module named by the prefix cannot be imported.
    """
    if address is None:
        webui_url = None
        if ray.is_initialized():
            webui_url = ray._private.worker.global_worker.node.address_info[
                "webui_url"
            ]
        if webui_url is not None:
            address = f"http://{webui_url}"
            logger.info(
                f"No address provided but Ray is running; using address {address}."
            )
        else:
            logger.info(
                f"No address provided, defaulting to {DEFAULT_DASHBOARD_ADDRESS}."
            )
            address = DEFAULT_DASHBOARD_ADDRESS

    if address == "auto":
        raise ValueError("Internal error: unexpected address 'auto'.")

    if "://" not in address:
        # Default to HTTP.
        logger.info(
            "No scheme (e.g. 'http://') or module string (e.g. 'ray://') "
            f"provided in address {address}, defaulting to HTTP."
        )
        address = f"http://{address}"

    module_string, inner_address = split_address(address)

    if module_string == "ray":
        raise ValueError(f"Internal error: unexpected Ray Client address {address}.")

    shared_kwargs = dict(
        create_cluster_if_needed=create_cluster_if_needed,
        cookies=cookies,
        metadata=metadata,
        headers=headers,
    )

    # If user passes http(s)://, go through normal parsing.
    if module_string in {"http", "https"}:
        return get_job_submission_client_cluster_info(
            inner_address,
            _use_tls=(module_string == "https"),
            **shared_kwargs,
        )

    # Try to dynamically import the function to get cluster info.
    try:
        module = importlib.import_module(module_string)
    except Exception:
        raise RuntimeError(
            f"Module: {module_string} does not exist.\n"
            f"This module was parsed from address: {address}"
        ) from None
    assert "get_job_submission_client_cluster_info" in dir(module), (
        f"Module: {module_string} does "
        "not have `get_job_submission_client_cluster_info`.\n"
        f"This module was parsed from address: {address}"
    )

    return module.get_job_submission_client_cluster_info(
        inner_address, **shared_kwargs
    )
197
+
198
+
199
class SubmissionClient:
    """Base HTTP client for the Ray dashboard / job server.

    Resolves the target address into a ``ClusterInfo`` and provides helpers
    for version checks, raw requests and package uploads.
    """

    def __init__(
        self,
        address: Optional[str] = None,
        create_cluster_if_needed: bool = False,
        cookies: Optional[Dict[str, Any]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        headers: Optional[Dict[str, Any]] = None,
        verify: Optional[Union[str, bool]] = True,
    ):
        """Resolve the cluster address and prepare request settings.

        Args:
            address: Address to connect to; resolved by ``parse_cluster_info``.
            create_cluster_if_needed: Forwarded to ``parse_cluster_info``.
            cookies: Cookies sent with every request.
            metadata: Default metadata kept on the client.
            headers: Headers sent with every request.
            verify: ``True``/``False`` to enable/disable certificate
                verification, or a path to a CA file or CA directory.

        Raises:
            FileNotFoundError: If ``verify`` is a path that does not exist.
        """
        # Remove any trailing slashes
        if address is not None and address.endswith("/"):
            requested_address = address
            address = address.rstrip("/")
            # Report the address as it was requested; logging the stripped
            # value would hide the slashes the message is talking about.
            logger.debug(
                "The submission address cannot contain trailing slashes. Removing "
                f'them from the requested submission address of "{requested_address}".'
            )

        cluster_info = parse_cluster_info(
            address, create_cluster_if_needed, cookies, metadata, headers
        )
        self._address = cluster_info.address
        self._cookies = cluster_info.cookies
        self._default_metadata = cluster_info.metadata or {}
        # Headers used for all requests sent to job server, optional and only
        # needed for cases like authentication to remote cluster.
        self._headers = cluster_info.headers
        # Set SSL verify parameter for the requests library and create an ssl_context
        # object when needed for the aiohttp library.
        self._verify = verify
        if isinstance(self._verify, str):
            if os.path.isdir(self._verify):
                cafile, capath = None, self._verify
            elif os.path.isfile(self._verify):
                cafile, capath = self._verify, None
            else:
                raise FileNotFoundError(
                    f"Path to CA certificates: '{self._verify}', does not exist."
                )
            self._ssl_context = ssl.create_default_context(cafile=cafile, capath=capath)
        else:
            if self._verify is False:
                self._ssl_context = False
            else:
                self._ssl_context = None

    def _check_connection_and_version(
        self, min_version: str = "1.9", version_error_message: Optional[str] = None
    ):
        """Run the version check against the default version endpoint."""
        self._check_connection_and_version_with_url(min_version, version_error_message)

    def _check_connection_and_version_with_url(
        self,
        min_version: str = "1.9",
        version_error_message: Optional[str] = None,
        url: str = "/api/version",
    ):
        """Verify the server is reachable and runs at least ``min_version``.

        Args:
            min_version: Lowest acceptable Ray version.
            version_error_message: Text appended to version errors; a default
                mentioning ``min_version`` is used when omitted.
            url: Endpoint queried for the version.

        Raises:
            RuntimeError: If the endpoint returns 404 or the reported
                version is older than ``min_version``.
            ConnectionError: If the request cannot connect.
        """
        if version_error_message is None:
            version_error_message = (
                f"Please ensure the cluster is running Ray {min_version} or higher."
            )

        try:
            r = self._do_request("GET", url)
            if r.status_code == 404:
                raise RuntimeError(
                    "Version check returned 404. " + version_error_message
                )
            r.raise_for_status()

            running_ray_version = r.json()["ray_version"]
            if packaging.version.parse(running_ray_version) < packaging.version.parse(
                min_version
            ):
                raise RuntimeError(
                    f"Ray version {running_ray_version} is running on the cluster. "
                    + version_error_message
                )
        except requests.exceptions.ConnectionError as err:
            raise ConnectionError(
                f"Failed to connect to Ray at address: {self._address}."
            ) from err

    def _raise_error(self, r: "requests.Response"):
        """Raise a RuntimeError describing the failed response ``r``."""
        raise RuntimeError(
            f"Request failed with status code {r.status_code}: {r.text}."
        )

    def _do_request(
        self,
        method: str,
        endpoint: str,
        *,
        data: Optional[bytes] = None,
        json_data: Optional[dict] = None,
        **kwargs,
    ) -> "requests.Response":
        """Perform the actual HTTP request

        Keyword arguments other than "cookies", "headers" are forwarded to the
        `requests.request()`.
        """
        url = self._address + endpoint
        logger.debug(f"Sending request to {url} with json data: {json_data or {}}.")
        return requests.request(
            method,
            url,
            cookies=self._cookies,
            data=data,
            json=json_data,
            headers=self._headers,
            verify=self._verify,
            **kwargs,
        )

    def _package_exists(
        self,
        package_uri: str,
    ) -> bool:
        """Return whether the server already has ``package_uri``.

        Raises:
            RuntimeError: For any status code other than 200 or 404.
        """
        protocol, package_name = uri_to_http_components(package_uri)
        r = self._do_request("GET", f"/api/packages/{protocol}/{package_name}")

        if r.status_code == 200:
            logger.debug(f"Package {package_uri} already exists.")
            return True
        elif r.status_code == 404:
            logger.debug(f"Package {package_uri} does not exist.")
            return False
        else:
            self._raise_error(r)

    def _upload_package(
        self,
        package_uri: str,
        package_path: str,
        include_parent_dir: Optional[bool] = False,
        excludes: Optional[List[str]] = None,
        is_file: bool = False,
    ) -> None:
        """Upload a package to the server under ``package_uri``.

        A directory is first packed into a temporary file; an existing file
        (``is_file=True``) is sent as is.

        Raises:
            RuntimeError: If the server does not answer with status 200.
        """
        logger.info(f"Uploading package {package_uri}.")
        with tempfile.TemporaryDirectory() as tmp_dir:
            protocol, package_name = uri_to_http_components(package_uri)
            if is_file:
                package_file = Path(package_path)
            else:
                package_file = Path(tmp_dir) / package_name
                create_package(
                    package_path,
                    package_file,
                    include_parent_dir=include_parent_dir,
                    excludes=excludes,
                )
            try:
                r = self._do_request(
                    "PUT",
                    f"/api/packages/{protocol}/{package_name}",
                    data=package_file.read_bytes(),
                )
                if r.status_code != 200:
                    self._raise_error(r)
            finally:
                # If the package is a user's existing file, don't delete it.
                if not is_file:
                    package_file.unlink()

    def _upload_package_if_needed(
        self,
        package_path: str,
        include_parent_dir: bool = False,
        excludes: Optional[List[str]] = None,
        is_file: bool = False,
    ) -> str:
        """Upload the package unless the server already has it.

        Returns:
            The URI computed for the package.
        """
        if is_file:
            package_uri = get_uri_for_package(Path(package_path))
        else:
            package_uri = get_uri_for_directory(package_path, excludes=excludes)

        if not self._package_exists(package_uri):
            self._upload_package(
                package_uri,
                package_path,
                include_parent_dir=include_parent_dir,
                excludes=excludes,
                is_file=is_file,
            )
        else:
            logger.info(f"Package {package_uri} already exists, skipping upload.")

        return package_uri

    def _upload_working_dir_if_needed(self, runtime_env: Dict[str, Any]):
        """Hand ``runtime_env`` to ``upload_working_dir_if_needed`` with an
        uploader that sends packages through this client."""

        def _upload_fn(working_dir, excludes, is_file=False):
            self._upload_package_if_needed(
                working_dir,
                include_parent_dir=False,
                excludes=excludes,
                is_file=is_file,
            )

        upload_working_dir_if_needed(runtime_env, upload_fn=_upload_fn)

    def _upload_py_modules_if_needed(self, runtime_env: Dict[str, Any]):
        """Hand ``runtime_env`` to ``upload_py_modules_if_needed`` with an
        uploader that sends packages (including the parent directory)
        through this client."""

        def _upload_fn(module_path, excludes, is_file=False):
            self._upload_package_if_needed(
                module_path, include_parent_dir=True, excludes=excludes, is_file=is_file
            )

        upload_py_modules_if_needed(runtime_env, upload_fn=_upload_fn)

    @PublicAPI(stability="beta")
    def get_version(self) -> str:
        """Return the "version" field reported by ``/api/version``.

        Raises:
            RuntimeError: If the server does not answer with status 200.
        """
        r = self._do_request("GET", "/api/version")
        if r.status_code == 200:
            return r.json().get("version")
        else:
            self._raise_error(r)

    @DeveloperAPI
    def get_address(self) -> str:
        """Return the resolved address requests are sent to."""
        return self._address
.venv/lib/python3.11/site-packages/ray/dashboard/modules/data/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (199 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/data/__pycache__/data_head.cpython-311.pyc ADDED
Binary file (9.36 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/data/data_head.py ADDED
@@ -0,0 +1,167 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import logging
3
+ import os
4
+ from enum import Enum
5
+ from urllib.parse import quote
6
+
7
+ import aiohttp
8
+ from aiohttp.web import Request, Response
9
+
10
+ import ray.dashboard.optional_utils as optional_utils
11
+ import ray.dashboard.utils as dashboard_utils
12
+ from ray.dashboard.modules.metrics.metrics_head import (
13
+ DEFAULT_PROMETHEUS_HEADERS,
14
+ DEFAULT_PROMETHEUS_HOST,
15
+ PROMETHEUS_HEADERS_ENV_VAR,
16
+ PROMETHEUS_HOST_ENV_VAR,
17
+ PrometheusQueryError,
18
+ parse_prom_headers,
19
+ )
20
+
21
+ logger = logging.getLogger(__name__)
22
+ logger.setLevel(logging.INFO)
23
+
24
+
25
+ # Window and sampling rate used for certain Prometheus queries.
26
+ # Datapoints up until `MAX_TIME_WINDOW` ago are queried at `SAMPLE_RATE` intervals.
27
+ MAX_TIME_WINDOW = "1h"
28
+ SAMPLE_RATE = "1s"
29
+
30
+
31
class PrometheusQuery(Enum):
    """Enum to store types of Prometheus queries for a given metric and grouping."""

    # Each member is a (result key, query template) pair. The template takes
    # three positional str.format() arguments: metric name, session name and
    # the label(s) to group by; doubled braces are literal braces.
    VALUE = ("value", "sum({}{{SessionName='{}'}}) by ({})")
    # Same aggregation wrapped in max_over_time over the configured
    # MAX_TIME_WINDOW / SAMPLE_RATE range.
    MAX = (
        "max",
        "max_over_time(sum({}{{SessionName='{}'}}) by ({})["
        + f"{MAX_TIME_WINDOW}:{SAMPLE_RATE}])",
    )
40
+
41
+
42
+ DATASET_METRICS = {
43
+ "ray_data_output_rows": (PrometheusQuery.MAX,),
44
+ "ray_data_spilled_bytes": (PrometheusQuery.MAX,),
45
+ "ray_data_current_bytes": (PrometheusQuery.VALUE, PrometheusQuery.MAX),
46
+ "ray_data_cpu_usage_cores": (PrometheusQuery.VALUE, PrometheusQuery.MAX),
47
+ "ray_data_gpu_usage_cores": (PrometheusQuery.VALUE, PrometheusQuery.MAX),
48
+ }
49
+
50
+
51
class DataHead(dashboard_utils.DashboardHeadModule):
    """Dashboard head module serving dataset information with metrics."""

    def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig):
        """Read the Prometheus host and headers from the environment,
        falling back to the defaults from the metrics module."""
        super().__init__(config)
        self.prometheus_host = os.environ.get(
            PROMETHEUS_HOST_ENV_VAR, DEFAULT_PROMETHEUS_HOST
        )
        self.prometheus_headers = parse_prom_headers(
            os.environ.get(
                PROMETHEUS_HEADERS_ENV_VAR,
                DEFAULT_PROMETHEUS_HEADERS,
            )
        )

    @optional_utils.DashboardHeadRouteTable.get("/api/data/datasets/{job_id}")
    @optional_utils.init_ray_and_catch_exceptions()
    async def get_datasets(self, req: Request) -> Response:
        """Return the datasets of a job, enriched with Prometheus metrics.

        The response body is JSON of the form ``{"datasets": [...]}``, sorted
        by descending "start_time". If Prometheus cannot be reached the
        metric values keep their initial 0. Any other failure produces a 503
        response whose body is the exception text.
        """
        job_id = req.match_info["job_id"]

        try:
            from ray.data._internal.stats import _get_or_create_stats_actor

            _stats_actor = _get_or_create_stats_actor()
            datasets = await _stats_actor.get_datasets.remote(job_id)
            # Initializes dataset metric values
            for dataset in datasets:
                for metric, queries in DATASET_METRICS.items():
                    datasets[dataset][metric] = {query.value[0]: 0 for query in queries}
                    for operator in datasets[dataset]["operators"]:
                        datasets[dataset]["operators"][operator][metric] = {
                            query.value[0]: 0 for query in queries
                        }
            # Query dataset metric values from prometheus
            try:
                # TODO (Zandew): store results of completed datasets in stats actor.
                for metric, queries in DATASET_METRICS.items():
                    for query in queries:
                        query_name, prom_query = query.value
                        # Dataset level
                        dataset_result = await self._query_prometheus(
                            prom_query.format(metric, self.session_name, "dataset")
                        )
                        for res in dataset_result["data"]["result"]:
                            dataset, value = res["metric"]["dataset"], res["value"][1]
                            if dataset in datasets:
                                datasets[dataset][metric][query_name] = value

                        # Operator level
                        operator_result = await self._query_prometheus(
                            prom_query.format(
                                metric, self.session_name, "dataset, operator"
                            )
                        )
                        for res in operator_result["data"]["result"]:
                            dataset, operator, value = (
                                res["metric"]["dataset"],
                                res["metric"]["operator"],
                                res["value"][1],
                            )
                            # Check if dataset/operator is in current _StatsActor scope.
                            # Prometheus server may contain metrics from previous
                            # cluster if not reset.
                            if (
                                dataset in datasets
                                and operator in datasets[dataset]["operators"]
                            ):
                                datasets[dataset]["operators"][operator][metric][
                                    query_name
                                ] = value
            except aiohttp.client_exceptions.ClientConnectorError:
                # Prometheus server may not be running,
                # leave these values blank and return other data
                logger.exception(
                    "Exception occurred while querying Prometheus. "
                    "The Prometheus server may not be running."
                )
            # Flatten response
            for dataset in datasets:
                datasets[dataset]["operators"] = [
                    {"operator": name, **stats}
                    for name, stats in datasets[dataset]["operators"].items()
                ]
            datasets = [
                {"dataset": name, **info} for name, info in datasets.items()
            ]
            # Sort by descending start time
            datasets.sort(key=lambda x: x["start_time"], reverse=True)
            return Response(
                text=json.dumps({"datasets": datasets}),
                content_type="application/json",
            )
        except Exception as e:
            logger.exception("Exception occurred while getting datasets.")
            return Response(
                status=503,
                text=str(e),
            )

    async def run(self, server):
        """No background work is started for this module."""
        pass

    @staticmethod
    def is_minimal_module():
        """Report that this module is not a minimal module."""
        return False

    async def _query_prometheus(self, query):
        """Run an instant query against Prometheus and return the JSON body.

        Raises:
            PrometheusQueryError: If the response status is not 200.
        """
        async with self.http_session.get(
            f"{self.prometheus_host}/api/v1/query?query={quote(query)}",
            headers=self.prometheus_headers,
        ) as resp:
            if resp.status == 200:
                prom_data = await resp.json()
                return prom_data

            message = await resp.text()
            raise PrometheusQueryError(resp.status, message)
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/cli.cpython-311.pyc ADDED
Binary file (20.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/cli_utils.cpython-311.pyc ADDED
Binary file (2.57 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_agent.cpython-311.pyc ADDED
Binary file (11.8 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_head.cpython-311.pyc ADDED
Binary file (32.1 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_manager.cpython-311.pyc ADDED
Binary file (28.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/job_supervisor.cpython-311.pyc ADDED
Binary file (22.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/__pycache__/pydantic_models.cpython-311.pyc ADDED
Binary file (5.29 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/cli.py ADDED
@@ -0,0 +1,521 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import pprint
4
+ import sys
5
+ import time
6
+ from subprocess import list2cmdline
7
+ from typing import Any, Dict, Optional, Tuple, Union
8
+
9
+ import click
10
+
11
+ import ray._private.ray_constants as ray_constants
12
+ from ray._private.storage import _load_class
13
+ from ray._private.utils import (
14
+ get_or_create_event_loop,
15
+ parse_metadata_json,
16
+ parse_resources_json,
17
+ )
18
+ from ray.autoscaler._private.cli_logger import add_click_logging_options, cf, cli_logger
19
+ from ray.dashboard.modules.dashboard_sdk import parse_runtime_env_args
20
+ from ray.dashboard.modules.job.cli_utils import add_common_job_options
21
+ from ray.dashboard.modules.job.utils import redact_url_password
22
+ from ray.job_submission import JobStatus, JobSubmissionClient
23
+ from ray.util.annotations import PublicAPI
24
+
25
+
26
def _get_sdk_client(
    address: Optional[str],
    create_cluster_if_needed: bool = False,
    headers: Optional[str] = None,
    verify: Union[bool, str] = True,
) -> JobSubmissionClient:
    """Create a JobSubmissionClient and log the address it resolved to.

    ``headers`` is the raw JSON string from the CLI; it is parsed by
    ``_handle_headers`` before being handed to the client. The logged
    address is passed through ``redact_url_password`` first.
    """
    parsed_headers = _handle_headers(headers)
    sdk_client = JobSubmissionClient(
        address,
        create_cluster_if_needed,
        headers=parsed_headers,
        verify=verify,
    )
    resolved_address = sdk_client.get_address()
    cli_logger.labeled_value(
        "Job submission server address", redact_url_password(resolved_address)
    )
    return sdk_client
43
+
44
+
45
def _handle_headers(headers: Optional[str]) -> Optional[Dict[str, Any]]:
    """Parse request headers given as a JSON string.

    Args:
        headers: JSON text. When ``None``, the ``RAY_JOB_HEADERS``
            environment variable is used if it is set.

    Returns:
        The parsed JSON value, or ``None`` when no headers were supplied.

    Raises:
        ValueError: If the text is not valid JSON; the parse error is
            attached as the cause.
    """
    if headers is None and "RAY_JOB_HEADERS" in os.environ:
        headers = os.environ["RAY_JOB_HEADERS"]
    if headers is None:
        return None
    try:
        return json.loads(headers)
    except (TypeError, ValueError) as exc:
        # json.JSONDecodeError is a ValueError subclass. The message is built
        # from adjacent literals so no source indentation ends up in it.
        raise ValueError(
            "Failed to parse headers into JSON.\n"
            f'Expected format: {{"KEY": "VALUE"}}, got {headers}, {exc}'
        ) from exc
59
+
60
+
61
def _log_big_success_msg(success_msg):
    """Log ``success_msg`` as a success, framed by dashed rules and blank lines."""
    rule = "-" * len(success_msg)
    cli_logger.newline()
    for line in (rule, success_msg, rule):
        cli_logger.success(line)
    cli_logger.newline()
67
+
68
+
69
def _log_big_error_msg(success_msg):
    """Log the given message as an error, framed by dashed rules and blank lines.

    The parameter keeps its historical name ``success_msg``; it carries the
    error text.
    """
    rule = "-" * len(success_msg)
    cli_logger.newline()
    for line in (rule, success_msg, rule):
        cli_logger.error(line)
    cli_logger.newline()
75
+
76
+
77
def _log_job_status(client: JobSubmissionClient, job_id: str) -> JobStatus:
    """Fetch the job's info, log a line matching its status, and return the status.

    Succeeded and stopped jobs get a single line. For every other status the
    status message is printed as well when the job info carries one.
    """
    info = client.get_job_info(job_id)
    status = info.status

    if status == JobStatus.SUCCEEDED:
        _log_big_success_msg(f"Job '{job_id}' succeeded")
        return info.status
    if status == JobStatus.STOPPED:
        cli_logger.warning(f"Job '{job_id}' was stopped")
        return info.status

    if status == JobStatus.FAILED:
        _log_big_error_msg(f"Job '{job_id}' failed")
    else:
        # Catch-all.
        cli_logger.print(f"Status for job '{job_id}': {info.status}")
    if info.message is not None:
        cli_logger.print(f"Status message: {info.message}", no_format=True)
    return info.status
93
+
94
+
95
async def _tail_logs(client: JobSubmissionClient, job_id: str) -> JobStatus:
    """Print the job's log output as it arrives, then log and return its status."""
    log_stream = client.tail_job_logs(job_id)
    async for chunk in log_stream:
        print(chunk, end="")

    final_status = _log_job_status(client, job_id)
    return final_status
100
+
101
+
102
@click.group("job")
def job_cli_group():
    """Submit, stop, delete, or list Ray jobs."""
    # Container for the job sub-commands registered on it with
    # @job_cli_group.command(); the group itself does no work.
    # NOTE(review): click presumably shows the docstring above as the group's
    # help text, so treat it as user-facing - confirm before rewording.
    pass
106
+
107
+
108
# `ray job submit`: the docstring below is the command's help text.
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. Can also be specified "
        "using the RAY_ADDRESS environment variable."
    ),
)
@click.option(
    "--job-id",
    type=str,
    default=None,
    required=False,
    help=("DEPRECATED: Use `--submission-id` instead."),
)
@click.option(
    "--submission-id",
    type=str,
    default=None,
    required=False,
    help=(
        "Submission ID to specify for the job. "
        "If not provided, one will be generated."
    ),
)
@click.option(
    "--runtime-env",
    type=str,
    default=None,
    required=False,
    help="Path to a local YAML file containing a runtime_env definition.",
)
@click.option(
    "--runtime-env-json",
    type=str,
    default=None,
    required=False,
    help="JSON-serialized runtime_env dictionary.",
)
@click.option(
    "--working-dir",
    type=str,
    default=None,
    required=False,
    help=(
        "Directory containing files that your job will run in. Can be a "
        "local directory or a remote URI to a .zip file (S3, GS, HTTP). "
        "If specified, this overrides the option in `--runtime-env`."
    ),
)
@click.option(
    "--metadata-json",
    type=str,
    default=None,
    required=False,
    help="JSON-serialized dictionary of metadata to attach to the job.",
)
@click.option(
    "--entrypoint-num-cpus",
    required=False,
    type=float,
    help="the quantity of CPU cores to reserve for the entrypoint command, "
    "separately from any tasks or actors that are launched by it",
)
@click.option(
    "--entrypoint-num-gpus",
    required=False,
    type=float,
    help="the quantity of GPUs to reserve for the entrypoint command, "
    "separately from any tasks or actors that are launched by it",
)
@click.option(
    "--entrypoint-memory",
    required=False,
    type=int,
    help="the amount of memory to reserve "
    "for the entrypoint command, separately from any tasks or actors that are "
    "launched by it",
)
@click.option(
    "--entrypoint-resources",
    required=False,
    type=str,
    help="a JSON-serialized dictionary mapping resource name to resource quantity "
    "describing resources to reserve for the entrypoint command, "
    "separately from any tasks or actors that are launched by it",
)
@click.option(
    "--no-wait",
    is_flag=True,
    type=bool,
    default=False,
    help="If set, will not stream logs and wait for the job to exit.",
)
@add_common_job_options
@add_click_logging_options
@click.argument("entrypoint", nargs=-1, required=True, type=click.UNPROCESSED)
@PublicAPI
def submit(
    address: Optional[str],
    job_id: Optional[str],
    submission_id: Optional[str],
    runtime_env: Optional[str],
    runtime_env_json: Optional[str],
    metadata_json: Optional[str],
    working_dir: Optional[str],
    entrypoint: Tuple[str, ...],
    entrypoint_num_cpus: Optional[Union[int, float]],
    entrypoint_num_gpus: Optional[Union[int, float]],
    entrypoint_memory: Optional[int],
    entrypoint_resources: Optional[str],
    no_wait: bool,
    verify: Union[bool, str],
    headers: Optional[str],
):
    """Submits a job to be run on the cluster.

    By default (if --no-wait is not set), streams logs to stdout until the job finishes.
    If the job succeeded, exits with 0. If it failed, exits with 1.

    Example:
        `ray job submit -- python my_script.py --arg=val`
    """
    # --job-id is still accepted, but only as a deprecated alias.
    if job_id:
        cli_logger.warning(
            "--job-id option is deprecated. Please use --submission-id instead."
        )
    # From here on `entrypoint_resources` and `metadata_json` hold the parsed
    # values returned by the helpers, no longer the raw JSON strings.
    if entrypoint_resources is not None:
        entrypoint_resources = parse_resources_json(
            entrypoint_resources, cli_logger, cf, command_arg="entrypoint-resources"
        )
    if metadata_json is not None:
        metadata_json = parse_metadata_json(
            metadata_json, cli_logger, cf, command_arg="metadata-json"
        )

    # --submission-id wins; the deprecated --job-id is only a fallback.
    submission_id = submission_id or job_id

    if ray_constants.RAY_JOB_SUBMIT_HOOK in os.environ:
        # Submit all args as **kwargs per the JOB_SUBMIT_HOOK contract.
        # Note: the hook receives the resolved submission id under both
        # `job_id` and `submission_id`, and normal submission continues
        # after the hook returns.
        _load_class(os.environ[ray_constants.RAY_JOB_SUBMIT_HOOK])(
            address=address,
            job_id=submission_id,
            submission_id=submission_id,
            runtime_env=runtime_env,
            runtime_env_json=runtime_env_json,
            metadata_json=metadata_json,
            working_dir=working_dir,
            entrypoint=entrypoint,
            entrypoint_num_cpus=entrypoint_num_cpus,
            entrypoint_num_gpus=entrypoint_num_gpus,
            entrypoint_memory=entrypoint_memory,
            entrypoint_resources=entrypoint_resources,
            no_wait=no_wait,
        )

    client = _get_sdk_client(
        address, create_cluster_if_needed=True, headers=headers, verify=verify
    )

    # Combine the three runtime_env-related options into a single value.
    final_runtime_env = parse_runtime_env_args(
        runtime_env=runtime_env,
        runtime_env_json=runtime_env_json,
        working_dir=working_dir,
    )
    # `job_id` is rebound to the id returned by the client; the entrypoint
    # tuple is joined into one command line with list2cmdline.
    job_id = client.submit_job(
        entrypoint=list2cmdline(entrypoint),
        submission_id=submission_id,
        runtime_env=final_runtime_env,
        metadata=metadata_json,
        entrypoint_num_cpus=entrypoint_num_cpus,
        entrypoint_num_gpus=entrypoint_num_gpus,
        entrypoint_memory=entrypoint_memory,
        entrypoint_resources=entrypoint_resources,
    )

    _log_big_success_msg(f"Job '{job_id}' submitted successfully")

    # Print follow-up commands the user can run against this job.
    with cli_logger.group("Next steps"):
        cli_logger.print("Query the logs of the job:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job logs {job_id}"))

        cli_logger.print("Query the status of the job:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job status {job_id}"))

        cli_logger.print("Request the job to be stopped:")
        with cli_logger.indented():
            cli_logger.print(cf.bold(f"ray job stop {job_id}"))

    cli_logger.newline()
    # The version is queried even when --no-wait is set.
    sdk_version = client.get_version()
    # sdk version 0 does not have log streaming
    if not no_wait:
        if int(sdk_version) > 0:
            cli_logger.print(
                "Tailing logs until the job exits (disable with --no-wait):"
            )
            job_status = get_or_create_event_loop().run_until_complete(
                _tail_logs(client, job_id)
            )
            # Only FAILED produces a non-zero exit code; any other status
            # returned by _tail_logs falls through to a normal return.
            if job_status == JobStatus.FAILED:
                sys.exit(1)
        else:
            cli_logger.warning(
                "Tailing logs is not enabled for job sdk client version "
                f"{sdk_version}. Please upgrade Ray to the latest version "
                "for this feature."
            )
321
+
322
+
323
# `ray job status <job-id>`: the docstring below is the command's help text.
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. Can also be specified "
        "using the `RAY_ADDRESS` environment variable."
    ),
)
@click.argument("job-id", type=str)
@add_common_job_options
@add_click_logging_options
@PublicAPI(stability="stable")
def status(
    address: Optional[str],
    job_id: str,
    headers: Optional[str],
    verify: Union[bool, str],
):
    """Queries for the current status of a job.

    Example:
        `ray job status <my_job_id>`
    """
    client = _get_sdk_client(address, headers=headers, verify=verify)
    # _log_job_status both logs and returns the status; the return value is
    # not used by this command.
    _log_job_status(client, job_id)
351
+
352
+
353
# `ray job stop <job-id>`: the docstring below is the command's help text.
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. Can also be specified "
        "using the `RAY_ADDRESS` environment variable."
    ),
)
@click.option(
    "--no-wait",
    is_flag=True,
    type=bool,
    default=False,
    help="If set, will not wait for the job to exit.",
)
@click.argument("job-id", type=str)
@add_common_job_options
@add_click_logging_options
@PublicAPI(stability="stable")
def stop(
    address: Optional[str],
    no_wait: bool,
    job_id: str,
    headers: Optional[str],
    verify: Union[bool, str],
):
    """Attempts to stop a job.

    Example:
        `ray job stop <my_job_id>`
    """
    client = _get_sdk_client(address, headers=headers, verify=verify)
    cli_logger.print(f"Attempting to stop job '{job_id}'")
    client.stop_job(job_id)

    # With --no-wait the command ends as soon as the stop request is sent.
    if no_wait:
        return

    cli_logger.print(f"Waiting for job '{job_id}' to exit (disable with --no-wait):")

    # Poll once per second until the job reports one of these statuses.
    finished_states = {JobStatus.STOPPED, JobStatus.SUCCEEDED, JobStatus.FAILED}
    current = client.get_job_status(job_id)
    while current not in finished_states:
        cli_logger.print(f"Job has not exited yet. Status: {current}")
        time.sleep(1)
        current = client.get_job_status(job_id)

    _log_job_status(client, job_id)
406
+
407
+
408
# `ray job delete <job-id>`: the docstring below is the command's help text.
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. Can also be specified "
        "using the RAY_ADDRESS environment variable."
    ),
)
@click.argument("job-id", type=str)
@add_common_job_options
@add_click_logging_options
@PublicAPI(stability="stable")
def delete(
    address: Optional[str],
    job_id: str,
    headers: Optional[str],
    verify: Union[bool, str],
):
    """Deletes a stopped job and its associated data from memory.

    Only supported for jobs that are already in a terminal state.
    Fails with exit code 1 if the job is not already stopped.
    Does not delete job logs from disk.
    Submitting a job with the same submission ID as a previously
    deleted job is not supported and may lead to unexpected behavior.

    Example:
        ray job delete <my_job_id>
    """
    client = _get_sdk_client(address, headers=headers, verify=verify)
    # The success line below is only reached if delete_job returns normally;
    # the return value of delete_job is not inspected here.
    client.delete_job(job_id)
    cli_logger.print(f"Job '{job_id}' deleted successfully")
443
+
444
+
445
# `ray job logs <job-id>`: the docstring below is the command's help text.
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. Can also be specified "
        "using the RAY_ADDRESS environment variable."
    ),
)
@click.argument("job-id", type=str)
@click.option(
    "-f",
    "--follow",
    is_flag=True,
    type=bool,
    default=False,
    help="If set, follow the logs (like `tail -f`).",
)
@add_common_job_options
@add_click_logging_options
@PublicAPI(stability="stable")
def logs(
    address: Optional[str],
    job_id: str,
    follow: bool,
    headers: Optional[str],
    verify: Union[bool, str],
):
    """Gets the logs of a job.

    Example:
        `ray job logs <my_job_id>`
    """
    client = _get_sdk_client(address, headers=headers, verify=verify)
    sdk_version = client.get_version()

    if not follow:
        # One-shot dump. no_format=True because log text may contain
        # unescaped "{" and "}" and the CLILogger calls str.format().
        cli_logger.print(client.get_job_logs(job_id), end="", no_format=True)
        return

    # Following requires log streaming, which sdk version 0 did not have.
    streaming_supported = int(sdk_version) > 0
    if streaming_supported:
        get_or_create_event_loop().run_until_complete(_tail_logs(client, job_id))
    else:
        cli_logger.warning(
            "Tailing logs is not enabled for the Jobs SDK client version "
            f"{sdk_version}. Please upgrade Ray to latest version "
            "for this feature."
        )
496
+
497
+
498
# `ray job list`: the docstring below is the command's help text.
@job_cli_group.command()
@click.option(
    "--address",
    type=str,
    default=None,
    required=False,
    help=(
        "Address of the Ray cluster to connect to. Can also be specified "
        "using the RAY_ADDRESS environment variable."
    ),
)
@add_common_job_options
@add_click_logging_options
@PublicAPI(stability="stable")
def list(address: Optional[str], headers: Optional[str], verify: Union[bool, str]):
    """Lists all running jobs and their information.

    Example:
        `ray job list`
    """
    client = _get_sdk_client(address, headers=headers, verify=verify)
    rendered_jobs = pprint.pformat(client.list_jobs())
    # Set no_format to True because the pretty-printed job info may contain
    # unescaped "{" and "}" and the CLILogger calls str.format().
    cli_logger.print(rendered_jobs, no_format=True)
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/cli_utils.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import functools
2
+ from typing import Union
3
+
4
+ import click
5
+
6
+
7
def bool_cast(string: str) -> Union[bool, str]:
    """Interpret ``string`` as a boolean when it spells one.

    "true"/"false" are matched case-insensitively and "1"/"0" literally.
    Any other input is handed back unchanged (original casing preserved).
    """
    recognized = {"true": True, "1": True, "false": False, "0": False}
    return recognized.get(string.lower(), string)
15
+
16
+
17
class BoolOrStringParam(click.ParamType):
    """Click parameter type that yields either a boolean or a string."""

    name = "BOOL | TEXT"

    def convert(self, value, param, ctx):
        # Values that are already booleans pass through untouched; everything
        # else is run through bool_cast, which returns a bool for boolean-like
        # text and the original string otherwise.
        return value if isinstance(value, bool) else bool_cast(value)
27
+
28
+
29
def add_common_job_options(func):
    """Decorator for adding CLI flags shared by all `ray job` commands.

    Adds two options to the wrapped command:

    * ``--verify``: parsed by ``BoolOrStringParam`` (boolean or text),
      defaults to ``True``.
    * ``--headers``: an optional string, defaults to ``None``.

    Args:
        func: The command callback to decorate.

    Returns:
        A wrapper that forwards all arguments to ``func`` and carries the
        two click options above.
    """

    @click.option(
        "--verify",
        default=True,
        show_default=True,
        type=BoolOrStringParam(),
        help=(
            "Boolean indication to verify the server's TLS certificate or a path to"
            " a file or directory of trusted certificates."
        ),
    )
    @click.option(
        "--headers",
        required=False,
        type=str,
        default=None,
        # Fixed help text: the two sentences used to run together
        # ("Cluster.please") and "formatting" was duplicated.
        help=(
            "Used to pass headers through http/s to the Ray Cluster. "
            'Please follow JSON formatting: {"key": "value"}'
        ),
    )
    @functools.wraps(func)
    def wrapper(*args, **kwargs):
        return func(*args, **kwargs)

    return wrapper
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/common.py ADDED
@@ -0,0 +1,538 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import time
5
+ from dataclasses import asdict, dataclass, replace
6
+ from enum import Enum
7
+ from pathlib import Path
8
+ from typing import Any, Dict, Optional, Tuple, Union
9
+
10
+ from ray._private import ray_constants
11
+ from ray._private.event.export_event_logger import (
12
+ check_export_api_enabled,
13
+ get_export_event_logger,
14
+ )
15
+ from ray._private.gcs_utils import GcsAioClient
16
+ from ray._private.runtime_env.packaging import parse_uri
17
+ from ray.core.generated.export_event_pb2 import ExportEvent
18
+ from ray.core.generated.export_submission_job_event_pb2 import (
19
+ ExportSubmissionJobEventData,
20
+ )
21
+ from ray.util.annotations import PublicAPI
22
+
23
+ # NOTE(edoakes): these constants should be considered a public API because
24
+ # they're exposed in the snapshot API.
25
+ JOB_ID_METADATA_KEY = "job_submission_id"
26
+ JOB_NAME_METADATA_KEY = "job_name"
27
+ JOB_ACTOR_NAME_TEMPLATE = (
28
+ f"{ray_constants.RAY_INTERNAL_NAMESPACE_PREFIX}job_actor_" + "{job_id}"
29
+ )
30
+ # In order to get information about SupervisorActors launched by different jobs,
31
+ # they must be set to the same namespace.
32
+ SUPERVISOR_ACTOR_RAY_NAMESPACE = "SUPERVISOR_ACTOR_RAY_NAMESPACE"
33
+ JOB_LOGS_PATH_TEMPLATE = "job-driver-{submission_id}.log"
34
+
35
+ logger = logging.getLogger(__name__)
36
+
37
+
38
@PublicAPI(stability="stable")
class JobStatus(str, Enum):
    """The set of statuses a submitted job can be in."""

    #: The job has not started yet, likely waiting for the runtime_env to be set up.
    PENDING = "PENDING"
    #: The job is currently running.
    RUNNING = "RUNNING"
    #: The job was intentionally stopped by the user.
    STOPPED = "STOPPED"
    #: The job finished successfully.
    SUCCEEDED = "SUCCEEDED"
    #: The job failed.
    FAILED = "FAILED"

    def __str__(self) -> str:
        # Render as the bare value (e.g. "RUNNING"), not "JobStatus.RUNNING".
        return str(self.value)

    def is_terminal(self) -> bool:
        """Tell whether this status is a final one.

        A terminal status cannot transition to any other status; the
        terminal statuses are "STOPPED", "SUCCEEDED", and "FAILED".

        Returns:
            True if this status is terminal, otherwise False.
        """
        terminal_members = (
            JobStatus.STOPPED,
            JobStatus.SUCCEEDED,
            JobStatus.FAILED,
        )
        return self in terminal_members
66
+
67
+
68
+ # TODO(aguo): Convert to pydantic model
69
@PublicAPI(stability="stable")
@dataclass
class JobInfo:
    """A class for recording information associated with a job and its execution.

    Please keep this in sync with the JobsAPIInfo proto in src/ray/protobuf/gcs.proto.
    """

    #: The status of the job.
    status: JobStatus
    #: The entrypoint command for this job.
    entrypoint: str
    #: A message describing the status in more detail.
    message: Optional[str] = None
    # TODO(architkulkarni): Populate this field with e.g. Runtime env setup failure,
    #: Internal error, user script error
    error_type: Optional[str] = None
    #: The time when the job was started.  A Unix timestamp in ms.
    start_time: Optional[int] = None
    #: The time when the job moved into a terminal state.  A Unix timestamp in ms.
    end_time: Optional[int] = None
    #: Arbitrary user-provided metadata for the job.
    metadata: Optional[Dict[str, str]] = None
    #: The runtime environment for the job.
    runtime_env: Optional[Dict[str, Any]] = None
    #: The quantity of CPU cores to reserve for the entrypoint command.
    entrypoint_num_cpus: Optional[Union[int, float]] = None
    #: The number of GPUs to reserve for the entrypoint command.
    entrypoint_num_gpus: Optional[Union[int, float]] = None
    #: The amount of memory for workers requesting memory for the entrypoint command.
    entrypoint_memory: Optional[int] = None
    #: The quantity of various custom resources to reserve for the entrypoint command.
    entrypoint_resources: Optional[Dict[str, float]] = None
    #: Driver agent http address
    driver_agent_http_address: Optional[str] = None
    #: The node id that driver running on. It will be None only when the job status
    # is PENDING, and this field will not be deleted or modified even if the driver dies
    driver_node_id: Optional[str] = None
    #: The driver process exit code after the driver executed. Return None if driver
    #: doesn't finish executing
    driver_exit_code: Optional[int] = None

    def __post_init__(self):
        """Normalize ``status`` and fill in a default ``message`` if none was given."""
        # Accept plain strings for status (e.g. when built from parsed JSON).
        if isinstance(self.status, str):
            self.status = JobStatus(self.status)
        if self.message is None:
            if self.status == JobStatus.PENDING:
                self.message = "Job has not started yet."
                # Hint at likely causes of the wait, based on what was requested.
                if any(
                    [
                        self.entrypoint_num_cpus is not None
                        and self.entrypoint_num_cpus > 0,
                        self.entrypoint_num_gpus is not None
                        and self.entrypoint_num_gpus > 0,
                        self.entrypoint_memory is not None
                        and self.entrypoint_memory > 0,
                        self.entrypoint_resources not in [None, {}],
                    ]
                ):
                    self.message += (
                        " It may be waiting for resources "
                        "(CPUs, GPUs, memory, custom resources) to become available."
                    )
                if self.runtime_env not in [None, {}]:
                    self.message += (
                        " It may be waiting for the runtime environment to be set up."
                    )
            elif self.status == JobStatus.RUNNING:
                self.message = "Job is currently running."
            elif self.status == JobStatus.STOPPED:
                self.message = "Job was intentionally stopped."
            elif self.status == JobStatus.SUCCEEDED:
                self.message = "Job finished successfully."
            elif self.status == JobStatus.FAILED:
                self.message = "Job failed."

    def to_json(self) -> Dict[str, Any]:
        """Convert this object to a JSON-serializable dictionary.

        Note that the runtime_env field is converted to a JSON-serialized string
        and the field is renamed to runtime_env_json. When runtime_env is None
        the key is dropped and no runtime_env_json key is emitted.

        Returns:
            A JSON-serializable dictionary representing the JobInfo object.
        """

        json_dict = asdict(self)

        # Convert enum values to strings.
        json_dict["status"] = str(json_dict["status"])

        # Convert runtime_env to a JSON-serialized string.
        if "runtime_env" in json_dict:
            if json_dict["runtime_env"] is not None:
                json_dict["runtime_env_json"] = json.dumps(json_dict["runtime_env"])
            del json_dict["runtime_env"]

        # Assert that the dictionary is JSON-serializable.
        json.dumps(json_dict)

        return json_dict

    @classmethod
    def from_json(cls, json_dict: Dict[str, Any]) -> "JobInfo":
        """Build a JobInfo from a JSON dictionary.

        Note that the runtime_env_json field is converted to a dictionary and
        the field is renamed to runtime_env. The input dictionary is left
        unmodified.

        Args:
            json_dict: A JSON dictionary to use to initialize the JobInfo object.

        Returns:
            The JobInfo described by ``json_dict``.
        """
        # Work on a shallow copy so the caller's dictionary is not mutated.
        fields = dict(json_dict)

        # Convert enum values to enum objects.
        fields["status"] = JobStatus(fields["status"])

        # Convert runtime_env from a JSON-serialized string to a dictionary.
        if "runtime_env_json" in fields:
            runtime_env_json = fields.pop("runtime_env_json")
            if runtime_env_json is not None:
                fields["runtime_env"] = json.loads(runtime_env_json)

        return cls(**fields)
191
+
192
+
193
class JobInfoStorageClient:
    """
    Interface to put and get job data from the Internal KV store.
    """

    # Please keep this format in sync with JobDataKey()
    # in src/ray/gcs/gcs_server/gcs_job_manager.h.
    JOB_DATA_KEY_PREFIX = f"{ray_constants.RAY_INTERNAL_NAMESPACE_PREFIX}job_info_"
    JOB_DATA_KEY = f"{JOB_DATA_KEY_PREFIX}{{job_id}}"

    def __init__(
        self,
        gcs_aio_client: GcsAioClient,
        export_event_log_dir_root: Optional[str] = None,
    ):
        """
        Initialize the JobInfoStorageClient which manages data in the internal KV store.
        Export Submission Job events are written when the KV store is updated if
        the feature flag is on and a export_event_log_dir_root is passed.
        export_event_log_dir_root doesn't need to be passed if the caller
        is not modifying data in the KV store.
        """
        self._gcs_aio_client = gcs_aio_client
        # Stays None unless the export API is enabled and a log dir was given.
        self._export_submission_job_event_logger: Optional[logging.Logger] = None
        try:
            if (
                check_export_api_enabled(ExportEvent.SourceType.EXPORT_SUBMISSION_JOB)
                and export_event_log_dir_root is not None
            ):
                self._export_submission_job_event_logger = get_export_event_logger(
                    ExportEvent.SourceType.EXPORT_SUBMISSION_JOB,
                    export_event_log_dir_root,
                )
        except Exception:
            # Best effort: a broken export logger must not prevent construction.
            logger.exception(
                "Unable to initialize export event logger so no export "
                "events will be written."
            )

    async def put_info(
        self, job_id: str, job_info: JobInfo, overwrite: bool = True
    ) -> bool:
        """Put job info to the internal kv store.

        Args:
            job_id: The job id.
            job_info: The job info.
            overwrite: Whether to overwrite the existing job info.

        Returns:
            True if a new key is added.
        """
        added_num = await self._gcs_aio_client.internal_kv_put(
            self.JOB_DATA_KEY.format(job_id=job_id).encode(),
            json.dumps(job_info.to_json()).encode(),
            overwrite,
            namespace=ray_constants.KV_NAMESPACE_JOB,
        )
        if added_num == 1 or overwrite:
            # Write export event if data was updated in the KV store
            try:
                self._write_submission_job_export_event(job_id, job_info)
            except Exception:
                # Export events are best effort; the KV write already happened.
                logger.exception("Error while writing job submission export event.")
        return added_num == 1

    def _write_submission_job_export_event(
        self, job_id: str, job_info: JobInfo
    ) -> None:
        """
        Write Submission Job export event if _export_submission_job_event_logger
        exists. The logger will exist if the export API feature flag is enabled
        and a log directory was passed to JobInfoStorageClient.
        """
        if not self._export_submission_job_event_logger:
            return

        # Map the Python enum member to the proto enum value by name.
        status_value_descriptor = (
            ExportSubmissionJobEventData.JobStatus.DESCRIPTOR.values_by_name.get(
                job_info.status.name
            )
        )
        if status_value_descriptor is None:
            logger.error(
                f"{job_info.status.name} is not a valid "
                "ExportSubmissionJobEventData.JobStatus enum value. This event "
                "will not be written."
            )
            return
        job_status = status_value_descriptor.number
        submission_event_data = ExportSubmissionJobEventData(
            submission_job_id=job_id,
            status=job_status,
            entrypoint=job_info.entrypoint,
            message=job_info.message,
            metadata=job_info.metadata,
            error_type=job_info.error_type,
            start_time=job_info.start_time,
            end_time=job_info.end_time,
            runtime_env_json=json.dumps(job_info.runtime_env),
            driver_agent_http_address=job_info.driver_agent_http_address,
            driver_node_id=job_info.driver_node_id,
            driver_exit_code=job_info.driver_exit_code,
        )
        self._export_submission_job_event_logger.send_event(submission_event_data)

    async def get_info(self, job_id: str, timeout: int = 30) -> Optional[JobInfo]:
        """Fetch the stored JobInfo for ``job_id``, or None if no entry exists."""
        serialized_info = await self._gcs_aio_client.internal_kv_get(
            self.JOB_DATA_KEY.format(job_id=job_id).encode(),
            namespace=ray_constants.KV_NAMESPACE_JOB,
            timeout=timeout,
        )
        if serialized_info is None:
            return None
        else:
            return JobInfo.from_json(json.loads(serialized_info))

    async def delete_info(self, job_id: str, timeout: int = 30):
        """Delete the stored entry for ``job_id`` from the internal KV store."""
        await self._gcs_aio_client.internal_kv_del(
            self.JOB_DATA_KEY.format(job_id=job_id).encode(),
            False,
            namespace=ray_constants.KV_NAMESPACE_JOB,
            timeout=timeout,
        )

    async def put_status(
        self,
        job_id: str,
        status: JobStatus,
        message: Optional[str] = None,
        driver_exit_code: Optional[int] = None,
        jobinfo_replace_kwargs: Optional[Dict[str, Any]] = None,
    ):
        """Puts or updates job status.  Sets end_time if status is terminal.

        Args:
            job_id: The job id.
            status: The new status.
            message: The new status message.
            driver_exit_code: The new driver exit code.
            jobinfo_replace_kwargs: Extra JobInfo fields to set. ``status``,
                ``message`` and ``driver_exit_code`` always override entries
                of the same name. The passed dictionary is not modified.

        Raises:
            AssertionError: If the stored status is terminal and differs
                from ``status``.
        """

        old_info = await self.get_info(job_id)

        # Copy so the caller's dictionary is never mutated by the update below.
        replace_kwargs = dict(jobinfo_replace_kwargs or {})
        replace_kwargs.update(
            status=status, message=message, driver_exit_code=driver_exit_code
        )
        if old_info is not None:
            if status != old_info.status and old_info.status.is_terminal():
                # Raised explicitly (rather than `assert False`) so the guard
                # is still enforced when Python runs with -O.
                raise AssertionError(
                    "Attempted to change job status from a terminal state."
                )
            new_info = replace(old_info, **replace_kwargs)
        else:
            # No stored entry: create one with a placeholder entrypoint.
            new_info = JobInfo(entrypoint="Entrypoint not found.", **replace_kwargs)

        if status.is_terminal():
            new_info.end_time = int(time.time() * 1000)

        await self.put_info(job_id, new_info)

    async def get_status(self, job_id: str) -> Optional[JobStatus]:
        """Return the stored status for ``job_id``, or None if no entry exists."""
        job_info = await self.get_info(job_id)
        if job_info is None:
            return None
        else:
            return job_info.status

    async def get_all_jobs(self, timeout: int = 30) -> Dict[str, JobInfo]:
        """Return a mapping of job id to the info stored under the job key prefix.

        The per-job lookups run concurrently; each one uses ``timeout``.
        """
        raw_job_ids_with_prefixes = await self._gcs_aio_client.internal_kv_keys(
            self.JOB_DATA_KEY_PREFIX.encode(),
            namespace=ray_constants.KV_NAMESPACE_JOB,
            timeout=timeout,
        )
        job_ids_with_prefixes = [
            job_id.decode() for job_id in raw_job_ids_with_prefixes
        ]
        job_ids = []
        for job_id_with_prefix in job_ids_with_prefixes:
            assert job_id_with_prefix.startswith(
                self.JOB_DATA_KEY_PREFIX
            ), "Unexpected format for internal_kv key for Job submission"
            job_ids.append(job_id_with_prefix[len(self.JOB_DATA_KEY_PREFIX) :])

        async def get_job_info(job_id: str):
            job_info = await self.get_info(job_id, timeout)
            return job_id, job_info

        return {
            job_id: job_info
            for job_id, job_info in await asyncio.gather(
                *[get_job_info(job_id) for job_id in job_ids]
            )
        }
382
+
383
+
384
def uri_to_http_components(package_uri: str) -> Tuple[str, str]:
    """Split a package URI into ``(protocol, package_name)``.

    Args:
        package_uri: URI of a package; it must end in ``.zip`` or ``.whl``.

    Returns:
        The protocol's value and the package name, as produced by
        ``parse_uri``.

    Raises:
        ValueError: If ``package_uri`` has any other suffix.
    """
    extension = Path(package_uri).suffix
    is_supported = extension in {".zip", ".whl"}
    if not is_supported:
        raise ValueError(f"package_uri ({package_uri}) does not end in .zip or .whl")

    # We need to strip the <protocol>:// prefix to make it possible to pass
    # the package_uri over HTTP.
    parsed_protocol, parsed_name = parse_uri(package_uri)
    components = (parsed_protocol.value, parsed_name)
    return components
392
+
393
+
394
def http_uri_components_to_uri(protocol: str, package_name: str) -> str:
    """Join a protocol and a package name as ``<protocol>://<package_name>``."""
    return "{}://{}".format(protocol, package_name)
396
+
397
+
398
def validate_request_type(json_data: Dict[str, Any], request_type: dataclass) -> Any:
    """Instantiate ``request_type`` from the entries of ``json_data``.

    Every key of ``json_data`` is passed as a keyword argument, so any
    error raised by the constructor (or by validation it performs)
    propagates to the caller.
    """
    request = request_type(**json_data)
    return request
400
+
401
+
402
@dataclass
class JobSubmitRequest:
    """Payload of a job submission request; field types are checked on creation."""

    # Command to start execution, ex: "python script.py"
    entrypoint: str
    # Optional submission_id to specify for the job. If the submission_id
    # is not specified, one will be generated. If a job with the same
    # submission_id already exists, it will be rejected.
    submission_id: Optional[str] = None
    # DEPRECATED. Use submission_id instead
    job_id: Optional[str] = None
    # Dict to setup execution environment.
    runtime_env: Optional[Dict[str, Any]] = None
    # Metadata to pass in to the JobConfig.
    metadata: Optional[Dict[str, str]] = None
    # The quantity of CPU cores to reserve for the execution
    # of the entrypoint command, separately from any Ray tasks or actors
    # that are created by it.
    entrypoint_num_cpus: Optional[Union[int, float]] = None
    # The quantity of GPUs to reserve for the execution
    # of the entrypoint command, separately from any Ray tasks or actors
    # that are created by it.
    entrypoint_num_gpus: Optional[Union[int, float]] = None
    # The amount of total available memory for workers requesting memory
    # for the execution of the entrypoint command, separately from any Ray
    # tasks or actors that are created by it.
    entrypoint_memory: Optional[int] = None
    # The quantity of various custom resources
    # to reserve for the entrypoint command, separately from any Ray tasks
    # or actors that are created by it.
    entrypoint_resources: Optional[Dict[str, float]] = None

    def __post_init__(self):
        """Validate field types, raising TypeError on the first violation.

        Fields are checked in declaration order; for dict fields all keys
        are checked before any value.
        """
        if not isinstance(self.entrypoint, str):
            raise TypeError(f"entrypoint must be a string, got {type(self.entrypoint)}")

        # Optional identifiers: must be strings when present.
        for id_field in ("submission_id", "job_id"):
            id_value = getattr(self, id_field)
            if id_value is not None and not isinstance(id_value, str):
                raise TypeError(
                    f"{id_field} must be a string if provided, got {type(id_value)}"
                )

        env = self.runtime_env
        if env is not None:
            if not isinstance(env, dict):
                raise TypeError(f"runtime_env must be a dict, got {type(env)}")
            for env_key in env:
                if not isinstance(env_key, str):
                    raise TypeError(
                        f"runtime_env keys must be strings, got {type(env_key)}"
                    )

        meta = self.metadata
        if meta is not None:
            if not isinstance(meta, dict):
                raise TypeError(f"metadata must be a dict, got {type(meta)}")
            for meta_key in meta:
                if not isinstance(meta_key, str):
                    raise TypeError(
                        f"metadata keys must be strings, got {type(meta_key)}"
                    )
            for meta_value in meta.values():
                if not isinstance(meta_value, str):
                    raise TypeError(
                        f"metadata values must be strings, got {type(meta_value)}"
                    )

        # CPU and GPU quantities: any int or float is accepted.
        for quantity_field in ("entrypoint_num_cpus", "entrypoint_num_gpus"):
            quantity = getattr(self, quantity_field)
            if quantity is not None and not isinstance(quantity, (int, float)):
                raise TypeError(
                    f"{quantity_field} must be a number, got {type(quantity)}"
                )

        memory = self.entrypoint_memory
        if memory is not None and not isinstance(memory, int):
            raise TypeError(f"entrypoint_memory must be an integer, got {type(memory)}")

        resources = self.entrypoint_resources
        if resources is not None:
            if not isinstance(resources, dict):
                raise TypeError(
                    f"entrypoint_resources must be a dict, got {type(resources)}"
                )
            for resource_name in resources:
                if not isinstance(resource_name, str):
                    raise TypeError(
                        "entrypoint_resources keys must be strings, "
                        f"got {type(resource_name)}"
                    )
            for resource_amount in resources.values():
                if not isinstance(resource_amount, (int, float)):
                    raise TypeError(
                        "entrypoint_resources values must be numbers, "
                        f"got {type(resource_amount)}"
                    )
516
+
517
+
518
@dataclass
class JobSubmitResponse:
    """Response body carrying the identifiers of a submitted job."""

    # DEPRECATED: Use submission_id instead.
    job_id: str
    # Identifier of the submission; the replacement for the deprecated job_id.
    submission_id: str
523
+
524
+
525
@dataclass
class JobStopResponse:
    """Response body for a job stop request."""

    # Boolean outcome of the stop request.
    # NOTE(review): presumably True when the job was actually stopped —
    # confirm against the code that builds this response.
    stopped: bool
528
+
529
+
530
@dataclass
class JobDeleteResponse:
    """Response body for a job delete request."""

    # Boolean outcome of the delete request.
    # NOTE(review): presumably True when the job was actually deleted —
    # confirm against the code that builds this response.
    deleted: bool
533
+
534
+
535
+ # TODO(jiaodong): Support log streaming #19415
536
@dataclass
class JobLogsResponse:
    """Response body carrying a job's logs as a single string."""

    # The log text.
    logs: str
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_agent.py ADDED
@@ -0,0 +1,211 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import json
3
+ import logging
4
+ import traceback
5
+
6
+ import aiohttp
7
+ from aiohttp.web import Request, Response
8
+
9
+ import ray
10
+ import ray.dashboard.optional_utils as optional_utils
11
+ import ray.dashboard.utils as dashboard_utils
12
+ from ray.dashboard.modules.job.common import (
13
+ JobDeleteResponse,
14
+ JobLogsResponse,
15
+ JobStopResponse,
16
+ JobSubmitRequest,
17
+ JobSubmitResponse,
18
+ )
19
+ from ray.dashboard.modules.job.job_manager import JobManager
20
+ from ray.dashboard.modules.job.pydantic_models import JobType
21
+ from ray.dashboard.modules.job.utils import find_job_by_ids, parse_and_validate_request
22
+
23
+ routes = optional_utils.DashboardAgentRouteTable
24
+ logger = logging.getLogger(__name__)
25
+
26
+
27
class JobAgent(dashboard_utils.DashboardAgentModule):
    """Dashboard agent module serving the ``/api/job_agent`` REST endpoints.

    Each handler delegates the actual work to a ``JobManager`` that is created
    lazily by ``get_job_manager`` and answers with an aiohttp ``Response``
    (or a websocket response for log tailing).
    """

    def __init__(self, dashboard_agent):
        super().__init__(dashboard_agent)
        # Created on first use by get_job_manager().
        self._job_manager = None

    async def _find_submission_job(self, req: Request, action: str):
        """Look up the job named in the URL and check it is a submission job.

        Args:
            req: Request whose ``job_or_submission_id`` path parameter
                identifies the job.
            action: Verb phrase spliced into the 400 error message
                ("Can only <action> submission type jobs").

        Returns:
            A ``(job, error)`` pair in which exactly one element is not
            ``None``: the job on success, otherwise the ``Response`` to send
            back (404 when the job is unknown, 400 when it is not a
            submission job).
        """
        job_or_submission_id = req.match_info["job_or_submission_id"]
        job = await find_job_by_ids(
            self._dashboard_agent.gcs_aio_client,
            self.get_job_manager().job_info_client(),
            job_or_submission_id,
        )
        if not job:
            return None, Response(
                text=f"Job {job_or_submission_id} does not exist",
                status=aiohttp.web.HTTPNotFound.status_code,
            )
        if job.type is not JobType.SUBMISSION:
            return None, Response(
                text=f"Can only {action} submission type jobs",
                status=aiohttp.web.HTTPBadRequest.status_code,
            )
        return job, None

    @routes.post("/api/job_agent/jobs/")
    @optional_utils.deny_browser_requests()
    @optional_utils.init_ray_and_catch_exceptions()
    async def submit_job(self, req: Request) -> Response:
        """Submit a new job described by a ``JobSubmitRequest`` body.

        Returns 200 with a JSON ``JobSubmitResponse`` on success, 400 when the
        job manager raises ``TypeError``/``ValueError`` and 500 (with the
        traceback as text) for any other exception.
        """
        result = await parse_and_validate_request(req, JobSubmitRequest)
        # Request parsing failed, returned with Response object.
        if isinstance(result, Response):
            return result
        submit_request = result

        # Prefer the submission id; fall back to the deprecated job_id field.
        request_submission_id = submit_request.submission_id or submit_request.job_id
        try:
            ray._private.usage.usage_lib.record_library_usage("job_submission")
            submission_id = await self.get_job_manager().submit_job(
                entrypoint=submit_request.entrypoint,
                submission_id=request_submission_id,
                runtime_env=submit_request.runtime_env,
                metadata=submit_request.metadata,
                entrypoint_num_cpus=submit_request.entrypoint_num_cpus,
                entrypoint_num_gpus=submit_request.entrypoint_num_gpus,
                entrypoint_memory=submit_request.entrypoint_memory,
                entrypoint_resources=submit_request.entrypoint_resources,
            )

            resp = JobSubmitResponse(job_id=submission_id, submission_id=submission_id)
        except (TypeError, ValueError):
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPBadRequest.status_code,
            )
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        return Response(
            text=json.dumps(dataclasses.asdict(resp)),
            content_type="application/json",
            status=aiohttp.web.HTTPOk.status_code,
        )

    @routes.post("/api/job_agent/jobs/{job_or_submission_id}/stop")
    @optional_utils.deny_browser_requests()
    @optional_utils.init_ray_and_catch_exceptions()
    async def stop_job(self, req: Request) -> Response:
        """Stop a submission job; replies with a JSON ``JobStopResponse``."""
        job, error = await self._find_submission_job(req, "stop")
        if error is not None:
            return error

        try:
            stopped = self.get_job_manager().stop_job(job.submission_id)
            resp = JobStopResponse(stopped=stopped)
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        return Response(
            text=json.dumps(dataclasses.asdict(resp)), content_type="application/json"
        )

    @routes.delete("/api/job_agent/jobs/{job_or_submission_id}")
    @optional_utils.init_ray_and_catch_exceptions()
    async def delete_job(self, req: Request) -> Response:
        """Delete a submission job; replies with a JSON ``JobDeleteResponse``."""
        job, error = await self._find_submission_job(req, "delete")
        if error is not None:
            return error

        try:
            deleted = await self.get_job_manager().delete_job(job.submission_id)
            resp = JobDeleteResponse(deleted=deleted)
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        return Response(
            text=json.dumps(dataclasses.asdict(resp)), content_type="application/json"
        )

    @routes.get("/api/job_agent/jobs/{job_or_submission_id}/logs")
    @optional_utils.init_ray_and_catch_exceptions()
    async def get_job_logs(self, req: Request) -> Response:
        """Return the logs of a submission job as a JSON ``JobLogsResponse``."""
        job, error = await self._find_submission_job(req, "get logs of")
        if error is not None:
            return error

        resp = JobLogsResponse(
            logs=self.get_job_manager().get_job_logs(job.submission_id)
        )
        return Response(
            text=json.dumps(dataclasses.asdict(resp)), content_type="application/json"
        )

    @routes.get("/api/job_agent/jobs/{job_or_submission_id}/logs/tail")
    @optional_utils.init_ray_and_catch_exceptions()
    async def tail_job_logs(self, req: Request) -> Response:
        """Stream the logs of a submission job over a websocket."""
        job, error = await self._find_submission_job(req, "get logs of")
        if error is not None:
            return error

        ws = aiohttp.web.WebSocketResponse()
        await ws.prepare(req)

        # Go through the lazy accessor like every other handler instead of
        # relying on an earlier call having populated self._job_manager.
        async for lines in self.get_job_manager().tail_job_logs(job.submission_id):
            await ws.send_str(lines)

        return ws

    def get_job_manager(self):
        """Return the ``JobManager``, constructing it on first use."""
        if not self._job_manager:
            self._job_manager = JobManager(
                self._dashboard_agent.gcs_aio_client, self._dashboard_agent.log_dir
            )
        return self._job_manager

    async def run(self, server):
        pass

    @staticmethod
    def is_minimal_module():
        return False
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_head.py ADDED
@@ -0,0 +1,587 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import dataclasses
3
+ import json
4
+ import logging
5
+ import traceback
6
+ from random import sample
7
+ from typing import AsyncIterator, List, Optional
8
+
9
+ import aiohttp.web
10
+ from aiohttp.client import ClientResponse
11
+ from aiohttp.web import Request, Response
12
+
13
+ import ray
14
+ import ray.dashboard.consts as dashboard_consts
15
+ import ray.dashboard.optional_utils as optional_utils
16
+ import ray.dashboard.utils as dashboard_utils
17
+ from ray._private.ray_constants import env_bool
18
+ from ray._private.runtime_env.packaging import (
19
+ package_exists,
20
+ pin_runtime_env_uri,
21
+ upload_package_to_gcs,
22
+ )
23
+ from ray._private.utils import get_or_create_event_loop
24
+ from ray.dashboard.datacenter import DataOrganizer
25
+ from ray.dashboard.modules.job.common import (
26
+ JobDeleteResponse,
27
+ JobInfoStorageClient,
28
+ JobLogsResponse,
29
+ JobStopResponse,
30
+ JobSubmitRequest,
31
+ JobSubmitResponse,
32
+ http_uri_components_to_uri,
33
+ )
34
+ from ray.dashboard.modules.job.pydantic_models import JobDetails, JobType
35
+ from ray.dashboard.modules.job.utils import (
36
+ find_job_by_ids,
37
+ get_driver_jobs,
38
+ get_head_node_id,
39
+ parse_and_validate_request,
40
+ )
41
+ from ray.dashboard.modules.version import CURRENT_VERSION, VersionResponse
42
+
43
+ logger = logging.getLogger(__name__)
44
+ logger.setLevel(logging.INFO)
45
+
46
+ routes = optional_utils.DashboardHeadRouteTable
47
+
48
+ # Feature flag controlling whether critical Ray Job control operations are performed
49
+ # exclusively by the Job Agent running on the Head node (otherwise, by a randomly sampled agent)
50
+ #
51
+ # NOTE: This flag serves as a temporary kill-switch and should be eventually cleaned up
52
+ RAY_JOB_AGENT_USE_HEAD_NODE_ONLY = env_bool("RAY_JOB_AGENT_USE_HEAD_NODE_ONLY", True)
53
+
54
+
55
class JobAgentSubmissionClient:
    """A local client for submitting and interacting with jobs on a specific node
    in the remote cluster.

    Submits requests over HTTP to the job agent on the specific node using the
    REST API. All requests share one ``aiohttp.ClientSession`` that is released
    by ``close``.
    """

    def __init__(
        self,
        dashboard_agent_address: str,
    ):
        # Base URL of the agent; endpoint paths are appended to it verbatim.
        self._agent_address = dashboard_agent_address
        self._session = aiohttp.ClientSession()

    async def _raise_error(self, resp: ClientResponse):
        """Raise ``RuntimeError`` carrying the status code and body of ``resp``."""
        status = resp.status
        error_text = await resp.text()
        raise RuntimeError(f"Request failed with status code {status}: {error_text}.")

    async def submit_job_internal(self, req: JobSubmitRequest) -> JobSubmitResponse:
        """POST the submit request to the agent.

        Raises:
            RuntimeError: If the agent answers with a status other than 200.
        """
        logger.debug(f"Submitting job with submission_id={req.submission_id}.")

        async with self._session.post(
            f"{self._agent_address}/api/job_agent/jobs/", json=dataclasses.asdict(req)
        ) as resp:
            if resp.status == 200:
                result_json = await resp.json()
                return JobSubmitResponse(**result_json)
            else:
                await self._raise_error(resp)

    async def stop_job_internal(self, job_id: str) -> JobStopResponse:
        """Ask the agent to stop ``job_id``.

        Raises:
            RuntimeError: If the agent answers with a status other than 200.
        """
        logger.debug(f"Stopping job with job_id={job_id}.")

        async with self._session.post(
            f"{self._agent_address}/api/job_agent/jobs/{job_id}/stop"
        ) as resp:
            if resp.status == 200:
                result_json = await resp.json()
                return JobStopResponse(**result_json)
            else:
                await self._raise_error(resp)

    async def delete_job_internal(self, job_id: str) -> JobDeleteResponse:
        """Ask the agent to delete ``job_id``.

        Raises:
            RuntimeError: If the agent answers with a status other than 200.
        """
        logger.debug(f"Deleting job with job_id={job_id}.")

        async with self._session.delete(
            f"{self._agent_address}/api/job_agent/jobs/{job_id}"
        ) as resp:
            if resp.status == 200:
                result_json = await resp.json()
                return JobDeleteResponse(**result_json)
            else:
                await self._raise_error(resp)

    async def get_job_logs_internal(self, job_id: str) -> JobLogsResponse:
        """Fetch the logs of ``job_id`` from the agent.

        Raises:
            RuntimeError: If the agent answers with a status other than 200.
        """
        async with self._session.get(
            f"{self._agent_address}/api/job_agent/jobs/{job_id}/logs"
        ) as resp:
            if resp.status == 200:
                result_json = await resp.json()
                return JobLogsResponse(**result_json)
            else:
                await self._raise_error(resp)

    async def tail_job_logs(self, job_id: str) -> AsyncIterator[str]:
        """Get an iterator that follows the logs of a job.

        Yields the payload of every TEXT websocket message until the
        connection reports CLOSED.
        """
        ws = await self._session.ws_connect(
            f"{self._agent_address}/api/job_agent/jobs/{job_id}/logs/tail"
        )

        try:
            while True:
                msg = await ws.receive()

                if msg.type == aiohttp.WSMsgType.TEXT:
                    yield msg.data
                elif msg.type == aiohttp.WSMsgType.CLOSED:
                    break
                elif msg.type == aiohttp.WSMsgType.ERROR:
                    # Keep reading; the loop ends once CLOSED is reported.
                    pass
        finally:
            # Release the websocket even when the consumer stops iterating
            # early or an exception propagates out of the loop.
            await ws.close()

    async def close(self, ignore_error=True):
        """Close the underlying HTTP session.

        Args:
            ignore_error: When true (the default), exceptions raised while
                closing are swallowed; otherwise they are re-raised.
        """
        try:
            await self._session.close()
        except Exception:
            if not ignore_error:
                raise
141
+
142
+
143
class JobHead(dashboard_utils.DashboardHeadModule):
    """Runs on the head node of a Ray cluster and handles Ray Jobs APIs.

    NOTE(architkulkarni): Please keep this class in sync with the OpenAPI spec at
    `doc/source/cluster/running-applications/job-submission/openapi.yml`.
    We currently do not automatically check that the OpenAPI
    spec is in sync with the implementation. If any changes are made to the
    paths in the @route decorators or in the Responses returned by the
    methods (or any nested fields in the Responses), you will need to find the
    corresponding field of the OpenAPI yaml file and update it manually. Also,
    bump the version number in the yaml file and in this class's `get_version`.
    """

    # Time that we sleep while tailing logs while waiting for
    # the supervisor actor to start. We don't know which node
    # to read the logs from until then.
    WAIT_FOR_SUPERVISOR_ACTOR_INTERVAL_S = 1

    def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig):
        super().__init__(config)
        # Populated in run().
        self._job_info_client = None

        # It contains all `JobAgentSubmissionClient` that
        # `JobHead` has ever used, and will not be deleted
        # from it unless `JobAgentSubmissionClient` is no
        # longer available (the corresponding agent process is dead)
        self._agents = dict()

    async def get_target_agent(self) -> Optional[JobAgentSubmissionClient]:
        """Return the agent client that should serve a job control request.

        Uses the head node's agent when ``RAY_JOB_AGENT_USE_HEAD_NODE_ONLY`` is
        set, otherwise a randomly picked agent.
        """
        if RAY_JOB_AGENT_USE_HEAD_NODE_ONLY:
            return await self._get_head_node_agent()

        return await self._pick_random_agent()

    async def _acquire_target_agent(self) -> JobAgentSubmissionClient:
        """Wait (bounded by ``WAIT_AVAILABLE_AGENT_TIMEOUT``) for a target agent.

        Raises:
            asyncio.TimeoutError: If no agent could be resolved in time.
            RuntimeError: If agent resolution completed but produced no client.
        """
        job_agent_client = await asyncio.wait_for(
            self.get_target_agent(),
            timeout=dashboard_consts.WAIT_AVAILABLE_AGENT_TIMEOUT,
        )
        if job_agent_client is None:
            # Fail with a readable error instead of an AttributeError on None.
            raise RuntimeError("No job agent is currently available.")
        return job_agent_client

    async def _pick_random_agent(self) -> Optional[JobAgentSubmissionClient]:
        """
        Try to disperse requests as much as possible across up to
        `CANDIDATE_AGENT_NUMBER` agents. Agents are only removed from
        `self._agents` once they are dead; `self._agents` holds the agents
        that were used before.

        Strategy:
            1. if the number of `self._agents` has reached
               `CANDIDATE_AGENT_NUMBER`, randomly select one agent from
               `self._agents`.
            2. if not, randomly select one agent from all available agents,
               it is possible that the selected one already exists in
               `self._agents`.
        """
        # NOTE: Following call will block until there's at least 1 agent info
        # being populated from GCS
        agent_infos = await self._fetch_agent_infos()

        # delete dead agents.
        for dead_node in set(self._agents) - set(agent_infos):
            client = self._agents.pop(dead_node)
            await client.close()

        if len(self._agents) >= dashboard_consts.CANDIDATE_AGENT_NUMBER:
            node_id = sample(list(set(self._agents)), 1)[0]
            return self._agents[node_id]
        else:
            # Randomly select one from among all agents, it is possible that
            # the selected one already exists in `self._agents`
            node_id = sample(sorted(agent_infos), 1)[0]
            agent_info = agent_infos[node_id]

            if node_id not in self._agents:
                node_ip = agent_info["ipAddress"]
                http_port = agent_info["httpPort"]
                agent_http_address = f"http://{node_ip}:{http_port}"
                self._agents[node_id] = JobAgentSubmissionClient(agent_http_address)

            return self._agents[node_id]

    async def _get_head_node_agent(self) -> Optional[JobAgentSubmissionClient]:
        """Retrieves HTTP client for `JobAgent` running on the Head node"""

        head_node_id = await get_head_node_id(self.gcs_aio_client)

        if not head_node_id:
            logger.warning("Head node id has not yet been persisted in GCS")
            return None

        if head_node_id not in self._agents:
            agent_infos = await self._fetch_agent_infos(target_node_ids=[head_node_id])
            if head_node_id not in agent_infos:
                logger.error("Head node agent's information was not found")
                return None

            agent_info = agent_infos[head_node_id]

            node_ip = agent_info["ipAddress"]
            http_port = agent_info["httpPort"]
            agent_http_address = f"http://{node_ip}:{http_port}"

            self._agents[head_node_id] = JobAgentSubmissionClient(agent_http_address)

        return self._agents[head_node_id]

    @staticmethod
    async def _fetch_agent_infos(target_node_ids: Optional[List[str]] = None):
        """Fetches agent infos for nodes identified by provided node-ids (for all
        nodes if not provided)

        NOTE: This call will block until there's at least 1 valid agent info populated
        """

        while True:
            raw_agent_infos = await DataOrganizer.get_agent_infos(target_node_ids)
            # Filter out invalid agent infos with unset HTTP port
            agent_infos = {
                key: value
                for key, value in raw_agent_infos.items()
                if value.get("httpPort", -1) > 0
            }

            if len(agent_infos) > 0:
                return agent_infos

            await asyncio.sleep(dashboard_consts.TRY_TO_GET_AGENT_INFO_INTERVAL_SECONDS)

    async def _find_submission_job(self, req: Request, action: str):
        """Look up the job named in the URL and check it is a submission job.

        Args:
            req: Request whose ``job_or_submission_id`` path parameter
                identifies the job.
            action: Verb phrase spliced into the 400 error message
                ("Can only <action> submission type jobs").

        Returns:
            A ``(job, error)`` pair in which exactly one element is not
            ``None``: the job on success, otherwise the ``Response`` to send
            back (404 when the job is unknown, 400 when it is not a
            submission job).
        """
        job_or_submission_id = req.match_info["job_or_submission_id"]
        job = await find_job_by_ids(
            self.gcs_aio_client,
            self._job_info_client,
            job_or_submission_id,
        )
        if not job:
            return None, Response(
                text=f"Job {job_or_submission_id} does not exist",
                status=aiohttp.web.HTTPNotFound.status_code,
            )
        if job.type is not JobType.SUBMISSION:
            return None, Response(
                text=f"Can only {action} submission type jobs",
                status=aiohttp.web.HTTPBadRequest.status_code,
            )
        return job, None

    @routes.get("/api/version")
    async def get_version(self, req: Request) -> Response:
        """Return API version, Ray version/commit and session name as JSON."""
        # NOTE(edoakes): CURRENT_VERSION should be bumped and checked on the
        # client when we have backwards-incompatible changes.
        resp = VersionResponse(
            version=CURRENT_VERSION,
            ray_version=ray.__version__,
            ray_commit=ray.__commit__,
            session_name=self.session_name,
        )
        return Response(
            text=json.dumps(dataclasses.asdict(resp)),
            content_type="application/json",
            status=aiohttp.web.HTTPOk.status_code,
        )

    @routes.get("/api/packages/{protocol}/{package_name}")
    async def get_package(self, req: Request) -> Response:
        """Pin the package URI and report whether the package exists.

        Replies with an empty 200 when it exists, 404 when it does not and 500
        (traceback as text) when pinning fails.
        """
        package_uri = http_uri_components_to_uri(
            protocol=req.match_info["protocol"],
            package_name=req.match_info["package_name"],
        )

        logger.debug(f"Adding temporary reference to package {package_uri}.")
        try:
            pin_runtime_env_uri(package_uri)
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        if not package_exists(package_uri):
            return Response(
                text=f"Package {package_uri} does not exist",
                status=aiohttp.web.HTTPNotFound.status_code,
            )

        return Response()

    @routes.put("/api/packages/{protocol}/{package_name}")
    async def upload_package(self, req: Request):
        """Read the request body and upload it to the GCS as a package."""
        package_uri = http_uri_components_to_uri(
            protocol=req.match_info["protocol"],
            package_name=req.match_info["package_name"],
        )
        logger.info(f"Uploading package {package_uri} to the GCS.")
        try:
            data = await req.read()
            # upload_package_to_gcs is run in the loop's default executor so
            # the event loop is not blocked while it executes.
            await get_or_create_event_loop().run_in_executor(
                None,
                upload_package_to_gcs,
                package_uri,
                data,
            )
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        return Response(status=aiohttp.web.HTTPOk.status_code)

    @routes.post("/api/jobs/")
    async def submit_job(self, req: Request) -> Response:
        """Forward a job submission to the target job agent.

        Replies 200 with a JSON ``JobSubmitResponse``; 500 when no agent became
        available in time or on unexpected errors; 400 on
        ``TypeError``/``ValueError``.
        """
        result = await parse_and_validate_request(req, JobSubmitRequest)
        # Request parsing failed, returned with Response object.
        if isinstance(result, Response):
            return result
        submit_request: JobSubmitRequest = result

        try:
            job_agent_client = await self._acquire_target_agent()
            resp = await job_agent_client.submit_job_internal(submit_request)
        except asyncio.TimeoutError:
            return Response(
                text="No available agent to submit job, please try again later.",
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )
        except (TypeError, ValueError):
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPBadRequest.status_code,
            )
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        return Response(
            text=json.dumps(dataclasses.asdict(resp)),
            content_type="application/json",
            status=aiohttp.web.HTTPOk.status_code,
        )

    @routes.post("/api/jobs/{job_or_submission_id}/stop")
    async def stop_job(self, req: Request) -> Response:
        """Stop a submission job through the target job agent."""
        job, error = await self._find_submission_job(req, "stop")
        if error is not None:
            return error

        try:
            job_agent_client = await self._acquire_target_agent()
            resp = await job_agent_client.stop_job_internal(job.submission_id)
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        return Response(
            text=json.dumps(dataclasses.asdict(resp)), content_type="application/json"
        )

    @routes.delete("/api/jobs/{job_or_submission_id}")
    async def delete_job(self, req: Request) -> Response:
        """Delete a submission job through the target job agent."""
        job, error = await self._find_submission_job(req, "delete")
        if error is not None:
            return error

        try:
            job_agent_client = await self._acquire_target_agent()
            resp = await job_agent_client.delete_job_internal(job.submission_id)
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

        return Response(
            text=json.dumps(dataclasses.asdict(resp)), content_type="application/json"
        )

    @routes.get("/api/jobs/{job_or_submission_id}")
    async def get_job_info(self, req: Request) -> Response:
        """Return the details of one job (of any type) as JSON, or 404."""
        job_or_submission_id = req.match_info["job_or_submission_id"]
        job = await find_job_by_ids(
            self.gcs_aio_client,
            self._job_info_client,
            job_or_submission_id,
        )
        if not job:
            return Response(
                text=f"Job {job_or_submission_id} does not exist",
                status=aiohttp.web.HTTPNotFound.status_code,
            )

        return Response(
            text=json.dumps(job.dict()),
            content_type="application/json",
        )

    # TODO(rickyx): This endpoint's logic is also mirrored in state API's endpoint.
    # We should eventually unify the backend logic (and keep the logic in sync before
    # that).
    @routes.get("/api/jobs/")
    async def list_jobs(self, req: Request) -> Response:
        """Return all submission jobs followed by all driver jobs as a JSON list."""
        (driver_jobs, submission_job_drivers), submission_jobs = await asyncio.gather(
            get_driver_jobs(self.gcs_aio_client), self._job_info_client.get_all_jobs()
        )

        submission_jobs = [
            JobDetails(
                **dataclasses.asdict(job),
                submission_id=submission_id,
                job_id=submission_job_drivers.get(submission_id).id
                if submission_id in submission_job_drivers
                else None,
                driver_info=submission_job_drivers.get(submission_id),
                type=JobType.SUBMISSION,
            )
            for submission_id, job in submission_jobs.items()
        ]
        return Response(
            text=json.dumps(
                [
                    *[submission_job.dict() for submission_job in submission_jobs],
                    *[job_info.dict() for job_info in driver_jobs.values()],
                ]
            ),
            content_type="application/json",
        )

    @routes.get("/api/jobs/{job_or_submission_id}/logs")
    async def get_job_logs(self, req: Request) -> Response:
        """Return the logs of a submission job as a JSON ``JobLogsResponse``.

        The logs are fetched from the agent recorded as the job's driver
        agent; when the job has no driver agent address the logs are empty.
        """
        job, error = await self._find_submission_job(req, "get logs of")
        if error is not None:
            return error

        try:
            job_agent_client = self.get_job_driver_agent_client(job)
            payload = (
                await job_agent_client.get_job_logs_internal(job.submission_id)
                if job_agent_client
                else JobLogsResponse("")
            )
            return Response(
                text=json.dumps(dataclasses.asdict(payload)),
                content_type="application/json",
            )
        except Exception:
            return Response(
                text=traceback.format_exc(),
                status=aiohttp.web.HTTPInternalServerError.status_code,
            )

    @routes.get("/api/jobs/{job_or_submission_id}/logs/tail")
    async def tail_job_logs(self, req: Request) -> Response:
        """Stream the logs of a submission job over a websocket.

        Polls the job until its driver agent address is known, then relays
        every chunk produced by that agent's tail endpoint.
        """
        job, error = await self._find_submission_job(req, "get logs of")
        if error is not None:
            return error

        job_or_submission_id = req.match_info["job_or_submission_id"]

        ws = aiohttp.web.WebSocketResponse()
        await ws.prepare(req)

        driver_agent_http_address = None
        while driver_agent_http_address is None:
            job = await find_job_by_ids(
                self.gcs_aio_client,
                self._job_info_client,
                job_or_submission_id,
            )
            if not job:
                # The job vanished while we were waiting; nothing to stream.
                return ws

            driver_agent_http_address = job.driver_agent_http_address
            if driver_agent_http_address is not None:
                # The address is known, so there is nothing left to wait for.
                break

            if job.status.is_terminal():
                # Job exited before supervisor actor started.
                return ws

            await asyncio.sleep(self.WAIT_FOR_SUPERVISOR_ACTOR_INTERVAL_S)

        job_agent_client = self.get_job_driver_agent_client(job)

        async for lines in job_agent_client.tail_job_logs(job.submission_id):
            await ws.send_str(lines)

        return ws

    def get_job_driver_agent_client(
        self, job: JobDetails
    ) -> Optional[JobAgentSubmissionClient]:
        """Return the (cached) client for the agent recorded as the job's driver agent.

        Returns ``None`` when the job has no driver agent address.
        """
        if job.driver_agent_http_address is None:
            return None

        driver_node_id = job.driver_node_id
        if driver_node_id not in self._agents:
            self._agents[driver_node_id] = JobAgentSubmissionClient(
                job.driver_agent_http_address
            )

        return self._agents[driver_node_id]

    async def run(self, server):
        # Create the storage client once; later calls keep the existing one.
        if not self._job_info_client:
            self._job_info_client = JobInfoStorageClient(self.gcs_aio_client)

    @staticmethod
    def is_minimal_module():
        return False
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_log_storage_client.py ADDED
@@ -0,0 +1,61 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ from collections import deque
3
+ from typing import AsyncIterator, List, Tuple
4
+
5
+ import ray
6
+ from ray.dashboard.modules.job.common import JOB_LOGS_PATH_TEMPLATE
7
+ from ray.dashboard.modules.job.utils import file_tail_iterator
8
+
9
+
10
class JobLogStorageClient:
    """
    Disk storage for stdout / stderr of driver script logs.
    """

    # Number of last N lines to put in job message upon failure.
    NUM_LOG_LINES_ON_ERROR = 10
    # Maximum number of characters to print out of the logs to avoid
    # HUGE log outputs that bring down the api server
    MAX_LOG_SIZE = 20000

    def get_logs(self, job_id: str) -> str:
        """Return the whole log file of ``job_id``, or "" if it does not exist."""
        try:
            with open(self.get_log_file_path(job_id), "r") as f:
                return f.read()
        except FileNotFoundError:
            return ""

    def tail_logs(self, job_id: str) -> AsyncIterator[List[str]]:
        """Return an async iterator over batches of lines of the job's log file."""
        return file_tail_iterator(self.get_log_file_path(job_id))

    async def get_last_n_log_lines(
        self, job_id: str, num_log_lines: int = NUM_LOG_LINES_ON_ERROR
    ) -> str:
        """
        Returns the last MAX_LOG_SIZE (20000) characters in the last
        `num_log_lines` lines.

        Args:
            job_id: The id of the job whose logs we want to return
            num_log_lines: The number of lines to return.
        """
        # A bounded deque keeps only the newest `num_log_lines` lines.
        log_tail_deque = deque(maxlen=num_log_lines)
        async for lines in self.tail_logs(job_id):
            if lines is None:
                # A None batch ends the read.
                break
            # The iterator can return batches of lines at a time.
            log_tail_deque.extend(lines)

        return "".join(log_tail_deque)[-self.MAX_LOG_SIZE :]

    def get_log_file_path(self, job_id: str) -> str:
        """
        Get the file path to the logs of a given job. Example:
            /tmp/ray/session_date/logs/job-driver-{job_id}.log
        """
        return os.path.join(
            ray._private.worker._global_node.get_logs_dir_path(),
            JOB_LOGS_PATH_TEMPLATE.format(submission_id=job_id),
        )
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_manager.py ADDED
@@ -0,0 +1,640 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import copy
3
+ import logging
4
+ import os
5
+ import random
6
+ import string
7
+ import time
8
+ import traceback
9
+ from typing import Any, AsyncIterator, Dict, Optional, Union
10
+
11
+ import ray
12
+ import ray._private.ray_constants as ray_constants
13
+ from ray._private.event.event_logger import get_event_logger
14
+ from ray._private.gcs_utils import GcsAioClient
15
+ from ray._private.utils import run_background_task
16
+ from ray.actor import ActorHandle
17
+ from ray.core.generated.event_pb2 import Event
18
+ from ray.dashboard.consts import (
19
+ DEFAULT_JOB_START_TIMEOUT_SECONDS,
20
+ RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES_ENV_VAR,
21
+ RAY_JOB_START_TIMEOUT_SECONDS_ENV_VAR,
22
+ RAY_STREAM_RUNTIME_ENV_LOG_TO_JOB_DRIVER_LOG_ENV_VAR,
23
+ )
24
+ from ray.dashboard.modules.job.common import (
25
+ JOB_ACTOR_NAME_TEMPLATE,
26
+ SUPERVISOR_ACTOR_RAY_NAMESPACE,
27
+ JobInfo,
28
+ JobInfoStorageClient,
29
+ )
30
+ from ray.dashboard.modules.job.job_log_storage_client import JobLogStorageClient
31
+ from ray.dashboard.modules.job.job_supervisor import JobSupervisor
32
+ from ray.dashboard.modules.job.utils import get_head_node_id
33
+ from ray.dashboard.utils import close_logger_file_descriptor
34
+ from ray.exceptions import ActorUnschedulableError, RuntimeEnvSetupError
35
+ from ray.job_submission import JobStatus
36
+ from ray.runtime_env import RuntimeEnvConfig
37
+ from ray.util.scheduling_strategies import (
38
+ NodeAffinitySchedulingStrategy,
39
+ SchedulingStrategyT,
40
+ )
41
+
42
+ logger = logging.getLogger(__name__)
43
+
44
+
45
def generate_job_id() -> str:
    """Build a new submission ID shaped like 'raysubmit_XYZ'.

    The 'raysubmit' prefix keeps these IDs from being confused with the
    Ray JobID (driver ID). The suffix is 16 characters drawn with
    `random.SystemRandom` from ASCII letters and digits, minus characters
    that are easy to misread.
    """
    # No confusing characters
    lookalikes = {"I", "l", "o", "O", "0"}
    alphabet = [
        char
        for char in set(string.ascii_letters + string.digits)
        if char not in lookalikes
    ]
    suffix = "".join(random.SystemRandom().choices(alphabet, k=16))
    return "raysubmit_" + suffix
57
+
58
+
59
class JobManager:
    """Provide python APIs for job submission and management.

    It does not provide persistence, all info will be lost if the cluster
    goes down.
    """

    # Time that we will sleep while tailing logs if no new log line is
    # available.
    LOG_TAIL_SLEEP_S = 1
    # Time between two health checks of a job in the monitor loop.
    JOB_MONITOR_LOOP_PERIOD_S = 1
    WAIT_FOR_ACTOR_DEATH_TIMEOUT_S = 0.1

    def __init__(self, gcs_aio_client: GcsAioClient, logs_dir: str):
        """Set up storage clients and kick off recovery of existing jobs.

        Args:
            gcs_aio_client: Client used for job info storage; its address and
                cluster id are also forwarded to every supervisor actor.
            logs_dir: Directory handed to the job info client, the event
                logger and the supervisor actors.
        """
        self._gcs_aio_client = gcs_aio_client
        self._logs_dir = logs_dir
        self._job_info_client = JobInfoStorageClient(gcs_aio_client, logs_dir)
        self._gcs_address = gcs_aio_client.address
        self._cluster_id_hex = gcs_aio_client.cluster_id.hex()
        self._log_client = JobLogStorageClient()
        self._supervisor_actor_cls = ray.remote(JobSupervisor)
        # Submission ids that currently have a `_monitor_job` coroutine.
        self.monitored_jobs = set()
        try:
            self.event_logger = get_event_logger(Event.SourceType.JOBS, logs_dir)
        except Exception:
            # Event logging is best-effort; every use below is guarded by
            # `if self.event_logger`.
            self.event_logger = None

        # Set once recovery has finished (successfully or not); awaited by
        # `submit_job`.
        self._recover_running_jobs_event = asyncio.Event()
        run_background_task(self._recover_running_jobs())

    def _get_job_driver_logger(self, job_id: str) -> logging.Logger:
        """Return job driver logger to log messages to the job driver log file.

        If this function is called for the first time, configure the logger.
        """
        job_driver_logger = logging.getLogger(f"{__name__}.driver-{job_id}")

        # Configure the logger if it's not already configured.
        if not job_driver_logger.handlers:
            job_driver_log_path = self._log_client.get_log_file_path(job_id)
            job_driver_handler = logging.FileHandler(job_driver_log_path)
            job_driver_formatter = logging.Formatter(ray_constants.LOGGER_FORMAT)
            job_driver_handler.setFormatter(job_driver_formatter)
            job_driver_logger.addHandler(job_driver_handler)

        return job_driver_logger

    async def _recover_running_jobs(self):
        """Recovers all running jobs from the status client.

        For each job that is not in a terminal state, we will spawn a
        coroutine to monitor it. `_monitor_job` records the job in
        `self.monitored_jobs` while it is being monitored.
        """
        try:
            all_jobs = await self._job_info_client.get_all_jobs()
            for job_id, job_info in all_jobs.items():
                if not job_info.status.is_terminal():
                    run_background_task(self._monitor_job(job_id))
        finally:
            # This event is awaited in `submit_job` to avoid race conditions between
            # recovery and new job submission, so it must always get set even if there
            # are exceptions.
            self._recover_running_jobs_event.set()

    def _get_actor_for_job(self, job_id: str) -> Optional[ActorHandle]:
        """Look up the supervisor actor of a job, or None if it does not exist."""
        try:
            return ray.get_actor(
                JOB_ACTOR_NAME_TEMPLATE.format(job_id=job_id),
                namespace=SUPERVISOR_ACTOR_RAY_NAMESPACE,
            )
        except ValueError:  # Ray returns ValueError for nonexistent actor.
            return None

    async def _monitor_job(
        self, job_id: str, job_supervisor: Optional[ActorHandle] = None
    ):
        """Monitors the specified job until it enters a terminal state.

        This is necessary because we need to handle the case where the
        JobSupervisor dies unexpectedly.

        Args:
            job_id: Submission id of the job to monitor.
            job_supervisor: Handle of the job's supervisor actor, if already
                known; otherwise it is looked up by name.
        """
        if job_id in self.monitored_jobs:
            logger.debug(f"Job {job_id} is already being monitored.")
            return

        self.monitored_jobs.add(job_id)
        try:
            await self._monitor_job_internal(job_id, job_supervisor)
        finally:
            self.monitored_jobs.remove(job_id)

    async def _monitor_job_internal(
        self, job_id: str, job_supervisor: Optional[ActorHandle] = None
    ):
        """Poll the job's supervisor until the job stops being alive.

        Each iteration fails a job that has been PENDING longer than the start
        timeout, resolves the supervisor actor if needed, and pings it. Any
        exception ends the loop: unless the stored status is already terminal,
        the job is marked FAILED with a message derived from the exception,
        and the message is appended to the job driver log. After the loop the
        supervisor actor (if any) is killed.

        Args:
            job_id: Submission id of the job to monitor.
            job_supervisor: Handle of the job's supervisor actor, if already
                known.
        """
        timeout = float(
            os.environ.get(
                RAY_JOB_START_TIMEOUT_SECONDS_ENV_VAR,
                DEFAULT_JOB_START_TIMEOUT_SECONDS,
            )
        )

        is_alive = True

        while is_alive:
            try:
                job_status = await self._job_info_client.get_status(job_id)
                if job_status == JobStatus.PENDING:
                    # Compare the current time with the job start time.
                    # If the job is still pending, we will set the status
                    # to FAILED.
                    job_info = await self._job_info_client.get_info(job_id)

                    # `start_time` is stored in milliseconds.
                    if time.time() - job_info.start_time / 1000 > timeout:
                        err_msg = (
                            "Job supervisor actor failed to start within "
                            f"{timeout} seconds. This timeout can be "
                            f"configured by setting the environment "
                            f"variable {RAY_JOB_START_TIMEOUT_SECONDS_ENV_VAR}."
                        )
                        resources_specified = (
                            (
                                job_info.entrypoint_num_cpus is not None
                                and job_info.entrypoint_num_cpus > 0
                            )
                            or (
                                job_info.entrypoint_num_gpus is not None
                                and job_info.entrypoint_num_gpus > 0
                            )
                            or (
                                job_info.entrypoint_memory is not None
                                and job_info.entrypoint_memory > 0
                            )
                            or (
                                job_info.entrypoint_resources is not None
                                and len(job_info.entrypoint_resources) > 0
                            )
                        )
                        if resources_specified:
                            err_msg += (
                                " This may be because the job entrypoint's specified "
                                "resources (entrypoint_num_cpus, entrypoint_num_gpus, "
                                "entrypoint_resources, entrypoint_memory) "
                                "aren't available on the cluster."
                                " Try checking the cluster's available resources with "
                                "`ray status` and specifying fewer resources for the "
                                "job entrypoint."
                            )
                        await self._job_info_client.put_status(
                            job_id,
                            JobStatus.FAILED,
                            message=err_msg,
                        )
                        is_alive = False
                        logger.error(err_msg)
                        continue

                if job_supervisor is None:
                    job_supervisor = self._get_actor_for_job(job_id)

                if job_supervisor is None:
                    if job_status == JobStatus.PENDING:
                        # Maybe the job supervisor actor is not created yet.
                        # Sleep before the next loop so we do not poll the
                        # job info storage in a tight loop while waiting.
                        await asyncio.sleep(self.JOB_MONITOR_LOOP_PERIOD_S)
                        continue
                    else:
                        # The job supervisor actor is not created, but the job
                        # status is not PENDING. This means the job supervisor
                        # actor is not created due to some unexpected errors.
                        # We will set the job status to FAILED.
                        logger.error(f"Failed to get job supervisor for job {job_id}.")
                        await self._job_info_client.put_status(
                            job_id,
                            JobStatus.FAILED,
                            message=(
                                "Unexpected error occurred: "
                                "failed to get job supervisor."
                            ),
                        )
                        is_alive = False
                        continue

                await job_supervisor.ping.remote()

                await asyncio.sleep(self.JOB_MONITOR_LOOP_PERIOD_S)
            except Exception as e:
                is_alive = False
                job_status = await self._job_info_client.get_status(job_id)
                # Stays empty when the job already reached a terminal state,
                # in which case nothing is written to the driver log.
                job_error_message = ""
                if job_status.is_terminal():
                    # If the job is already in a terminal state, then the actor
                    # exiting is expected.
                    pass
                elif isinstance(e, RuntimeEnvSetupError):
                    logger.info(f"Failed to set up runtime_env for job {job_id}.")
                    job_error_message = f"runtime_env setup failed: {e}"
                    job_status = JobStatus.FAILED
                    await self._job_info_client.put_status(
                        job_id,
                        job_status,
                        message=job_error_message,
                    )
                elif isinstance(e, ActorUnschedulableError):
                    logger.info(
                        f"Failed to schedule job {job_id} because the supervisor actor "
                        f"could not be scheduled: {e}"
                    )
                    job_error_message = (
                        f"Job supervisor actor could not be scheduled: {e}"
                    )
                    # Keep the local status in sync with what is persisted so
                    # the event log below reports FAILED, like the sibling
                    # branches do.
                    job_status = JobStatus.FAILED
                    await self._job_info_client.put_status(
                        job_id,
                        job_status,
                        message=job_error_message,
                    )
                else:
                    logger.warning(
                        f"Job supervisor for job {job_id} failed unexpectedly: {e}."
                    )
                    job_error_message = f"Unexpected error occurred: {e}"
                    job_status = JobStatus.FAILED
                    await self._job_info_client.put_status(
                        job_id,
                        job_status,
                        message=job_error_message,
                    )

                # Log error message to the job driver file for easy access.
                if job_error_message:
                    log_path = self._log_client.get_log_file_path(job_id)
                    os.makedirs(os.path.dirname(log_path), exist_ok=True)
                    with open(log_path, "a") as log_file:
                        log_file.write(job_error_message)

                # Log events
                if self.event_logger:
                    event_log = (
                        f"Completed a ray job {job_id} with a status {job_status}."
                    )
                    if job_error_message:
                        event_log += f" {job_error_message}"
                        self.event_logger.error(event_log, submission_id=job_id)
                    else:
                        self.event_logger.info(event_log, submission_id=job_id)

        # Kill the actor defensively to avoid leaking actors in unexpected error cases.
        if job_supervisor is not None:
            ray.kill(job_supervisor, no_restart=True)

    def _handle_supervisor_startup(self, job_id: str, result: Optional[Exception]):
        """Handle the result of starting a job supervisor actor.

        If started successfully, result should be None. Otherwise it should be
        an Exception.

        NOTE: as written, this method takes no action in either case and
        always returns None; startup failures are handled in `submit_job`
        and `_monitor_job_internal`.
        """
        if result is None:
            return

    def _get_supervisor_runtime_env(
        self,
        user_runtime_env: Optional[Dict[str, Any]],
        submission_id: str,
        resources_specified: bool = False,
    ) -> Dict[str, Any]:
        """Configure and return the runtime_env for the supervisor actor.

        Args:
            user_runtime_env: The runtime_env specified by the user. May be
                None, which is treated as an empty runtime_env.
            submission_id: Submission id of the job; used to locate the job
                driver log file when runtime_env log streaming is enabled.
            resources_specified: Whether the user specified resources in the
                submit_job() call. If so, we will skip the workaround introduced
                in #24546 for GPU detection and just use the user's resource
                requests, so that the behavior matches that of the user specifying
                resources for any other actor.

        Returns:
            The runtime_env for the supervisor actor.
        """
        # Make a copy to avoid mutating passed runtime_env.
        runtime_env = (
            copy.deepcopy(user_runtime_env) if user_runtime_env is not None else {}
        )

        # NOTE(edoakes): Can't use .get(, {}) here because we need to handle the case
        # where env_vars is explicitly set to `None`.
        env_vars = runtime_env.get("env_vars")
        if env_vars is None:
            env_vars = {}

        env_vars[ray_constants.RAY_WORKER_NICENESS] = "0"

        if not resources_specified:
            # Don't set CUDA_VISIBLE_DEVICES for the supervisor actor so the
            # driver can use GPUs if it wants to. This will be removed from
            # the driver's runtime_env so it isn't inherited by tasks & actors.
            env_vars[ray_constants.NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR] = "1"
        runtime_env["env_vars"] = env_vars

        if os.getenv(RAY_STREAM_RUNTIME_ENV_LOG_TO_JOB_DRIVER_LOG_ENV_VAR, "0") == "1":
            config = runtime_env.get("config")
            # Empty fields may be set to None, so we need to check for None explicitly.
            if config is None:
                config = RuntimeEnvConfig()
            config["log_files"] = [self._log_client.get_log_file_path(submission_id)]
            runtime_env["config"] = config
        return runtime_env

    async def _get_scheduling_strategy(
        self, resources_specified: bool
    ) -> SchedulingStrategyT:
        """Get the scheduling strategy for the job.

        If resources_specified is true, or if the environment variable is set to
        allow the job to run on worker nodes, we will use Ray's default actor
        placement strategy. Otherwise, we will force the job to use the head node.

        Args:
            resources_specified: Whether the job specified any resources
                (CPUs, GPUs, or custom resources).

        Returns:
            The scheduling strategy to use for the job.
        """
        if resources_specified:
            return "DEFAULT"

        if os.environ.get(RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES_ENV_VAR, "0") == "1":
            logger.info(
                f"{RAY_JOB_ALLOW_DRIVER_ON_WORKER_NODES_ENV_VAR} was set to 1. "
                "Using Ray's default actor scheduling strategy for the job "
                "driver instead of running it on the head node."
            )
            return "DEFAULT"

        # If the user did not specify any resources or set the driver on worker nodes
        # env var, we will run the driver on the head node.

        head_node_id = await get_head_node_id(self._gcs_aio_client)
        if head_node_id is None:
            logger.info(
                "Head node ID not found in GCS. Using Ray's default actor "
                "scheduling strategy for the job driver instead of running "
                "it on the head node."
            )
            scheduling_strategy = "DEFAULT"
        else:
            logger.info(
                "Head node ID found in GCS; scheduling job driver on "
                f"head node {head_node_id}"
            )
            scheduling_strategy = NodeAffinitySchedulingStrategy(
                node_id=head_node_id, soft=False
            )
        return scheduling_strategy

    async def submit_job(
        self,
        *,
        entrypoint: str,
        submission_id: Optional[str] = None,
        runtime_env: Optional[Dict[str, Any]] = None,
        metadata: Optional[Dict[str, str]] = None,
        entrypoint_num_cpus: Optional[Union[int, float]] = None,
        entrypoint_num_gpus: Optional[Union[int, float]] = None,
        entrypoint_memory: Optional[int] = None,
        entrypoint_resources: Optional[Dict[str, float]] = None,
        _start_signal_actor: Optional[ActorHandle] = None,
    ) -> str:
        """
        Job execution happens asynchronously.

        1) Generate a new unique id for this job submission, each call of this
            method assumes they're independent submission with its own new
            ID, job supervisor actor, and child process.
        2) Create new detached actor with same runtime_env as job spec

        Actual setting up runtime_env, subprocess group, driver command
        execution, subprocess cleaning up and running status update to GCS
        is all handled by job supervisor actor.

        Args:
            entrypoint: Driver command to execute in subprocess shell.
                Represents the entrypoint to start user application.
            submission_id: Id to use for this submission. If None, one is
                generated with `generate_job_id`.
            runtime_env: Runtime environment used to execute driver command,
                which could contain its own ray.init() to configure runtime
                env at ray cluster, task and actor level.
            metadata: Support passing arbitrary data to driver command in
                case needed.
            entrypoint_num_cpus: The quantity of CPU cores to reserve for the execution
                of the entrypoint command, separately from any tasks or actors launched
                by it. Defaults to 0.
            entrypoint_num_gpus: The quantity of GPUs to reserve for
                the entrypoint command, separately from any tasks or actors launched
                by it. Defaults to 0.
            entrypoint_memory: The amount of total available memory for workers
                requesting memory the entrypoint command, separately from any tasks
                or actors launched by it. Defaults to 0.
            entrypoint_resources: The quantity of various custom resources
                to reserve for the entrypoint command, separately from any tasks or
                actors launched by it.
            _start_signal_actor: Used in testing only to capture state
                transitions between PENDING -> RUNNING. Regular user shouldn't
                need this.

        Returns:
            job_id: Generated uuid for further job management. Only valid
                within the same ray cluster.

        Raises:
            ValueError: If a job with the given submission_id already exists.
        """
        if entrypoint_num_cpus is None:
            entrypoint_num_cpus = 0
        if entrypoint_num_gpus is None:
            entrypoint_num_gpus = 0
        if entrypoint_memory is None:
            entrypoint_memory = 0
        if submission_id is None:
            submission_id = generate_job_id()

        # Wait for `_recover_running_jobs` to run before accepting submissions to
        # avoid duplicate monitoring of the same job.
        await self._recover_running_jobs_event.wait()

        logger.info(f"Starting job with submission_id: {submission_id}")
        job_info = JobInfo(
            entrypoint=entrypoint,
            status=JobStatus.PENDING,
            # Milliseconds since the epoch; `_monitor_job_internal` divides
            # by 1000 when checking the start timeout.
            start_time=int(time.time() * 1000),
            metadata=metadata,
            runtime_env=runtime_env,
            entrypoint_num_cpus=entrypoint_num_cpus,
            entrypoint_num_gpus=entrypoint_num_gpus,
            entrypoint_memory=entrypoint_memory,
            entrypoint_resources=entrypoint_resources,
        )
        new_key_added = await self._job_info_client.put_info(
            submission_id, job_info, overwrite=False
        )
        if not new_key_added:
            raise ValueError(
                f"Job with submission_id {submission_id} already exists. "
                "Please use a different submission_id."
            )

        driver_logger = self._get_job_driver_logger(submission_id)
        # Wait for the actor to start up asynchronously so this call always
        # returns immediately and we can catch errors with the actor starting
        # up.
        try:
            resources_specified = any(
                [
                    entrypoint_num_cpus is not None and entrypoint_num_cpus > 0,
                    entrypoint_num_gpus is not None and entrypoint_num_gpus > 0,
                    entrypoint_memory is not None and entrypoint_memory > 0,
                    entrypoint_resources not in [None, {}],
                ]
            )
            scheduling_strategy = await self._get_scheduling_strategy(
                resources_specified
            )
            if self.event_logger:
                self.event_logger.info(
                    f"Started a ray job {submission_id}.", submission_id=submission_id
                )

            driver_logger.info("Runtime env is setting up.")
            supervisor = self._supervisor_actor_cls.options(
                lifetime="detached",
                name=JOB_ACTOR_NAME_TEMPLATE.format(job_id=submission_id),
                num_cpus=entrypoint_num_cpus,
                num_gpus=entrypoint_num_gpus,
                memory=entrypoint_memory,
                resources=entrypoint_resources,
                scheduling_strategy=scheduling_strategy,
                runtime_env=self._get_supervisor_runtime_env(
                    runtime_env, submission_id, resources_specified
                ),
                namespace=SUPERVISOR_ACTOR_RAY_NAMESPACE,
            ).remote(
                submission_id,
                entrypoint,
                metadata or {},
                self._gcs_address,
                self._cluster_id_hex,
                self._logs_dir,
            )
            # Fire and forget: the returned object ref is intentionally not
            # awaited; failures surface through `_monitor_job`.
            supervisor.run.remote(
                _start_signal_actor=_start_signal_actor,
                resources_specified=resources_specified,
            )

            # Monitor the job in the background so we can detect errors without
            # requiring a client to poll.
            run_background_task(
                self._monitor_job(submission_id, job_supervisor=supervisor)
            )
        except Exception as e:
            tb_str = traceback.format_exc()
            driver_logger.warning(
                f"Failed to start supervisor actor for job {submission_id}: '{e}'"
                f". Full traceback:\n{tb_str}"
            )
            await self._job_info_client.put_status(
                submission_id,
                JobStatus.FAILED,
                message=(
                    f"Failed to start supervisor actor {submission_id}: '{e}'"
                    f". Full traceback:\n{tb_str}"
                ),
            )
        finally:
            close_logger_file_descriptor(driver_logger)

        return submission_id

    def stop_job(self, job_id) -> bool:
        """Request a job to exit, fire and forget.

        Returns whether or not the job was running.
        """
        job_supervisor_actor = self._get_actor_for_job(job_id)
        if job_supervisor_actor is not None:
            # Actor is still alive, signal it to stop the driver, fire and
            # forget
            job_supervisor_actor.stop.remote()
            return True
        else:
            return False

    async def delete_job(self, job_id):
        """Delete a job's info and metadata from the cluster.

        Returns:
            True once the job info has been deleted.

        Raises:
            RuntimeError: If the job has no stored status or is not in a
                terminal state.
        """
        job_status = await self._job_info_client.get_status(job_id)

        if job_status is None or not job_status.is_terminal():
            raise RuntimeError(
                f"Attempted to delete job '{job_id}', "
                f"but it is in a non-terminal state {job_status}."
            )

        await self._job_info_client.delete_info(job_id)
        return True

    def job_info_client(self) -> JobInfoStorageClient:
        """Return the job info storage client used by this manager."""
        return self._job_info_client

    async def get_job_status(self, job_id: str) -> Optional[JobStatus]:
        """Get latest status of a job."""
        return await self._job_info_client.get_status(job_id)

    async def get_job_info(self, job_id: str) -> Optional[JobInfo]:
        """Get latest info of a job."""
        return await self._job_info_client.get_info(job_id)

    async def list_jobs(self) -> Dict[str, JobInfo]:
        """Get info for all jobs."""
        return await self._job_info_client.get_all_jobs()

    def get_job_logs(self, job_id: str) -> str:
        """Get all logs produced by a job."""
        return self._log_client.get_logs(job_id)

    async def tail_job_logs(self, job_id: str) -> AsyncIterator[str]:
        """Return an iterator following the logs of a job.

        Yields each batch of new log lines joined into one string.

        Raises:
            RuntimeError: If the job does not exist when tailing starts.
        """
        if await self.get_job_status(job_id) is None:
            raise RuntimeError(f"Job '{job_id}' does not exist.")

        async for lines in self._log_client.tail_logs(job_id):
            if lines is None:
                # Return if the job has exited and there are no new log lines.
                # A missing status means the job was deleted while we were
                # tailing, so there is nothing more to follow either.
                status = await self.get_job_status(job_id)
                if status is None or status.is_terminal():
                    return

                await asyncio.sleep(self.LOG_TAIL_SLEEP_S)
            else:
                yield "".join(lines)
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/job_supervisor.py ADDED
@@ -0,0 +1,477 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import json
3
+ import logging
4
+ import os
5
+ import signal
6
+ import subprocess
7
+ import sys
8
+ import traceback
9
+ from asyncio.tasks import FIRST_COMPLETED
10
+ from typing import Any, Dict, List, Optional
11
+
12
+ import ray
13
+ import ray._private.ray_constants as ray_constants
14
+ from ray._private.gcs_utils import GcsAioClient
15
+ from ray._private.ray_logging.filters import CoreContextFilter
16
+ from ray._private.ray_logging.formatters import JSONFormatter, TextFormatter
17
+ from ray._private.runtime_env.constants import RAY_JOB_CONFIG_JSON_ENV_VAR
18
+ from ray._private.utils import remove_ray_internal_flags_from_env
19
+ from ray.actor import ActorHandle
20
+ from ray.dashboard.modules.job.common import (
21
+ JOB_ID_METADATA_KEY,
22
+ JOB_NAME_METADATA_KEY,
23
+ JobInfoStorageClient,
24
+ )
25
+ from ray.dashboard.modules.job.job_log_storage_client import JobLogStorageClient
26
+ from ray.job_submission import JobStatus
27
+
28
+ import psutil
29
+
30
# asyncio python version compatibility: use `asyncio.create_task` when this
# asyncio provides it, otherwise fall back to `asyncio.ensure_future`.
create_task = getattr(asyncio, "create_task", asyncio.ensure_future)

# Windows requires additional packages for proper process control.
if sys.platform == "win32":
    try:
        import win32api
        import win32con
        import win32job
    except ImportError as import_error:  # ModuleNotFoundError is a subclass.
        # Leave the names defined (as None) so later code can test for them.
        win32api = win32con = win32job = None

        logger = logging.getLogger(__name__)
        logger.warning(
            "Failed to Import win32api. For best usage experience run "
            f"'conda install pywin32'. Import error: {import_error}"
        )
52
+
53
+
54
+ class JobSupervisor:
55
+ """
56
+ Ray actor created by JobManager for each submitted job, responsible to
57
+ setup runtime_env, execute given shell command in subprocess, update job
58
+ status, persist job logs and manage subprocess group cleaning.
59
+
60
+ One job supervisor actor maps to one subprocess, for one job_id.
61
+ Job supervisor actor should fate share with subprocess it created.
62
+ """
63
+
64
+ DEFAULT_RAY_JOB_STOP_WAIT_TIME_S = 3
65
+ SUBPROCESS_POLL_PERIOD_S = 0.1
66
+ VALID_STOP_SIGNALS = ["SIGINT", "SIGTERM"]
67
+
68
def __init__(
    self,
    job_id: str,
    entrypoint: str,
    user_metadata: Dict[str, str],
    gcs_address: str,
    cluster_id_hex: str,
    logs_dir: Optional[str] = None,
):
    """Store the job description and create the clients and logger.

    Args:
        job_id: Id of the job this supervisor is responsible for.
        entrypoint: The entrypoint command of the job.
        user_metadata: Metadata passed by the user; merged over the
            default job id / job name entries.
        gcs_address: Address used to construct the GcsAioClient.
        cluster_id_hex: Cluster id passed to the GcsAioClient.
        logs_dir: Passed through to the JobInfoStorageClient.
    """
    self._job_id = job_id

    # Storage clients for job info and for the driver logs.
    self._job_info_client = JobInfoStorageClient(
        GcsAioClient(address=gcs_address, cluster_id=cluster_id_hex),
        logs_dir,
    )
    self._log_client = JobLogStorageClient()
    self._entrypoint = entrypoint

    # Start from the default metadata, then let user-supplied entries
    # override it.
    self._metadata = {
        JOB_ID_METADATA_KEY: job_id,
        JOB_NAME_METADATA_KEY: job_id,
    }
    self._metadata.update(user_metadata)

    # Signals that the job should be stopped; set in the `stop_job` method.
    self._stop_event = asyncio.Event()

    # Windows Job Object used to handle stopping the child processes.
    self._win32_job_object = None

    # Dedicated per-job logger so JobSupervisor logs land in a separate file.
    supervisor_logger_name = f"{__name__}.supervisor-{job_id}"
    self._logger = logging.getLogger(supervisor_logger_name)
    self._configure_logger()
97
+
98
def _configure_logger(self) -> None:
    """
    Attach a console handler and a per-job file handler to self._logger.

    The file is `jobs/supervisor-<job_id>.log` under the node's logs
    directory. Both handlers share one formatter: JSON when the backend
    JSON logging env var is enabled, plain text otherwise.
    """
    logs_root = ray._private.worker._global_node.get_logs_dir_path()
    supervisor_log_file_name = os.path.join(
        logs_root, f"jobs/supervisor-{self._job_id}.log"
    )
    # The `jobs/` subdirectory may not exist yet.
    os.makedirs(os.path.dirname(supervisor_log_file_name), exist_ok=True)

    self._logger.addFilter(CoreContextFilter())

    handlers = (
        logging.StreamHandler(),
        logging.FileHandler(supervisor_log_file_name),
    )
    use_json = ray_constants.env_bool(
        ray_constants.RAY_BACKEND_LOG_JSON_ENV_VAR, False
    )
    formatter = JSONFormatter() if use_json else TextFormatter()
    for handler in handlers:
        handler.setFormatter(formatter)
    for handler in handlers:
        self._logger.addHandler(handler)

    # Do not pass these records on to ancestor loggers' handlers.
    self._logger.propagate = False
119
+
120
def _get_driver_runtime_env(
    self, resources_specified: bool = False
) -> Dict[str, Any]:
    """Get the runtime env that should be set in the job driver.

    Args:
        resources_specified: Whether the user specified resources (CPUs, GPUs,
            custom resources) in the submit_job request. If so, we will skip
            the workaround for GPU detection introduced in #24546, so that the
            behavior matches that of the user specifying resources for any
            other actor.

    Returns:
        The runtime env that should be set in the job driver. When
        resources_specified is False, the supervisor-only env vars
        (NOSET_CUDA_VISIBLE_DEVICES and the worker niceness) are removed
        from its `env_vars`.
    """
    # Get the runtime_env set for the supervisor actor.
    curr_runtime_env = dict(ray.get_runtime_context().runtime_env)
    if resources_specified:
        return curr_runtime_env
    # Allow CUDA_VISIBLE_DEVICES to be set normally for the driver's tasks
    # & actors.
    # Copy env_vars so the dict held by the supervisor's runtime_env is not
    # mutated, and tolerate a missing or None `env_vars` entry.
    env_vars = dict(curr_runtime_env.get("env_vars") or {})
    # Use a default so a supervisor started without these variables does
    # not fail with KeyError.
    env_vars.pop(ray_constants.NOSET_CUDA_VISIBLE_DEVICES_ENV_VAR, None)
    env_vars.pop(ray_constants.RAY_WORKER_NICENESS, None)
    curr_runtime_env["env_vars"] = env_vars
    return curr_runtime_env
146
+
147
def ping(self):
    """Health-check hook: does nothing and returns None when the actor responds."""
    return None
150
+
151
def _exec_entrypoint(self, env: dict, logs_path: str) -> subprocess.Popen:
    """
    Runs the entrypoint command as a child process, streaming stderr &
    stdout to given log files.

    Unix systems:
    Meanwhile we start a daemon process and group driver
    subprocess in same pgid, such that if job actor dies, entire process
    group also fate share with it.

    Windows systems:
    A jobObject is created to enable fate sharing for the entire process group.

    Args:
        env: Environment variables passed to the child process.
        logs_path: File path on head node's local disk to store driver
            command's stdout & stderr.
    Returns:
        child_process: Child process that runs the driver command. Can be
            terminated or killed upon user calling stop().
    """
    # Open in append mode to avoid overwriting runtime_env setup logs for the
    # supervisor actor, which are also written to the same file.
    with open(logs_path, "a") as logs_file:
        child_process = subprocess.Popen(
            self._entrypoint,
            shell=True,
            start_new_session=True,
            stdout=logs_file,
            stderr=subprocess.STDOUT,
            env=env,
            # Ray intentionally blocks SIGINT in all processes, so if the user wants
            # to stop job through SIGINT, we need to unblock it in the child process
            preexec_fn=(
                (
                    lambda: signal.pthread_sigmask(
                        signal.SIG_UNBLOCK, {signal.SIGINT}
                    )
                )
                if sys.platform != "win32"
                and os.environ.get("RAY_JOB_STOP_SIGNAL") == "SIGINT"
                else None
            ),
        )
        parent_pid = os.getpid()
        child_pid = child_process.pid
        # Create new pgid with new subprocess to execute driver command

        if sys.platform != "win32":
            try:
                child_pgid = os.getpgid(child_pid)
            except ProcessLookupError:
                # Process died before we could get its pgid.
                return child_process

            # Open a new subprocess to kill the child process when the parent
            # process dies kill -s 0 parent_pid will succeed if the parent is
            # alive. If it fails, SIGKILL the child process group and exit
            subprocess.Popen(
                f"while kill -s 0 {parent_pid}; do sleep 1; done; kill -9 -{child_pgid}",  # noqa: E501
                shell=True,
                # Suppress output
                stdout=subprocess.DEVNULL,
                stderr=subprocess.DEVNULL,
            )

        elif sys.platform == "win32" and win32api:
            # Create a JobObject to which the child process (and its children)
            # will be connected. This job object can be used to kill the child
            # processes explicitly or when the jobObject gets deleted during
            # garbage collection.
            self._win32_job_object = win32job.CreateJobObject(None, "")
            win32_job_info = win32job.QueryInformationJobObject(
                self._win32_job_object, win32job.JobObjectExtendedLimitInformation
            )
            win32_job_info["BasicLimitInformation"][
                "LimitFlags"
            ] = win32job.JOB_OBJECT_LIMIT_KILL_ON_JOB_CLOSE
            win32job.SetInformationJobObject(
                self._win32_job_object,
                win32job.JobObjectExtendedLimitInformation,
                win32_job_info,
            )
            child_handle = win32api.OpenProcess(
                win32con.PROCESS_TERMINATE | win32con.PROCESS_SET_QUOTA,
                False,
                child_pid,
            )
            win32job.AssignProcessToJobObject(self._win32_job_object, child_handle)

        return child_process
241
+
242
def _get_driver_env_vars(self, resources_specified: bool) -> Dict[str, str]:
    """Build the environment variables that should be set in the driver.

    Args:
        resources_specified: Forwarded to ``_get_driver_runtime_env``.

    Returns:
        A dict holding the serialized job config, the resolved RAY_ADDRESS
        and ``PYTHONUNBUFFERED``.
    """
    address_var = ray_constants.RAY_ADDRESS_ENVIRONMENT_VARIABLE
    # RAY_ADDRESS may be the dashboard URL but not the gcs address,
    # so when the environment variable is not empty, we force set RAY_ADDRESS
    # to "auto" to avoid function `canonicalize_bootstrap_address_or_die` returning
    # the wrong GCS address.
    # TODO(Jialing He, Archit Kulkarni): Definition of Specification RAY_ADDRESS
    if address_var in os.environ:
        os.environ[address_var] = "auto"
    temp_dir = ray.worker._global_node._ray_params.temp_dir
    ray_addr = ray._private.services.canonicalize_bootstrap_address_or_die(
        "auto", temp_dir
    )
    assert ray_addr is not None

    # Set JobConfig for the child process (runtime_env, metadata).
    job_config = {
        "runtime_env": self._get_driver_runtime_env(resources_specified),
        "metadata": self._metadata,
    }
    driver_env = {RAY_JOB_CONFIG_JSON_ENV_VAR: json.dumps(job_config)}
    # Always set RAY_ADDRESS as find_bootstrap_address address for
    # job submission. In case of local development, prevent user from
    # re-using http://{address}:{dashboard_port} to interact with
    # jobs SDK.
    # TODO:(mwtian) Check why "auto" does not work in entrypoint script
    driver_env[address_var] = ray_addr
    # Set PYTHONUNBUFFERED=1 to stream logs during the job instead of
    # only streaming them upon completion of the job.
    driver_env["PYTHONUNBUFFERED"] = "1"
    return driver_env
273
+
274
+ async def _polling(self, child_process: subprocess.Popen) -> int:
275
+ while child_process is not None:
276
+ return_code = child_process.poll()
277
+ if return_code is not None:
278
+ # subprocess finished with return code
279
+ return return_code
280
+ else:
281
+ # still running, yield control, 0.1s by default
282
+ await asyncio.sleep(self.SUBPROCESS_POLL_PERIOD_S)
283
+
284
async def _poll_all(self, processes: List[psutil.Process]):
    """Return once none of ``processes`` is reported alive any more.

    Each round is a non-blocking ``psutil.wait_procs`` check followed by a
    sleep of ``self.SUBPROCESS_POLL_PERIOD_S`` seconds.
    """
    while True:
        (_gone, still_alive) = psutil.wait_procs(processes, timeout=0)
        if not still_alive:
            return
        await asyncio.sleep(self.SUBPROCESS_POLL_PERIOD_S)
292
+
293
def _kill_processes(self, processes: List[psutil.Process], sig: signal.Signals):
    """Send ``sig`` to each process, ignoring those that no longer exist."""
    for target in processes:
        target_pid = target.pid
        try:
            os.kill(target_pid, sig)
        except ProcessLookupError:
            # Process is already dead
            continue
301
+
302
async def run(
    self,
    # Signal actor used in testing to capture PENDING -> RUNNING cases
    _start_signal_actor: Optional[ActorHandle] = None,
    resources_specified: bool = False,
):
    """
    Stop and start both happen asynchronously, coordinated by asyncio event
    and coroutine, respectively.

    1) Sets job status as running
    2) Pass runtime env and metadata to subprocess as serialized env
        variables.
    3) Handle concurrent events of driver execution and a stop request:
        whichever of "child process exited" and "stop event set" happens
        first decides whether the job ends as SUCCEEDED/FAILED or STOPPED.

    Args:
        _start_signal_actor: Optional actor handle; when given, run() waits
            on its ``wait`` method before moving the job to RUNNING.
        resources_specified: Forwarded to ``_get_driver_env_vars``.

    Raises:
        RuntimeError: If the job info cannot be retrieved, or the job is not
            in PENDING state when run() is called.
    """
    curr_info = await self._job_info_client.get_info(self._job_id)
    if curr_info is None:
        raise RuntimeError(f"Status could not be retrieved for job {self._job_id}.")
    curr_status = curr_info.status
    curr_message = curr_info.message
    if curr_status == JobStatus.RUNNING:
        raise RuntimeError(
            f"Job {self._job_id} is already in RUNNING state. "
            f"JobSupervisor.run() should only be called once. "
        )
    if curr_status != JobStatus.PENDING:
        raise RuntimeError(
            f"Job {self._job_id} is not in PENDING state. "
            f"Current status is {curr_status} with message {curr_message}."
        )

    if _start_signal_actor:
        # Block in PENDING state until start signal received.
        await _start_signal_actor.wait.remote()

    driver_agent_http_address = (
        "http://"
        f"{ray.worker.global_worker.node.node_ip_address}:"
        f"{ray.worker.global_worker.node.dashboard_agent_listen_port}"
    )
    driver_node_id = ray.get_runtime_context().get_node_id()

    await self._job_info_client.put_status(
        self._job_id,
        JobStatus.RUNNING,
        jobinfo_replace_kwargs={
            "driver_agent_http_address": driver_agent_http_address,
            "driver_node_id": driver_node_id,
        },
    )

    try:
        # Configure environment variables for the child process.
        env = os.environ.copy()
        # Remove internal Ray flags. They present because JobSuperVisor itself is
        # a Ray worker process but we don't want to pass them to the driver.
        remove_ray_internal_flags_from_env(env)
        # These will *not* be set in the runtime_env, so they apply to the driver
        # only, not its tasks & actors.
        env.update(self._get_driver_env_vars(resources_specified))

        self._logger.info(
            "Submitting job with RAY_ADDRESS = "
            f"{env[ray_constants.RAY_ADDRESS_ENVIRONMENT_VARIABLE]}"
        )
        log_path = self._log_client.get_log_file_path(self._job_id)
        child_process = self._exec_entrypoint(env, log_path)
        child_pid = child_process.pid

        # Race the child-exit poller against the stop event.
        polling_task = create_task(self._polling(child_process))
        finished, _ = await asyncio.wait(
            [polling_task, create_task(self._stop_event.wait())],
            return_when=FIRST_COMPLETED,
        )

        if self._stop_event.is_set():
            polling_task.cancel()
            if sys.platform == "win32" and self._win32_job_object:
                win32job.TerminateJobObject(self._win32_job_object, -1)
            elif sys.platform != "win32":
                stop_signal = os.environ.get("RAY_JOB_STOP_SIGNAL", "SIGTERM")
                if stop_signal not in self.VALID_STOP_SIGNALS:
                    self._logger.warning(
                        f"{stop_signal} not a valid stop signal. Terminating "
                        "job with SIGTERM."
                    )
                    stop_signal = "SIGTERM"

                # NOTE(review): psutil.Process() presumably raises if the child
                # has already exited; that would land in the generic handler
                # below and report FAILED rather than STOPPED -- confirm.
                job_process = psutil.Process(child_pid)
                proc_to_kill = [job_process] + job_process.children(recursive=True)

                # Send stop signal and wait for job to terminate gracefully,
                # otherwise SIGKILL job forcefully after timeout.
                self._kill_processes(proc_to_kill, getattr(signal, stop_signal))
                try:
                    stop_job_wait_time = int(
                        os.environ.get(
                            "RAY_JOB_STOP_WAIT_TIME_S",
                            self.DEFAULT_RAY_JOB_STOP_WAIT_TIME_S,
                        )
                    )
                    poll_job_stop_task = create_task(self._poll_all(proc_to_kill))
                    await asyncio.wait_for(poll_job_stop_task, stop_job_wait_time)
                    self._logger.info(
                        f"Job {self._job_id} has been terminated gracefully "
                        f"with {stop_signal}."
                    )
                except asyncio.TimeoutError:
                    self._logger.warning(
                        f"Attempt to gracefully terminate job {self._job_id} "
                        f"through {stop_signal} has timed out after "
                        f"{stop_job_wait_time} seconds. Job is now being "
                        "force-killed with SIGKILL."
                    )
                    self._kill_processes(proc_to_kill, signal.SIGKILL)

            await self._job_info_client.put_status(self._job_id, JobStatus.STOPPED)
        else:
            # Child process finished execution and no stop event is set
            # at the same time
            assert len(finished) == 1, "Should have only one coroutine done"
            [child_process_task] = finished
            return_code = child_process_task.result()
            self._logger.info(
                f"Job {self._job_id} entrypoint command "
                f"exited with code {return_code}"
            )
            if return_code == 0:
                await self._job_info_client.put_status(
                    self._job_id,
                    JobStatus.SUCCEEDED,
                    driver_exit_code=return_code,
                )
            else:
                # Attach the tail of the driver log to the failure message.
                log_tail = await self._log_client.get_last_n_log_lines(self._job_id)
                if log_tail is not None and log_tail != "":
                    message = (
                        "Job entrypoint command "
                        f"failed with exit code {return_code}, "
                        "last available logs (truncated to 20,000 chars):\n"
                        + log_tail
                    )
                else:
                    message = (
                        "Job entrypoint command "
                        f"failed with exit code {return_code}. No logs available."
                    )
                await self._job_info_client.put_status(
                    self._job_id,
                    JobStatus.FAILED,
                    message=message,
                    driver_exit_code=return_code,
                )
    except Exception:
        self._logger.error(
            "Got unexpected exception while trying to execute driver "
            f"command. {traceback.format_exc()}"
        )
        try:
            await self._job_info_client.put_status(
                self._job_id,
                JobStatus.FAILED,
                message=traceback.format_exc(),
            )
        except Exception:
            self._logger.error(
                "Failed to update job status to FAILED. "
                f"Exception: {traceback.format_exc()}"
            )
    finally:
        # clean up actor after tasks are finished
        ray.actor.exit_actor()
474
+
475
def stop(self):
    """Set stop_event and let run() handle the rest in its asyncio.wait()."""
    stop_event = self._stop_event
    stop_event.set()
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/pydantic_models.py ADDED
@@ -0,0 +1,110 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from enum import Enum
2
+ from typing import Any, Dict, Optional
3
+
4
+ from ray._private.pydantic_compat import PYDANTIC_INSTALLED, BaseModel, Field
5
+ from ray.dashboard.modules.job.common import JobStatus
6
+ from ray.util.annotations import PublicAPI
7
+
8
+ # Pydantic is not part of the minimal Ray installation.
9
# The models are only defined when pydantic is importable; otherwise the
# names are bound to None so that importing this module still succeeds.
if PYDANTIC_INSTALLED:

    @PublicAPI(stability="beta")
    class DriverInfo(BaseModel):
        """A class for recording information about the driver related to the job."""

        id: str = Field(..., description="The id of the driver")
        node_ip_address: str = Field(
            ..., description="The IP address of the node the driver is running on."
        )
        pid: str = Field(
            ..., description="The PID of the worker process the driver is using."
        )
        # TODO(aguo): Add node_id as a field.

    @PublicAPI(stability="beta")
    class JobType(str, Enum):
        """An enumeration for describing the different job types.

        NOTE:
            This field is still experimental and may change in the future.
        """

        #: A job that was initiated by the Ray Jobs API.
        SUBMISSION = "SUBMISSION"
        #: A job that was initiated by a driver script.
        DRIVER = "DRIVER"

    @PublicAPI(stability="beta")
    class JobDetails(BaseModel):
        """
        Job data with extra details about its driver and its submission.
        """

        type: JobType = Field(..., description="The type of job.")
        job_id: Optional[str] = Field(
            None,
            description="The job ID. An ID that is created for every job that is "
            "launched in Ray. This can be used to fetch data about jobs using Ray "
            "Core APIs.",
        )
        # The first fragment must end with a space: adjacent string literals
        # are concatenated verbatim ("via" + "the" previously gave "viathe").
        submission_id: Optional[str] = Field(
            None,
            description="A submission ID is an ID created for every job submitted via "
            "the Ray Jobs API. It can "
            "be used to fetch data about jobs using the Ray Jobs API.",
        )
        driver_info: Optional[DriverInfo] = Field(
            None,
            description="The driver related to this job. For jobs submitted via "
            "the Ray Jobs API, "
            "it is the last driver launched by that job submission, "
            "or None if there is no driver.",
        )

        # The following fields are copied from JobInfo.
        # TODO(aguo): Inherit from JobInfo once it's migrated to pydantic.
        status: JobStatus = Field(..., description="The status of the job.")
        entrypoint: str = Field(..., description="The entrypoint command for this job.")
        message: Optional[str] = Field(
            None, description="A message describing the status in more detail."
        )
        error_type: Optional[str] = Field(
            None, description="Internal error or user script error."
        )
        start_time: Optional[int] = Field(
            None,
            description="The time when the job was started. " "A Unix timestamp in ms.",
        )
        end_time: Optional[int] = Field(
            None,
            description="The time when the job moved into a terminal state. "
            "A Unix timestamp in ms.",
        )
        metadata: Optional[Dict[str, str]] = Field(
            None, description="Arbitrary user-provided metadata for the job."
        )
        runtime_env: Optional[Dict[str, Any]] = Field(
            None, description="The runtime environment for the job."
        )
        # the node info where the driver running on.
        #     - driver_agent_http_address: this node's agent http address
        #     - driver_node_id: this node's id.
        driver_agent_http_address: Optional[str] = Field(
            None,
            description="The HTTP address of the JobAgent on the node the job "
            "entrypoint command is running on.",
        )
        driver_node_id: Optional[str] = Field(
            None,
            description="The ID of the node the job entrypoint command is running on.",
        )
        driver_exit_code: Optional[int] = Field(
            None,
            description="The driver process exit code after the driver executed. "
            "Return None if driver doesn't finish executing.",
        )

else:
    DriverInfo = None
    JobType = None
    JobDetails = None
+ JobDetails = None
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/sdk.py ADDED
@@ -0,0 +1,492 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import dataclasses
2
+ import logging
3
+ from typing import Any, AsyncIterator, Dict, List, Optional, Union
4
+
5
+ import packaging.version
6
+
7
+ import ray
8
+ from ray.dashboard.modules.dashboard_sdk import SubmissionClient
9
+ from ray.dashboard.modules.job.common import (
10
+ JobDeleteResponse,
11
+ JobLogsResponse,
12
+ JobStatus,
13
+ JobStopResponse,
14
+ JobSubmitRequest,
15
+ JobSubmitResponse,
16
+ )
17
+ from ray.dashboard.modules.job.pydantic_models import JobDetails
18
+ from ray.dashboard.modules.job.utils import strip_keys_with_value_none
19
+ from ray.dashboard.utils import get_address_for_submission_client
20
+ from ray.runtime_env import RuntimeEnv
21
+ from ray.util.annotations import PublicAPI
22
+
23
+ try:
24
+ import aiohttp
25
+ import requests
26
+ except ImportError:
27
+ aiohttp = None
28
+ requests = None
29
+
30
+
31
+ logger = logging.getLogger(__name__)
32
+ logger.setLevel(logging.INFO)
33
+
34
+
35
class JobSubmissionClient(SubmissionClient):
    """A local client for submitting and interacting with jobs on a remote cluster.

    Submits requests over HTTP to the job server on the cluster using the REST API.


    Args:
        address: Either (1) the address of the Ray cluster, or (2) the HTTP address
            of the dashboard server on the head node, e.g. "http://<head-node-ip>:8265".
            In case (1) it must be specified as an address that can be passed to
            ray.init(), e.g. a Ray Client address (ray://<head_node_host>:10001),
            or "auto", or "localhost:<port>". If unspecified, will try to connect to
            a running local Ray cluster. This argument is always overridden by the
            RAY_ADDRESS environment variable.
        create_cluster_if_needed: Indicates whether the cluster at the specified
            address needs to already be running. Ray doesn't start a cluster
            before interacting with jobs, but third-party job managers may do so.
        cookies: Cookies to use when sending requests to the HTTP job server.
        metadata: Arbitrary metadata to store along with all jobs. New metadata
            specified per job will be merged with the global metadata provided here
            via a simple dict update.
        headers: Headers to use when sending requests to the HTTP job server, used
            for cases like authentication to a remote cluster.
        verify: Boolean indication to verify the server's TLS certificate or a path to
            a file or directory of trusted certificates. Default: True.
    """

    def __init__(
        self,
        address: Optional[str] = None,
        create_cluster_if_needed: bool = False,
        cookies: Optional[Dict[str, Any]] = None,
        metadata: Optional[Dict[str, Any]] = None,
        headers: Optional[Dict[str, Any]] = None,
        verify: Optional[Union[str, bool]] = True,
    ):
        """Initialize a JobSubmissionClient and check the connection to the cluster.

        Raises:
            RuntimeError: If the `requests` dependency is not installed.
            TypeError: If an argument has an unsupported type.
        """
        self._client_ray_version = ray.__version__
        if requests is None:
            raise RuntimeError(
                "The Ray jobs CLI & SDK require the ray[default] "
                "installation: `pip install 'ray[default]'`"
            )
        # Check types of arguments
        if address is not None and not isinstance(address, str):
            raise TypeError(f"address must be a string, got {type(address)}")
        if not isinstance(create_cluster_if_needed, bool):
            raise TypeError(
                f"create_cluster_if_needed must be a bool, got"
                f" {type(create_cluster_if_needed)}"
            )
        if cookies is not None and not isinstance(cookies, dict):
            raise TypeError(f"cookies must be a dict, got {type(cookies)}")
        if metadata is not None and not isinstance(metadata, dict):
            raise TypeError(f"metadata must be a dict, got {type(metadata)}")
        if headers is not None and not isinstance(headers, dict):
            raise TypeError(f"headers must be a dict, got {type(headers)}")
        if not isinstance(verify, (str, bool)):
            raise TypeError(f"verify must be a str or bool, got {type(verify)}")

        api_server_url = get_address_for_submission_client(address)

        super().__init__(
            address=api_server_url,
            create_cluster_if_needed=create_cluster_if_needed,
            cookies=cookies,
            metadata=metadata,
            headers=headers,
            verify=verify,
        )
        self._check_connection_and_version(
            min_version="1.9",
            version_error_message="Jobs API is not supported on the Ray "
            "cluster. Please ensure the cluster is "
            "running Ray 1.9 or higher.",
        )

        # In ray>=2.0, the client sends the new kwarg `submission_id` to the server
        # upon every job submission, which causes servers with ray<2.0 to error.
        # ``>=`` (not ``>``) so that a 2.0.0 client, which compares equal to
        # "2.0", is also checked.
        if packaging.version.parse(
            self._client_ray_version
        ) >= packaging.version.parse("2.0"):
            self._check_connection_and_version(
                min_version="2.0",
                version_error_message=f"Client Ray version {self._client_ray_version} "
                "is not compatible with the Ray cluster. Please ensure the cluster is "
                "running Ray 2.0 or higher or downgrade the client Ray version.",
            )

    @PublicAPI(stability="stable")
    def submit_job(
        self,
        *,
        entrypoint: str,
        job_id: Optional[str] = None,
        runtime_env: Optional[Dict[str, Any]] = None,
        metadata: Optional[Dict[str, str]] = None,
        submission_id: Optional[str] = None,
        entrypoint_num_cpus: Optional[Union[int, float]] = None,
        entrypoint_num_gpus: Optional[Union[int, float]] = None,
        entrypoint_memory: Optional[int] = None,
        entrypoint_resources: Optional[Dict[str, float]] = None,
    ) -> str:
        """Submit and execute a job asynchronously.

        When a job is submitted, it runs once to completion or failure. Retries or
        different runs with different parameters should be handled by the
        submitter. Jobs are bound to the lifetime of a Ray cluster, so if the
        cluster goes down, all running jobs on that cluster will be terminated.

        Example:
            >>> from ray.job_submission import JobSubmissionClient
            >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
            >>> client.submit_job( # doctest: +SKIP
            ...     entrypoint="python script.py",
            ...     runtime_env={
            ...         "working_dir": "./",
            ...         "pip": ["requests==2.26.0"]
            ...     }
            ... )  # doctest: +SKIP
            'raysubmit_4LamXRuQpYdSMg7J'

        Args:
            entrypoint: The shell command to run for this job.
            submission_id: A unique ID for this job.
            runtime_env: The runtime environment to install and run this job in.
            metadata: Arbitrary data to store along with this job.
            job_id: DEPRECATED. This has been renamed to submission_id
            entrypoint_num_cpus: The quantity of CPU cores to reserve for the execution
                of the entrypoint command, separately from any tasks or actors launched
                by it. Defaults to 0.
            entrypoint_num_gpus: The quantity of GPUs to reserve for the execution
                of the entrypoint command, separately from any tasks or actors launched
                by it. Defaults to 0.
            entrypoint_memory: The quantity of memory to reserve for the
                execution of the entrypoint command, separately from any tasks or
                actors launched by it. Defaults to 0.
            entrypoint_resources: The quantity of custom resources to reserve for the
                execution of the entrypoint command, separately from any tasks or
                actors launched by it.

        Returns:
            The submission ID of the submitted job.  If not specified,
            this is a randomly generated unique ID.

        Raises:
            RuntimeError: If the request to the job server fails, or if the specified
                submission_id has already been used by a job on this cluster.
            ValueError: If `worker_process_setup_hook` in the runtime env is
                set to something other than a string.
        """
        if job_id:
            logger.warning(
                "job_id kwarg is deprecated. Please use submission_id instead."
            )

        if entrypoint_num_cpus or entrypoint_num_gpus or entrypoint_resources:
            self._check_connection_and_version(
                min_version="2.2",
                version_error_message="`entrypoint_num_cpus`, `entrypoint_num_gpus`, "
                "and `entrypoint_resources` kwargs "
                "are not supported on the Ray cluster. Please ensure the cluster is "
                "running Ray 2.2 or higher.",
            )

        if entrypoint_memory:
            self._check_connection_and_version(
                min_version="2.8",
                version_error_message="`entrypoint_memory` kwarg "
                "is not supported on the Ray cluster. Please ensure the cluster is "
                "running Ray 2.8 or higher.",
            )

        runtime_env = runtime_env or {}
        # Merge into a fresh dict so the caller's ``metadata`` argument is not
        # mutated; the client's default metadata still wins on key clashes,
        # exactly as the previous in-place ``update`` did.
        metadata = {**(metadata or {}), **self._default_metadata}

        self._upload_working_dir_if_needed(runtime_env)
        self._upload_py_modules_if_needed(runtime_env)

        # Verify worker_process_setup_hook type.
        setup_hook = runtime_env.get("worker_process_setup_hook")
        if setup_hook and not isinstance(setup_hook, str):
            raise ValueError(
                f"Invalid type {type(setup_hook)} for `worker_process_setup_hook`. "
                "When a job submission API is used, `worker_process_setup_hook` "
                "only allows a string type (module name). "
                "Specify `worker_process_setup_hook` via "
                "ray.init within a driver to use a `Callable` type. "
            )

        # Run the RuntimeEnv constructor to parse local pip/conda requirements files.
        runtime_env = RuntimeEnv(**runtime_env).to_dict()

        submission_id = submission_id or job_id
        req = JobSubmitRequest(
            entrypoint=entrypoint,
            submission_id=submission_id,
            runtime_env=runtime_env,
            metadata=metadata,
            entrypoint_num_cpus=entrypoint_num_cpus,
            entrypoint_num_gpus=entrypoint_num_gpus,
            entrypoint_memory=entrypoint_memory,
            entrypoint_resources=entrypoint_resources,
        )

        # Remove keys with value None so that new clients with new optional fields
        # are still compatible with older servers.  This is also done on the server,
        # but we do it here as well to be extra defensive.
        json_data = strip_keys_with_value_none(dataclasses.asdict(req))

        logger.debug(f"Submitting job with submission_id={submission_id}.")
        r = self._do_request("POST", "/api/jobs/", json_data=json_data)

        if r.status_code == 200:
            return JobSubmitResponse(**r.json()).submission_id
        else:
            self._raise_error(r)

    @PublicAPI(stability="stable")
    def stop_job(
        self,
        job_id: str,
    ) -> bool:
        """Request a job to exit asynchronously.

        Attempts to terminate process first, then kills process after timeout.

        Example:
            >>> from ray.job_submission import JobSubmissionClient
            >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
            >>> sub_id = client.submit_job(entrypoint="sleep 10") # doctest: +SKIP
            >>> client.stop_job(sub_id) # doctest: +SKIP
            True

        Args:
            job_id: The job ID or submission ID for the job to be stopped.

        Returns:
            True if the job was running, otherwise False.

        Raises:
            RuntimeError: If the job does not exist or if the request to the
                job server fails.
        """
        logger.debug(f"Stopping job with job_id={job_id}.")
        r = self._do_request("POST", f"/api/jobs/{job_id}/stop")

        if r.status_code == 200:
            return JobStopResponse(**r.json()).stopped
        else:
            self._raise_error(r)

    @PublicAPI(stability="stable")
    def delete_job(
        self,
        job_id: str,
    ) -> bool:
        """Delete a job in a terminal state and all of its associated data.

        If the job is not already in a terminal state, raises an error.
        This does not delete the job logs from disk.
        Submitting a job with the same submission ID as a previously
        deleted job is not supported and may lead to unexpected behavior.

        Example:
            >>> from ray.job_submission import JobSubmissionClient
            >>> client = JobSubmissionClient() # doctest: +SKIP
            >>> job_id = client.submit_job(entrypoint="echo hello") # doctest: +SKIP
            >>> client.delete_job(job_id) # doctest: +SKIP
            True

        Args:
            job_id: submission ID for the job to be deleted.

        Returns:
            True if the job was deleted, otherwise False.

        Raises:
            RuntimeError: If the job does not exist, if the request to the
                job server fails, or if the job is not in a terminal state.
        """
        logger.debug(f"Deleting job with job_id={job_id}.")
        r = self._do_request("DELETE", f"/api/jobs/{job_id}")

        if r.status_code == 200:
            return JobDeleteResponse(**r.json()).deleted
        else:
            self._raise_error(r)

    @PublicAPI(stability="stable")
    def get_job_info(
        self,
        job_id: str,
    ) -> JobDetails:
        """Get the latest status and other information associated with a job.

        Example:
            >>> from ray.job_submission import JobSubmissionClient
            >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
            >>> submission_id = client.submit_job(entrypoint="sleep 1") # doctest: +SKIP
            >>> client.get_job_info(submission_id) # doctest: +SKIP
            JobDetails(status='SUCCEEDED', message='Job finished successfully.',
            error_type=None, start_time=1647388711, end_time=1647388712,
            metadata={}, runtime_env={})

        Args:
            job_id: The job ID or submission ID of the job whose information
                is being requested.

        Returns:
            The JobDetails for the job.

        Raises:
            RuntimeError: If the job does not exist or if the request to the
                job server fails.
        """
        r = self._do_request("GET", f"/api/jobs/{job_id}")

        if r.status_code == 200:
            return JobDetails(**r.json())
        else:
            self._raise_error(r)

    @PublicAPI(stability="stable")
    def list_jobs(self) -> List[JobDetails]:
        """List all jobs along with their status and other information.

        Lists all jobs that have ever run on the cluster, including jobs that are
        currently running and jobs that are no longer running.

        Example:
            >>> from ray.job_submission import JobSubmissionClient
            >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
            >>> client.submit_job(entrypoint="echo hello") # doctest: +SKIP
            >>> client.submit_job(entrypoint="sleep 2") # doctest: +SKIP
            >>> client.list_jobs() # doctest: +SKIP
            [JobDetails(status='SUCCEEDED',
            job_id='03000000', type='submission',
            submission_id='raysubmit_4LamXRuQpYdSMg7J',
            message='Job finished successfully.', error_type=None,
            start_time=1647388711, end_time=1647388712, metadata={}, runtime_env={}),
            JobDetails(status='RUNNING',
            job_id='04000000', type='submission',
            submission_id='raysubmit_1dxCeNvG1fCMVNHG',
            message='Job is currently running.', error_type=None,
            start_time=1647454832, end_time=None, metadata={}, runtime_env={})]

        Returns:
            A list of JobDetails, one entry per job.

        Raises:
            RuntimeError: If the request to the job server fails.
        """
        r = self._do_request("GET", "/api/jobs/")

        if r.status_code == 200:
            jobs_info_json = r.json()
            jobs_info = [
                JobDetails(**job_info_json) for job_info_json in jobs_info_json
            ]
            return jobs_info
        else:
            self._raise_error(r)

    @PublicAPI(stability="stable")
    def get_job_status(self, job_id: str) -> JobStatus:
        """Get the most recent status of a job.

        Example:
            >>> from ray.job_submission import JobSubmissionClient
            >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
            >>> client.submit_job(entrypoint="echo hello") # doctest: +SKIP
            >>> client.get_job_status("raysubmit_4LamXRuQpYdSMg7J") # doctest: +SKIP
            'SUCCEEDED'

        Args:
            job_id: The job ID or submission ID of the job whose status is being
                requested.

        Returns:
            The JobStatus of the job.

        Raises:
            RuntimeError: If the job does not exist or if the request to the
                job server fails.
        """
        return self.get_job_info(job_id).status

    @PublicAPI(stability="stable")
    def get_job_logs(self, job_id: str) -> str:
        """Get all logs produced by a job.

        Example:
            >>> from ray.job_submission import JobSubmissionClient
            >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
            >>> sub_id = client.submit_job(entrypoint="echo hello") # doctest: +SKIP
            >>> client.get_job_logs(sub_id) # doctest: +SKIP
            'hello\\n'

        Args:
            job_id: The job ID or submission ID of the job whose logs are being
                requested.

        Returns:
            A string containing the full logs of the job.

        Raises:
            RuntimeError: If the job does not exist or if the request to the
                job server fails.
        """
        r = self._do_request("GET", f"/api/jobs/{job_id}/logs")

        if r.status_code == 200:
            return JobLogsResponse(**r.json()).logs
        else:
            self._raise_error(r)

    @PublicAPI(stability="stable")
    async def tail_job_logs(self, job_id: str) -> AsyncIterator[str]:
        """Get an iterator that follows the logs of a job.

        Iteration ends when the websocket reports that it is closed or that
        an error occurred.

        Example:
            >>> from ray.job_submission import JobSubmissionClient
            >>> client = JobSubmissionClient("http://127.0.0.1:8265") # doctest: +SKIP
            >>> submission_id = client.submit_job( # doctest: +SKIP
            ...     entrypoint="echo hi && sleep 5 && echo hi2")
            >>> async for lines in client.tail_job_logs( # doctest: +SKIP
            ...           'raysubmit_Xe7cvjyGJCyuCvm2'):
            ...     print(lines, end="") # doctest: +SKIP
            hi
            hi2

        Args:
            job_id: The job ID or submission ID of the job whose logs are being
                requested.

        Returns:
            The iterator.

        Raises:
            RuntimeError: If the job does not exist or if the request to the
                job server fails.
        """
        async with aiohttp.ClientSession(
            cookies=self._cookies, headers=self._headers
        ) as session:
            ws = await session.ws_connect(
                f"{self._address}/api/jobs/{job_id}/logs/tail", ssl=self._ssl_context
            )

            while True:
                msg = await ws.receive()

                if msg.type == aiohttp.WSMsgType.TEXT:
                    yield msg.data
                elif msg.type == aiohttp.WSMsgType.CLOSED:
                    break
                elif msg.type == aiohttp.WSMsgType.ERROR:
                    # Surface the error and stop, rather than silently
                    # ignoring it and polling the socket again.
                    logger.warning(
                        f"Error while tailing logs for job {job_id}: {msg.data}"
                    )
                    break
.venv/lib/python3.11/site-packages/ray/dashboard/modules/job/utils.py ADDED
@@ -0,0 +1,304 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import dataclasses
3
+ import logging
4
+ import os
5
+ import re
6
+ import traceback
7
+ from dataclasses import dataclass
8
+ from typing import Any, AsyncIterator, Dict, List, Optional, Tuple, Union
9
+
10
+ from ray._private import ray_constants
11
+ from ray._private.gcs_utils import GcsAioClient
12
+ from ray.dashboard.modules.job.common import (
13
+ JOB_ID_METADATA_KEY,
14
+ JobInfoStorageClient,
15
+ JobStatus,
16
+ validate_request_type,
17
+ )
18
+ from ray.dashboard.modules.job.pydantic_models import DriverInfo, JobDetails, JobType
19
+ from ray.runtime_env import RuntimeEnv
20
+
21
+ try:
22
+ # package `aiohttp` is not in ray's minimal dependencies
23
+ import aiohttp
24
+ from aiohttp.web import Request, Response
25
+ except Exception:
26
+ aiohttp = None
27
+ Request = None
28
+ Response = None
29
+
30
+
31
+ logger = logging.getLogger(__name__)
32
+
33
+ MAX_CHUNK_LINE_LENGTH = 10
34
+ MAX_CHUNK_CHAR_LENGTH = 20000
35
+
36
+
37
async def get_head_node_id(gcs_aio_client: GcsAioClient) -> Optional[str]:
    """Look up the head node id stored in the GCS internal KV store.

    Args:
        gcs_aio_client: Async GCS client used for the KV lookup.

    Returns:
        The decoded head node id, or None when the key has no value.
    """
    raw_value = await gcs_aio_client.internal_kv_get(
        ray_constants.KV_HEAD_NODE_ID_KEY,
        namespace=ray_constants.KV_NAMESPACE_JOB,
        timeout=30,
    )

    if raw_value is None:
        return None
    return raw_value.decode()
46
+
47
+
48
def strip_keys_with_value_none(d: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``d`` without the entries whose value is None.

    Only ``None`` is dropped; other falsy values (0, "", [], ...) are kept.
    The input dictionary is not modified.
    """
    kept: Dict[str, Any] = {}
    for key, value in d.items():
        if value is None:
            continue
        kept[key] = value
    return kept
51
+
52
+
53
def redact_url_password(url: str) -> str:
    """Redact any passwords embedded in http(s) URLs.

    The password is the text between the first ``:`` and the last ``@`` of the
    URL authority (the part before the first ``/``, ``?`` or ``#``). Bounding
    the match to the authority keeps a port or an ``@`` later in the path from
    being mistaken for credentials, and keeps a password that itself contains
    ``:`` or ``@`` from being only partially redacted.

    Args:
        url: The URL (or text containing URLs) to sanitize.

    Returns:
        The input with each password replaced by ``<redacted>``; unchanged
        when no credentials are present.
    """
    return re.sub(
        # group 1: scheme + user name (no ':', '@' or authority terminators)
        # then ':' + password, greedy up to the last '@' inside the authority.
        r"(https?://[^\s:@/?#]*):[^\s/?#]*@",
        r"\1:<redacted>@",
        url,
    )
60
+
61
+
62
async def file_tail_iterator(path: str) -> AsyncIterator[Optional[List[str]]]:
    """Yield lines from a file as it's written.

    Lines are yielded in batches. A batch is flushed once it holds
    ``MAX_CHUNK_LINE_LENGTH`` lines, once it holds more than
    ``MAX_CHUNK_CHAR_LENGTH`` characters, or when the end of the file is
    reached, whichever comes first. Because a read can hit the end of the
    file in the middle of a line that is still being written, the last line
    of a batch may be incomplete. New line characters are kept in the line
    strings.

    Yields None while the file does not exist yet, and whenever the end of
    the file is reached with no pending lines.

    Args:
        path: Path of the file to follow.

    Raises:
        TypeError: If ``path`` is not a string.
    """
    if not isinstance(path, str):
        raise TypeError(f"path must be a string, got {type(path)}.")

    # No sleep here: pacing while the file is missing is left to the consumer.
    while not os.path.exists(path):
        logger.debug(f"Path {path} doesn't exist yet.")
        yield None

    EOF = ""

    with open(path, "r") as f:
        lines = []

        chunk_char_count = 0
        curr_line = None

        while True:
            # Flush the current chunk when any of these holds:
            #   - it accumulated MAX_CHUNK_LINE_LENGTH lines
            #   - it accumulated more than MAX_CHUNK_CHAR_LENGTH total chars
            #   - the previous read hit EOF
            if (
                len(lines) >= MAX_CHUNK_LINE_LENGTH
                or chunk_char_count > MAX_CHUNK_CHAR_LENGTH
                or curr_line == EOF
            ):
                # An empty chunk is reported as None rather than [].
                yield lines or None

                lines = []
                chunk_char_count = 0

            # Read next line
            curr_line = f.readline()

            # `readline` will return
            #   - '' for EOF
            #   - '\n' for an empty line in the file
            if curr_line != EOF:
                # Add line to current chunk
                lines.append(curr_line)
                chunk_char_count += len(curr_line)
            else:
                # If EOF is reached sleep for 1s before continuing
                await asyncio.sleep(1)
117
+
118
+
119
async def parse_and_validate_request(
    req: Request, request_type: dataclass
) -> Union[dataclass, Response]:
    """Parse request and cast to request type.

    Remove keys with value None to allow newer client versions with new optional fields
    to work with older servers.

    If parsing failed, return a Response object with status 400 and stacktrace instead.
    This covers a body that is not valid JSON or not a JSON object, as well as
    a payload that fails validation against ``request_type``.

    Args:
        req: aiohttp request object.
        request_type: dataclass type to cast request to.

    Returns:
        Parsed request object or Response object with status 400 and stacktrace.
    """
    import aiohttp

    try:
        # Decoding the body is part of "parsing": keep it inside the try so a
        # malformed payload yields the documented 400 instead of propagating.
        json_data = strip_keys_with_value_none(await req.json())
        return validate_request_type(json_data, request_type)
    except Exception as e:
        logger.info(f"Got invalid request type: {e}")
        return Response(
            text=traceback.format_exc(),
            status=aiohttp.web.HTTPBadRequest.status_code,
        )
147
+
148
+
149
async def get_driver_jobs(
    gcs_aio_client: GcsAioClient,
    job_or_submission_id: Optional[str] = None,
    timeout: Optional[int] = None,
) -> Tuple[Dict[str, JobDetails], Dict[str, DriverInfo]]:
    """Collect driver information for the jobs known to GCS.

    Two dictionaries are returned:

    * all driver jobs (jobs without a submission id in their metadata),
      keyed by job id;
    * the drivers that belong to submission jobs, keyed by submission id.
      When a submission has several drivers only the last one, in job id
      order, is kept.

    Jobs running in a Ray-internal namespace are ignored.

    Args:
        gcs_aio_client: Async GCS client used to list the jobs.
        job_or_submission_id: Optional filter forwarded to GCS so that only
            jobs with this job id or submission id are returned.
        timeout: Optional timeout forwarded to the GCS call.
    """
    job_infos = await gcs_aio_client.get_all_job_info(
        job_or_submission_id=job_or_submission_id,
        skip_submission_job_info_field=True,
        skip_is_running_tasks_field=True,
        timeout=timeout,
    )

    # Visit jobs in job id order so that, for a submission with several
    # drivers, the last assignment below is the most recent driver.
    ordered_entries = sorted(job_infos.values(), key=lambda entry: entry.job_id.hex())

    jobs = {}
    submission_job_drivers = {}
    for entry in ordered_entries:
        namespace = entry.config.ray_namespace
        if namespace.startswith(ray_constants.RAY_INTERNAL_NAMESPACE_PREFIX):
            # Skip jobs in any _ray_internal_ namespace
            continue

        job_id = entry.job_id.hex()
        metadata = dict(entry.config.metadata)
        job_submission_id = metadata.get(JOB_ID_METADATA_KEY)

        # The driver description is the same for both kinds of job.
        driver = DriverInfo(
            id=job_id,
            node_ip_address=entry.driver_address.ip_address,
            pid=str(entry.driver_pid),
        )

        if job_submission_id:
            submission_job_drivers[job_submission_id] = driver
            continue

        if entry.is_dead:
            status = JobStatus.SUCCEEDED
        else:
            status = JobStatus.RUNNING
        runtime_env = RuntimeEnv.deserialize(
            entry.config.runtime_env_info.serialized_runtime_env
        ).to_dict()
        jobs[job_id] = JobDetails(
            job_id=job_id,
            type=JobType.DRIVER,
            status=status,
            entrypoint=entry.entrypoint,
            start_time=entry.start_time,
            end_time=entry.end_time,
            metadata=metadata,
            runtime_env=runtime_env,
            driver_info=driver,
        )

    return jobs, submission_job_drivers
218
+
219
+
220
async def find_job_by_ids(
    gcs_aio_client: GcsAioClient,
    job_info_client: JobInfoStorageClient,
    job_or_submission_id: str,
) -> Optional[JobDetails]:
    """Find the job matching a job id or a submission id.

    Lookup order:
      1. a driver job whose job id equals ``job_or_submission_id``;
      2. a submission job whose driver has that job id;
      3. a submission job whose submission id is ``job_or_submission_id``.

    Returns:
        The matching JobDetails, or None when nothing matches.
    """
    driver_jobs, submission_job_drivers = await get_driver_jobs(
        gcs_aio_client, job_or_submission_id=job_or_submission_id
    )

    # 1. Plain driver job with this job id.
    driver_job = driver_jobs.get(job_or_submission_id)
    if driver_job:
        return driver_job

    # 2. Submission whose driver carries this job id.
    submission_id = None
    for candidate_id, candidate_driver in submission_job_drivers.items():
        if candidate_driver.id == job_or_submission_id:
            submission_id = candidate_id
            break

    # 3. Otherwise treat the given id as a submission id.
    if not submission_id:
        submission_id = job_or_submission_id

    job_info = await job_info_client.get_info(submission_id)
    if not job_info:
        return None

    driver = submission_job_drivers.get(submission_id)
    return JobDetails(
        **dataclasses.asdict(job_info),
        submission_id=submission_id,
        job_id=driver.id if driver else None,
        driver_info=driver,
        type=JobType.SUBMISSION,
    )
263
+
264
+
265
async def find_jobs_by_job_ids(
    gcs_aio_client: GcsAioClient,
    job_info_client: JobInfoStorageClient,
    job_ids: List[str],
) -> Dict[str, JobDetails]:
    """
    Returns a dictionary of jobs with the given job ids, keyed by the job id.

    Both plain driver jobs and submission jobs (looked up through their
    driver's job id) are included. A submission whose job info cannot be
    fetched (``get_info`` returns None) is left out of the result.

    This only accepts job ids and not submission ids.
    """
    driver_jobs, submission_job_drivers = await get_driver_jobs(gcs_aio_client)

    # Set membership keeps the filtering linear in the number of jobs.
    requested_ids = set(job_ids)

    # Filter down to the request job_ids
    driver_jobs = {
        job_id: job for job_id, job in driver_jobs.items() if job_id in requested_ids
    }
    submission_job_drivers = {
        submission_id: driver
        for submission_id, driver in submission_job_drivers.items()
        if driver.id in requested_ids
    }

    # Fetch job details for each submission concurrently; gather() returns
    # the results in the same order as the submission ids.
    submission_ids = list(submission_job_drivers)
    job_infos = await asyncio.gather(
        *[job_info_client.get_info(submission_id) for submission_id in submission_ids]
    )

    submission_jobs = {}
    for submission_id, job_info in zip(submission_ids, job_infos):
        if job_info is None:
            # A driver is registered but the submission record is missing:
            # skip it instead of failing in dataclasses.asdict(None).
            continue
        driver = submission_job_drivers[submission_id]
        submission_jobs[driver.id] = JobDetails(
            **dataclasses.asdict(job_info),
            submission_id=submission_id,
            job_id=driver.id,
            driver_info=driver,
            type=JobType.SUBMISSION,
        )

    return {**driver_jobs, **submission_jobs}
.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/__pycache__/log_utils.cpython-311.pyc ADDED
Binary file (775 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_agent.py ADDED
@@ -0,0 +1,404 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import concurrent.futures
3
+ import io
4
+ import logging
5
+ import os
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ import grpc
10
+
11
+ import ray.dashboard.modules.log.log_consts as log_consts
12
+ import ray.dashboard.modules.log.log_utils as log_utils
13
+ import ray.dashboard.optional_utils as dashboard_optional_utils
14
+ import ray.dashboard.utils as dashboard_utils
15
+ from ray._private.ray_constants import env_integer
16
+ from ray.core.generated import reporter_pb2, reporter_pb2_grpc
17
+
18
+ logger = logging.getLogger(__name__)
19
+ routes = dashboard_optional_utils.DashboardAgentRouteTable
20
+
21
+ # 64 KB
22
+ BLOCK_SIZE = 1 << 16
23
+
24
+ # Keep-alive interval for reading the file
25
+ DEFAULT_KEEP_ALIVE_INTERVAL_SEC = 1
26
+
27
+ RAY_DASHBOARD_LOG_TASK_LOG_SEARCH_MAX_WORKER_COUNT = env_integer(
28
+ "RAY_DASHBOARD_LOG_TASK_LOG_SEARCH_MAX_WORKER_COUNT", default=2
29
+ )
30
+
31
+
32
def find_offset_of_content_in_file(
    file: io.BufferedIOBase, content: bytes, start_offset: int = 0
) -> int:
    """Find the offset of the first occurrence of content in a file.

    The file is scanned in blocks of ``BLOCK_SIZE`` bytes. The last
    ``len(content) - 1`` bytes of each scanned window are carried over to the
    next one so that an occurrence straddling a block boundary is still found.

    Args:
        file: File object
        content: Content to find
        start_offset: Start offset to read from, inclusive.

    Returns:
        Offset of the first occurrence of content in a file, or -1 if the
        content does not occur at or after ``start_offset``.
    """
    logger.debug(f"Finding offset of content {content} in file")
    file.seek(start_offset, io.SEEK_SET)  # move file pointer to the start offset

    # Number of trailing bytes that could be the beginning of a match which
    # completes in the next block.
    overlap = max(len(content) - 1, 0)
    carry = b""
    # File offset of the first byte of `carry + block_data`.
    window_offset = start_offset
    while True:
        # Read in block
        block_data = file.read(BLOCK_SIZE)
        if block_data == b"":
            # Stop reading
            return -1
        window = carry + block_data
        # Find the offset of the first occurrence of content in the window
        match_index = window.find(content)
        if match_index != -1:
            return window_offset + match_index
        # Continue reading, keeping only the tail that may start a match.
        carry = window[max(len(window) - overlap, 0):] if overlap else b""
        window_offset += len(window) - len(carry)
61
+
62
+
63
def find_end_offset_file(file: io.BufferedIOBase) -> int:
    """
    Return the end-of-file offset while leaving the file pointer untouched.

    Args:
        file: File object

    Returns:
        Offset of the end of a file.
    """
    saved_position = file.tell()
    # Jump to the end to learn its offset, then put the pointer back.
    file.seek(0, io.SEEK_END)
    end_position = file.tell()
    file.seek(saved_position, io.SEEK_SET)
    return end_position
78
+
79
+
80
def find_end_offset_next_n_lines_from_offset(
    file: io.BufferedIOBase, start_offset: int, n: int
) -> int:
    """
    Find the offset just past the next ``n`` lines after a start offset.

    Args:
        file: File object
        start_offset: Start offset to read from, inclusive.
        n: Number of lines to find.

    Returns:
        Offset of the end of the last line read (exclusive). Reading stops
        early at the end of the file. When no line could be read at all, the
        end-of-file offset is returned.
    """
    file.seek(start_offset)

    last_line_end = None
    lines_read = 0
    while lines_read < n:
        # readline() consumes the trailing newline; an empty result is EOF.
        if not file.readline():
            break
        last_line_end = file.tell()
        lines_read += 1

    logger.debug(f"Found next {n} lines from {start_offset} offset")
    if last_line_end is None:
        # Nothing was read: fall back to the end of the file.
        return file.seek(0, io.SEEK_END)
    return last_line_end
106
+
107
+
108
def find_start_offset_last_n_lines_from_offset(
    file: io.BufferedIOBase, offset: int, n: int, block_size: int = BLOCK_SIZE
) -> int:
    """
    Find where the last ``n`` lines before an offset begin.

    The file is scanned backwards from ``offset`` one block at a time,
    counting newline characters, until enough lines have been seen or the
    beginning of the file is reached.

    Args:
        file: File object
        offset: Offset the lines are counted back from, -1 means end of file.
            The offset is exclusive, i.e. data at the offset is not included
            in the result.
        n: Number of lines to find
        block_size: Block size to read from file

    Returns:
        Offset of the beginning of the first of the last ``n`` lines that end
        before ``offset``.
    """
    logger.debug(f"Finding last {n} lines from {offset} offset")
    if offset == -1:
        # Resolve "end of file" to a concrete offset.
        offset = file.seek(0, io.SEEK_END)
    else:
        file.seek(offset, io.SEEK_SET)

    if n == 0:
        return offset

    # Bytes between the computed start and `offset`, accumulated block by block.
    tail_byte_count = 0

    # If the byte right before `offset` is not a newline, the data ends with an
    # unterminated line (e.g. "line 1\nline 2"). Count that partial line as one
    # of the requested lines.
    file.seek(max(0, offset - 1), os.SEEK_SET)
    if file.read(1) != b"\n":
        n -= 1

    # Newlines still to be crossed while walking backwards.
    remaining_lines = n
    block_start = max(0, offset - block_size)
    # End of the block being read, so the first (possibly short) block of the
    # file is not read past what was already consumed.
    block_end = offset

    while remaining_lines >= 0 and block_start >= 0:
        file.seek(block_start, 0)
        block = file.read(min(block_size, block_end - block_start))
        newline_count = block.count(b"\n")

        if newline_count > remaining_lines:
            # This block holds more lines than needed: drop the surplus
            # leading lines and keep only the length of what follows them.
            kept = block.split(b"\n", newline_count - remaining_lines)[-1]
            tail_byte_count += len(kept)
            break

        # The whole block belongs to the tail; keep walking backwards.
        remaining_lines -= newline_count
        tail_byte_count += len(block)

        if block_start == 0:
            # Reached the beginning of the file.
            break
        block_end = block_start
        block_start = max(0, block_start - block_size)

    offset_read_start = offset - tail_byte_count
    assert (
        offset_read_start >= 0
    ), f"Read start offset({offset_read_start}) should be non-negative"
    return offset_read_start
181
+
182
+
183
async def _stream_log_in_chunk(
    context: grpc.aio.ServicerContext,
    file: io.BufferedIOBase,
    start_offset: int,
    end_offset: int = -1,
    keep_alive_interval_sec: int = -1,
    block_size: int = BLOCK_SIZE,
):
    """Streaming log in chunk from start to end offset.

    Stream binary file content in chunks from start offset to an end
    offset if provided, else to the end of the file. Nothing is streamed
    when ``end_offset`` is given and is not greater than ``start_offset``.

    Args:
        context: gRPC server side context
        file: Binary file to stream
        start_offset: File offset where streaming starts
        end_offset: If -1, implying streaming til the EOF.
        keep_alive_interval_sec: Duration for which streaming will be
            retried when reaching the file end, -1 means no retry.
        block_size: Number of bytes per chunk, exposed for testing

    Return:
        Async generator of StreamReply
    """
    assert "b" in file.mode, "Only binary file is supported."
    assert not (
        keep_alive_interval_sec >= 0 and end_offset != -1
    ), "Keep-alive is not allowed when specifying an end offset"

    file.seek(start_offset, 0)
    cur_offset = start_offset

    # Until gRPC is done
    while not context.done():
        if end_offset != -1:
            remaining = end_offset - cur_offset
            if remaining <= 0:
                # Requested section [start_offset, end_offset) is exhausted or
                # empty. Never pass a negative size to read(): that would read
                # everything up to the end of the file.
                break
            to_read = min(remaining, block_size)
        else:
            to_read = block_size

        chunk = file.read(to_read)

        if chunk == b"":
            # Reached the current end of the file.
            if keep_alive_interval_sec >= 0:
                await asyncio.sleep(keep_alive_interval_sec)
                # Try reading again
                continue

            # Have read the entire file, done
            break
        logger.debug(f"Sending {len(chunk)} bytes at {cur_offset}")
        yield reporter_pb2.StreamLogReply(data=chunk)

        cur_offset += len(chunk)
242
+
243
+
244
class LogAgent(dashboard_utils.DashboardAgentModule):
    """Agent module that exposes the agent's log directory under ``/logs``."""

    def __init__(self, dashboard_agent):
        super().__init__(dashboard_agent)
        log_utils.register_mimetypes()
        # Serve the log directory as static files, with directory listings.
        log_dir = self._dashboard_agent.log_dir
        routes.static("/logs", log_dir, show_index=True)

    async def run(self, server):
        """Nothing to start: the static route is registered in ``__init__``."""
        return None

    @staticmethod
    def is_minimal_module():
        """Report that this module is not part of the minimal install."""
        return False
256
+
257
+
258
+ _task_log_search_worker_pool = concurrent.futures.ThreadPoolExecutor(
259
+ max_workers=RAY_DASHBOARD_LOG_TASK_LOG_SEARCH_MAX_WORKER_COUNT
260
+ )
261
+
262
+
263
class LogAgentV1Grpc(dashboard_utils.DashboardAgentModule):
    """Agent-side implementation of the ``LogService`` gRPC service.

    Provides listing (``ListLogs``) and streaming (``StreamLog``) of the files
    in the agent's log directory.
    """

    def __init__(self, dashboard_agent):
        super().__init__(dashboard_agent)

    async def run(self, server):
        # Only register the servicer when a gRPC server was provided.
        if server:
            reporter_pb2_grpc.add_LogServiceServicer_to_server(self, server)

    @property
    def node_id(self) -> Optional[str]:
        return self._dashboard_agent.get_node_id()

    @staticmethod
    def is_minimal_module():
        # Dashboard is only available with non-minimal install now.
        return False

    async def ListLogs(self, request, context):
        """
        Lists all files in the active Ray logs directory.

        Entries matching ``request.glob_filter`` are returned relative to the
        log directory; directories get a trailing ``/``.

        Part of `LogService` gRPC.

        NOTE: These RPCs are used by state_head.py, not log_head.py

        Raises:
            FileNotFoundError: If the log directory does not exist.
        """
        path = Path(self._dashboard_agent.log_dir)
        if not path.exists():
            raise FileNotFoundError(
                f"Could not find log dir at path: {self._dashboard_agent.log_dir}. "
                "It is unexpected. Please report an issue to Ray Github."
            )
        log_files = [
            str(p.relative_to(path)) + ("/" if p.is_dir() else "")
            for p in path.glob(request.glob_filter)
        ]
        return reporter_pb2.ListLogsReply(log_files=log_files)

    @classmethod
    def _resolve_filename(cls, root_log_dir: Path, filename: str) -> Path:
        """
        Resolves the file path relative to the root log directory.

        Args:
            root_log_dir: Root log directory.
            filename: File path relative to the root log directory.

        Raises:
            FileNotFoundError: If the file path is invalid, i.e. it is not an
                existing file or it lies outside of the root log directory.

        Returns:
            The absolute file path resolved from the root log directory.
        """
        if not Path(filename).is_absolute():
            filepath = root_log_dir / filename
        else:
            filepath = Path(filename)

        # We want to allow relative paths that include symlinks pointing outside of the
        # `root_log_dir`, so use `os.path.abspath` instead of `Path.resolve()` because
        # `os.path.abspath` does not resolve symlinks.
        filepath = Path(os.path.abspath(filepath))
        # Normalise the root the same way so the containment check below
        # compares like with like.
        root_log_dir = Path(os.path.abspath(root_log_dir))

        if not filepath.is_file():
            raise FileNotFoundError(f"A file is not found at: {filepath}")

        try:
            filepath.relative_to(root_log_dir)
        except ValueError as e:
            raise FileNotFoundError(f"{filepath} not in {root_log_dir}: {e}") from e

        # Fully resolve the path before returning (including following symlinks).
        return filepath.resolve()

    async def StreamLog(self, request, context):
        """
        Streams the content of a log file.

        The stream starts `request.lines` lines before the end offset (1000
        when unset, -1 to disable the cap), but never before
        `request.start_offset`. If `request.keep_alive == True` the file is
        followed in real time; else the stream terminates once there are no
        more bytes to read from the requested section of the log file.

        If the file cannot be resolved, the error is reported through the
        initial metadata under `log_consts.LOG_GRPC_ERROR` and nothing is
        streamed.

        Part of `LogService` gRPC.

        NOTE: These RPCs are used by state_head.py, not log_head.py
        """
        # NOTE: If the client side connection is closed, this handler will
        # be automatically terminated.
        lines = request.lines if request.lines else 1000

        try:
            filepath = self._resolve_filename(
                Path(self._dashboard_agent.log_dir), request.log_file_name
            )
        except FileNotFoundError as e:
            await context.send_initial_metadata([[log_consts.LOG_GRPC_ERROR, str(e)]])
        else:
            with open(filepath, "rb") as f:
                await context.send_initial_metadata([])

                # Default stream entire file
                start_offset = (
                    request.start_offset if request.HasField("start_offset") else 0
                )
                end_offset = (
                    request.end_offset
                    if request.HasField("end_offset")
                    else find_end_offset_file(f)
                )

                if lines != -1:
                    # If specified tail line number, cap the start offset
                    # with lines from the current end offset
                    start_offset = max(
                        find_start_offset_last_n_lines_from_offset(
                            f, offset=end_offset, n=lines
                        ),
                        start_offset,
                    )

                # If keep alive: following the log every 'interval'
                keep_alive_interval_sec = -1
                if request.keep_alive:
                    keep_alive_interval_sec = (
                        request.interval
                        if request.interval
                        else DEFAULT_KEEP_ALIVE_INTERVAL_SEC
                    )

                    # When following (keep_alive), it will read beyond the end
                    end_offset = -1

                logger.info(
                    f"Tailing logs from {start_offset} to {end_offset} for "
                    f"lines={lines}, with keep_alive={keep_alive_interval_sec}"
                )

                # Read and send the file data in chunk
                async for chunk_res in _stream_log_in_chunk(
                    context=context,
                    file=f,
                    start_offset=start_offset,
                    end_offset=end_offset,
                    keep_alive_interval_sec=keep_alive_interval_sec,
                ):
                    yield chunk_res
.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_consts.py ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ MIME_TYPES = {
2
+ "text/plain": [".err", ".out", ".log"],
3
+ }
4
+
5
+ LOG_GRPC_ERROR = "log_grpc_status"
6
+
7
+ # 10 seconds
8
+ GRPC_TIMEOUT = 10
.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_manager.py ADDED
@@ -0,0 +1,481 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import re
3
+ from collections import defaultdict
4
+ from typing import AsyncIterable, Awaitable, Callable, Dict, List, Optional, Tuple
5
+
6
+ from ray import ActorID, NodeID, WorkerID
7
+ from ray._private.pydantic_compat import BaseModel
8
+ from ray.core.generated.gcs_pb2 import ActorTableData
9
+ from ray.dashboard.modules.job.common import JOB_LOGS_PATH_TEMPLATE
10
+ from ray.util.state.common import (
11
+ DEFAULT_RPC_TIMEOUT,
12
+ GetLogOptions,
13
+ protobuf_to_task_state_dict,
14
+ )
15
+ from ray.util.state.exception import DataSourceUnavailable
16
+ from ray.util.state.state_manager import StateDataSourceClient
17
+
18
+ if BaseModel is None:
19
+ raise ModuleNotFoundError("Please install pydantic via `pip install pydantic`.")
20
+
21
+
22
+ logger = logging.getLogger(__name__)
23
+
24
# Worker log names look like worker-[worker_id]-[job_id]-[pid].(out|err).
# Raw string so that `\d` reaches the regex engine instead of being an
# invalid string escape; the pattern itself is unchanged.
WORKER_LOG_PATTERN = re.compile(r".*worker-([0-9a-f]+)-([0-9a-f]+)-(\d+).(out|err)")
25
+
26
+
27
class ResolvedStreamFileInfo(BaseModel):
    """Location of a log file, and the byte range of it, to stream.

    Attributes:
        node_id: Id of the node where the log file is located.
        filename: Path of the log file, either relative to ray's logging
            folder or absolute.
        start_offset: Offset in the log file to start streaming from. None
            means the beginning of the file, or a start determined by the
            number of tail lines requested.
        end_offset: Offset in the log file to stop streaming at. None means
            the end of the file.
    """

    node_id: str
    filename: str
    start_offset: Optional[int]
    end_offset: Optional[int]
41
+
42
+
43
+ class LogsManager:
44
def __init__(self, data_source_client: StateDataSourceClient):
    """Keep the state data source client used for all log lookups."""
    self.client = data_source_client
46
+
47
@property
def data_source_client(self) -> StateDataSourceClient:
    """The state data source client this manager was created with."""
    return self.client
50
+
51
def ip_to_node_id(self, node_ip: Optional[str]):
    """Translate a node ip into a node id.

    The lookup is delegated to the data source client.

    Args:
        node_ip: The node ip.

    Returns:
        node_id if there's a node id that matches the given node ip and is alive.
        None otherwise.
    """
    client = self.client
    return client.ip_to_node_id(node_ip)
62
+
63
async def list_logs(
    self, node_id: str, timeout: int, glob_filter: str = "*"
) -> Dict[str, List[str]]:
    """List the log files of a node that match a glob, grouped by component.

    Args:
        node_id: The node id where log files present.
        timeout: The timeout of the API.
        glob_filter: The glob filter to filter out log files.

    Returns:
        Dictionary of {component_name -> list of log files}

    Raises:
        DataSourceUnavailable: If a source is unresponsive.
    """
    # Fail early when no log agent is registered for this node.
    self._verify_node_registered(node_id)

    reply = await self.client.list_logs(node_id, glob_filter, timeout=timeout)
    listed_files = reply.log_files
    return self._categorize_log_files(listed_files)
82
+
83
async def stream_logs(
    self,
    options: GetLogOptions,
    get_actor_fn: Callable[[ActorID], Awaitable[Optional[ActorTableData]]],
) -> AsyncIterable[bytes]:
    """Generate a stream of logs in bytes.

    The log file to read is first resolved from the identifiers carried by
    ``options``, then streamed from the node that owns it.

    Args:
        options: The option for streaming logs.
        get_actor_fn: Function used to look up actor information; it is
            forwarded to the file name resolution.

    Return:
        Async generator of streamed logs in bytes.
    """
    # Prefer an explicit node id; otherwise derive it from the node ip.
    node_id = options.node_id or self.ip_to_node_id(options.node_ip)

    resolved = await self.resolve_filename(
        node_id=node_id,
        log_filename=options.filename,
        actor_id=options.actor_id,
        task_id=options.task_id,
        attempt_number=options.attempt_number,
        pid=options.pid,
        get_actor_fn=get_actor_fn,
        timeout=options.timeout,
        suffix=options.suffix,
        submission_id=options.submission_id,
    )

    keep_alive = options.media_type == "stream"
    # If we keepalive logs connection, we shouldn't have timeout
    # otherwise the stream will be terminated forcefully
    # after the deadline is expired.
    if keep_alive:
        rpc_timeout = None
    else:
        rpc_timeout = options.timeout

    stream = await self.client.stream_log(
        node_id=resolved.node_id,
        log_file_name=resolved.filename,
        keep_alive=keep_alive,
        lines=options.lines,
        interval=options.interval,
        timeout=rpc_timeout,
        start_offset=resolved.start_offset,
        end_offset=resolved.end_offset,
    )

    async for reply in stream:
        yield reply.data
128
+
129
def _verify_node_registered(self, node_id: str):
    """Check that a log agent is registered for the given node.

    Raises:
        DataSourceUnavailable: If ``node_id`` is not among the registered
            log agent ids.
    """
    registered_ids = self.client.get_all_registered_log_agent_ids()
    if node_id not in registered_ids:
        message = (
            f"Given node id {node_id} is not available. "
            "It's either the node is dead, or it is not registered. "
            "Use `ray list nodes` to see the node status. "
            "If the node is registered, it is highly likely "
            "a transient issue. Try again."
        )
        raise DataSourceUnavailable(message)
    assert node_id is not None
140
+
141
async def _resolve_job_filename(
    self, sub_job_id: str
) -> Tuple[Optional[str], Optional[str]]:
    """Return the node id and log file name for a given job submission id.

    Args:
        sub_job_id: The job submission id.

    Returns:
        A ``(node_id, log_filename)`` tuple, or ``(None, None)`` when no job
        with that submission id is found.

    Raises:
        ValueError: If the job is found but has no driver node id.
    """
    job_infos = await self.client.get_job_info(timeout=DEFAULT_RPC_TIMEOUT)
    target_job = next(
        (job_info for job_info in job_infos if job_info.submission_id == sub_job_id),
        None,
    )
    if target_job is None:
        logger.info(f"Submission job ID {sub_job_id} not found.")
        return None, None

    # Read from the matched job explicitly rather than from a loop variable.
    node_id = target_job.driver_node_id
    if node_id is None:
        raise ValueError(
            f"Job {sub_job_id} has no driver node id info. "
            "This is likely a bug. Please file an issue."
        )

    log_filename = JOB_LOGS_PATH_TEMPLATE.format(submission_id=sub_job_id)
    return node_id, log_filename
169
+
170
async def _resolve_worker_file(
    self,
    node_id_hex: str,
    worker_id_hex: Optional[str],
    pid: Optional[int],
    suffix: str,
    timeout: int,
) -> Optional[str]:
    """Resolve the worker log file for a worker id or a pid on a node.

    Args:
        node_id_hex: Hex id of the node whose logs are listed.
        worker_id_hex: Hex id of the worker, or None when resolving by pid.
        pid: Pid of the worker process, or None when resolving by worker
            id. A numeric string is accepted and compared as an integer.
        suffix: Log file suffix used in the glob filter.
        timeout: Timeout passed to ``self.list_logs``.

    Returns:
        The first matching worker log file name, or None if nothing matches.

    Raises:
        ValueError: If both ``worker_id_hex`` and ``pid`` are provided.
    """
    if worker_id_hex is not None and pid is not None:
        raise ValueError(
            f"Only one of worker id({worker_id_hex}) or pid({pid}) should be "
            "provided."
        )

    lookup_key = worker_id_hex if worker_id_hex is not None else pid
    log_files = await self.list_logs(
        node_id_hex, timeout, glob_filter=f"*{lookup_key}*{suffix}"
    )

    target_pid = None
    if worker_id_hex is None:
        # Callers may hand over the pid as a string; normalize it once so the
        # integer comparison below can actually succeed.
        try:
            target_pid = int(pid)
        except (TypeError, ValueError):
            # A missing or non-numeric pid can never match a worker log file.
            return None

    # Find matching worker logs.
    for filename in [*log_files["worker_out"], *log_files["worker_err"]]:
        # Worker logs look like worker-[worker_id]-[job_id]-[pid].out
        matched = WORKER_LOG_PATTERN.match(filename)
        if matched is None:
            # Categorized as a worker log but not in the expected format.
            continue
        if worker_id_hex is not None:
            if matched.group(1) == worker_id_hex:
                return filename
        elif int(matched.group(3)) == target_pid:
            return filename
    return None
208
+
209
async def _resolve_actor_filename(
    self,
    actor_id: ActorID,
    get_actor_fn: Callable[[ActorID], Awaitable[Optional[ActorTableData]]],
    suffix: str,
    timeout: int,
):
    """Resolve the log file of the worker currently hosting an actor.

    Args:
        actor_id: The actor id.
        get_actor_fn: Awaitable callback returning the actor's data for an id.
        suffix: The suffix of the log file.
        timeout: Timeout in seconds.

    Returns:
        A ``(node_id_hex, log_filename)`` tuple; ``log_filename`` is whatever
        ``_resolve_worker_file`` returns for the actor's worker.

    Raises:
        ValueError: If ``get_actor_fn`` is None, the actor is not found, or
            the actor has no worker id / node id yet.
    """
    if get_actor_fn is None:
        raise ValueError("get_actor_fn needs to be specified for actor_id")

    actor_data = await get_actor_fn(actor_id)
    if actor_data is None:
        raise ValueError(f"Actor ID {actor_id} not found.")

    # TODO(sang): Only the latest worker id can be obtained from
    # actor information now. That means, if actors are restarted,
    # there's no way for us to get the past worker ids.
    raw_worker_id = actor_data.address.worker_id
    if not raw_worker_id:
        raise ValueError(
            f"Worker ID for Actor ID {actor_id} not found. "
            "Actor is not scheduled yet."
        )
    worker_id = WorkerID(raw_worker_id)

    raw_node_id = actor_data.address.raylet_id
    if not raw_node_id:
        raise ValueError(
            f"Node ID for Actor ID {actor_id} not found. "
            "Actor is not scheduled yet."
        )
    node_hex = NodeID(raw_node_id).hex()

    self._verify_node_registered(node_hex)
    resolved_filename = await self._resolve_worker_file(
        node_id_hex=node_hex,
        worker_id_hex=worker_id.hex(),
        pid=None,
        suffix=suffix,
        timeout=timeout,
    )
    return node_hex, resolved_filename
261
+
262
async def _resolve_task_filename(
    self, task_id: str, attempt_number: int, suffix: str, timeout: int
):
    """Resolve the log file and byte range for one attempt of a task.

    Args:
        task_id: The task id.
        attempt_number: The attempt number.
        suffix: The suffix of the log file, e.g. out or err
        timeout: Timeout in seconds.

    Returns:
        A ``(node_id, log_filename, start_offset, end_offset)`` tuple; the
        offsets are read from the task's log info and may be None.

    Raises:
        FileNotFoundError: If the task, the requested attempt, its node id,
            or its log info / log file name cannot be found.
        ValueError: If the suffix is not out or err.
    """
    if suffix not in ["out", "err"]:
        raise ValueError(f"Suffix {suffix} is not supported.")

    reply = await self.client.get_all_task_info(
        filters=[("task_id", "=", task_id)], timeout=timeout
    )
    # Check if the task is found.
    if len(reply.events_by_task) == 0:
        raise FileNotFoundError(
            f"Could not find log file for task: {task_id}"
            f" (attempt {attempt_number}) with suffix: {suffix}"
        )

    task_event = next(
        (t for t in reply.events_by_task if t.attempt_number == attempt_number),
        None,
    )
    if task_event is None:
        raise FileNotFoundError(
            "Could not find log file for task attempt: "
            f"{task_id}({attempt_number})"
        )

    # Get the worker id and node id.
    task = protobuf_to_task_state_dict(task_event)
    worker_id = task.get("worker_id", None)
    node_id = task.get("node_id", None)
    log_info = task.get("task_log_info", None)
    actor_id = task.get("actor_id", None)

    if node_id is None:
        raise FileNotFoundError(
            "Could not find log file for task attempt "
            f"{task_id}({attempt_number}) due to missing node info."
        )

    if log_info is None and actor_id is not None:
        # Actor task with no recorded log info: no per-task file can be
        # resolved, so direct the user to the actor's log instead.
        raise FileNotFoundError(
            "For actor task, please query actor log for "
            f"actor({actor_id}): e.g. ray logs actor --id {actor_id} . Or "
            "set RAY_ENABLE_RECORD_ACTOR_TASK_LOGGING=1 in actor's runtime env "
            "or when starting the cluster. Recording actor task's log could be "
            "expensive, so Ray turns it off by default."
        )
    if log_info is None:
        raise FileNotFoundError(
            "Could not find log file for task attempt: "
            f"{task_id}({attempt_number}). "
            f"Worker id = {worker_id}, node id = {node_id}, "
            f"log_info = {log_info}"
        )

    filename_key = "stdout_file" if suffix == "out" else "stderr_file"
    log_filename = log_info.get(filename_key, None)
    if log_filename is None:
        raise FileNotFoundError(
            f"Missing log filename info in {log_info} for task {task_id}, "
            f"attempt {attempt_number}"
        )

    start_offset = log_info.get(f"std{suffix}_start", None)
    end_offset = log_info.get(f"std{suffix}_end", None)

    return node_id, log_filename, start_offset, end_offset
355
+
356
async def resolve_filename(
    self,
    *,
    node_id: Optional[str] = None,
    log_filename: Optional[str] = None,
    actor_id: Optional[str] = None,
    task_id: Optional[str] = None,
    attempt_number: Optional[int] = None,
    pid: Optional[str] = None,
    get_actor_fn: Optional[
        Callable[[ActorID], Awaitable[Optional[ActorTableData]]]
    ] = None,
    timeout: int = DEFAULT_RPC_TIMEOUT,
    suffix: str = "out",
    submission_id: Optional[str] = None,
) -> ResolvedStreamFileInfo:
    """Resolve which log file (and byte range) the given options refer to.

    The identifiers are tried in this order, and the first truthy one wins:
    ``actor_id``, ``task_id``, ``submission_id``, ``pid``. If none is given,
    ``node_id`` and ``log_filename`` are used as passed in.

    Args:
        node_id: The node's id from which logs are resolved.
        log_filename: Filename of the log file.
        actor_id: Id of the actor that generates the log file.
        task_id: Id of the task that generates the log file.
        attempt_number: Attempt number of the task, used with `task_id`.
        pid: Id of the worker process that generates the log file.
        get_actor_fn: Callback to get the actor's data by id.
        timeout: Timeout for the gRPC to listing logs on the node
            specified by `node_id`.
        suffix: Log suffix if no `log_filename` is provided, when
            resolving by other ids'. Default to "out".
        submission_id: The submission id for a submission job.

    Returns:
        A ``ResolvedStreamFileInfo`` holding the node id, file name and the
        start/end offsets (offsets are only set when resolving by task id).

    Raises:
        ValueError: If ``suffix`` is not "out" or "err", or if ``pid`` is
            given without ``node_id``.
        FileNotFoundError: If no log file name could be resolved.
    """
    if suffix not in ("out", "err"):
        raise ValueError(f"Suffix {suffix} is not supported. ")

    start_offset = end_offset = None

    # TODO(rickyx): We should make sure we do some sort of checking on the log
    # filename
    if actor_id:
        resolved = await self._resolve_actor_filename(
            ActorID.from_hex(actor_id), get_actor_fn, suffix, timeout
        )
        node_id, log_filename = resolved
    elif task_id:
        resolved = await self._resolve_task_filename(
            task_id, attempt_number, suffix, timeout
        )
        node_id, log_filename, start_offset, end_offset = resolved
    elif submission_id:
        resolved = await self._resolve_job_filename(submission_id)
        node_id, log_filename = resolved
    elif pid:
        if node_id is None:
            raise ValueError(
                "Node id needs to be specified for resolving"
                f" filenames of pid {pid}"
            )
        self._verify_node_registered(node_id)
        log_filename = await self._resolve_worker_file(
            node_id_hex=node_id,
            worker_id_hex=None,
            pid=pid,
            suffix=suffix,
            timeout=timeout,
        )

    if log_filename is None:
        not_found_details = (
            "Could not find a log file. Please make sure the given "
            "option exists in the cluster.\n"
            f"\tnode_id: {node_id}\n"
            f"\tfilename: {log_filename}\n"
            f"\tactor_id: {actor_id}\n"
            f"\ttask_id: {task_id}\n"
            f"\tpid: {pid}\n"
            f"\tsuffix: {suffix}\n"
            f"\tsubmission_id: {submission_id}\n"
            f"\tattempt_number: {attempt_number}\n"
        )
        raise FileNotFoundError(not_found_details)

    res = ResolvedStreamFileInfo(
        node_id=node_id,
        filename=log_filename,
        start_offset=start_offset,
        end_offset=end_offset,
    )
    logger.info(f"Resolved log file: {res}")
    return res
449
+
450
+ def _categorize_log_files(self, log_files: List[str]) -> Dict[str, List[str]]:
451
+ """Categorize the given log files after filterieng them out using a given glob.
452
+
453
+ Returns:
454
+ Dictionary of {component_name -> list of log files}
455
+ """
456
+ result = defaultdict(list)
457
+ for log_file in log_files:
458
+ if "worker" in log_file and (log_file.endswith(".out")):
459
+ result["worker_out"].append(log_file)
460
+ elif "worker" in log_file and (log_file.endswith(".err")):
461
+ result["worker_err"].append(log_file)
462
+ elif "core-worker" in log_file and log_file.endswith(".log"):
463
+ result["core_worker"].append(log_file)
464
+ elif "core-driver" in log_file and log_file.endswith(".log"):
465
+ result["driver"].append(log_file)
466
+ elif "raylet." in log_file:
467
+ result["raylet"].append(log_file)
468
+ elif "gcs_server." in log_file:
469
+ result["gcs_server"].append(log_file)
470
+ elif "log_monitor" in log_file:
471
+ result["internal"].append(log_file)
472
+ elif "monitor" in log_file:
473
+ result["autoscaler"].append(log_file)
474
+ elif "agent." in log_file:
475
+ result["agent"].append(log_file)
476
+ elif "dashboard." in log_file:
477
+ result["dashboard"].append(log_file)
478
+ else:
479
+ result["internal"].append(log_file)
480
+
481
+ return result
.venv/lib/python3.11/site-packages/ray/dashboard/modules/log/log_utils.py ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ import mimetypes
2
+
3
+ import ray.dashboard.modules.log.log_consts as log_consts
4
+
5
+
6
def register_mimetypes():
    """Register every MIME type in ``log_consts.MIME_TYPES`` with ``mimetypes``.

    ``log_consts.MIME_TYPES`` is iterated as a mapping of MIME type to an
    iterable of file extensions; each pair is passed to
    ``mimetypes.add_type``.
    """
    type_extension_pairs = (
        (mime_type, extension)
        for mime_type, extensions in log_consts.MIME_TYPES.items()
        for extension in extensions
    )
    for mime_type, extension in type_extension_pairs:
        mimetypes.add_type(mime_type, extension)
.venv/lib/python3.11/site-packages/ray/dashboard/modules/node/__pycache__/node_head.cpython-311.pyc ADDED
Binary file (24.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (200 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/sdk.cpython-311.pyc ADDED
Binary file (4.12 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/serve_agent.cpython-311.pyc ADDED
Binary file (765 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/serve_head.cpython-311.pyc ADDED
Binary file (729 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/serve/__pycache__/serve_rest_api_impl.cpython-311.pyc ADDED
Binary file (13.7 kB). View file