koichi12 committed on
Commit
b75420e
·
verified ·
1 Parent(s): 48635e5

Add files using upload-large-folder tool

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/consts.cpython-311.pyc +0 -0
  2. .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/dashboard.cpython-311.pyc +0 -0
  3. .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/dashboard_metrics.cpython-311.pyc +0 -0
  4. .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/datacenter.cpython-311.pyc +0 -0
  5. .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/http_server_agent.cpython-311.pyc +0 -0
  6. .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/http_server_head.cpython-311.pyc +0 -0
  7. .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/k8s_utils.cpython-311.pyc +0 -0
  8. .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/memory_utils.cpython-311.pyc +0 -0
  9. .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/optional_utils.cpython-311.pyc +0 -0
  10. .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/routes.cpython-311.pyc +0 -0
  11. .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/state_api_utils.cpython-311.pyc +0 -0
  12. .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/timezone_utils.cpython-311.pyc +0 -0
  13. .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-100.c2aa4ab115bf9c6057cb.woff2 +0 -0
  14. .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-100italic.7f839a8652da29745ce4.woff2 +0 -0
  15. .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300.37a7069dc30fc663c878.woff2 +0 -0
  16. .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300italic.bd5b7a13f2c52b531a2a.woff +0 -0
  17. .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300italic.c64e7e354c88e613c77c.woff2 +0 -0
  18. .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-500.f5b74d7ffcdf85b9dd60.woff2 +0 -0
  19. .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-500italic.0d8bb5b3ee5f5dac9e44.woff2 +0 -0
  20. .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-700.c18ee39fb002ad58b6dc.woff2 +0 -0
  21. .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-700italic.7d8125ff7f707231fd89.woff2 +0 -0
  22. .venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__init__.py +0 -0
  23. .venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/__init__.cpython-311.pyc +0 -0
  24. .venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/actor_consts.cpython-311.pyc +0 -0
  25. .venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/actor_head.cpython-311.pyc +0 -0
  26. .venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/actor_consts.py +5 -0
  27. .venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/actor_head.py +290 -0
  28. .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__init__.py +0 -0
  29. .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/__init__.cpython-311.pyc +0 -0
  30. .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_agent.cpython-311.pyc +0 -0
  31. .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_consts.cpython-311.pyc +0 -0
  32. .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_head.cpython-311.pyc +0 -0
  33. .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_utils.cpython-311.pyc +0 -0
  34. .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_agent.py +133 -0
  35. .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_consts.py +21 -0
  36. .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_head.py +212 -0
  37. .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__init__.py +0 -0
  38. .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/__init__.cpython-311.pyc +0 -0
  39. .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/grafana_dashboard_factory.cpython-311.pyc +0 -0
  40. .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/metrics_head.cpython-311.pyc +0 -0
  41. .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/templates.cpython-311.pyc +0 -0
  42. .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/common.py +70 -0
  43. .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/data_dashboard_panels.py +551 -0
  44. .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py +478 -0
  45. .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_dashboard_panels.py +420 -0
  46. .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_deployment_dashboard_panels.py +259 -0
  47. .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_deployment_grafana_dashboard_base.json +223 -0
  48. .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_grafana_dashboard_base.json +188 -0
  49. .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/export/prometheus/prometheus.yml +12 -0
  50. .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/grafana_dashboard_factory.py +301 -0
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/consts.cpython-311.pyc ADDED
Binary file (3.41 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/dashboard.cpython-311.pyc ADDED
Binary file (10.2 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/dashboard_metrics.cpython-311.pyc ADDED
Binary file (4.28 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/datacenter.cpython-311.pyc ADDED
Binary file (14 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/http_server_agent.cpython-311.pyc ADDED
Binary file (4.98 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/http_server_head.cpython-311.pyc ADDED
Binary file (16.5 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/k8s_utils.cpython-311.pyc ADDED
Binary file (5.35 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/memory_utils.cpython-311.pyc ADDED
Binary file (24.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/optional_utils.cpython-311.pyc ADDED
Binary file (9.41 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/routes.cpython-311.pyc ADDED
Binary file (10.7 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/state_api_utils.cpython-311.pyc ADDED
Binary file (11.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/timezone_utils.cpython-311.pyc ADDED
Binary file (3.34 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-100.c2aa4ab115bf9c6057cb.woff2 ADDED
Binary file (15.8 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-100italic.7f839a8652da29745ce4.woff2 ADDED
Binary file (17 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300.37a7069dc30fc663c878.woff2 ADDED
Binary file (15.8 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300italic.bd5b7a13f2c52b531a2a.woff ADDED
Binary file (22.2 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300italic.c64e7e354c88e613c77c.woff2 ADDED
Binary file (17.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-500.f5b74d7ffcdf85b9dd60.woff2 ADDED
Binary file (15.9 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-500italic.0d8bb5b3ee5f5dac9e44.woff2 ADDED
Binary file (17.3 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-700.c18ee39fb002ad58b6dc.woff2 ADDED
Binary file (15.8 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-700italic.7d8125ff7f707231fd89.woff2 ADDED
Binary file (17 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (200 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/actor_consts.cpython-311.pyc ADDED
Binary file (459 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/actor_head.cpython-311.pyc ADDED
Binary file (13.9 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/actor_consts.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ import ray
2
+
3
+ ACTOR_CHANNEL = "ACTOR"
4
+ NIL_NODE_ID = ray.NodeID.nil().hex()
5
+ RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS = 1
.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/actor_head.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import logging
3
+ from collections import defaultdict, deque
4
+ from concurrent.futures import ThreadPoolExecutor
5
+ from typing import Any, Dict
6
+
7
+ import aiohttp.web
8
+
9
+ import ray
10
+ import ray.dashboard.optional_utils as dashboard_optional_utils
11
+ import ray.dashboard.utils as dashboard_utils
12
+ from ray._private.gcs_pubsub import GcsAioActorSubscriber
13
+ from ray._private.utils import get_or_create_event_loop
14
+ from ray.dashboard.consts import GCS_RPC_TIMEOUT_SECONDS
15
+ from ray.dashboard.datacenter import DataOrganizer, DataSource
16
+ from ray.dashboard.modules.actor import actor_consts
17
+
18
+ logger = logging.getLogger(__name__)
19
+ routes = dashboard_optional_utils.DashboardHeadRouteTable
20
+
21
+ MAX_DESTROYED_ACTORS_TO_CACHE = max(
22
+ 0, ray._config.maximum_gcs_destroyed_actor_cached_count()
23
+ )
24
+
25
+ ACTOR_CLEANUP_FREQUENCY = 1 # seconds
26
+
27
+
28
+ ACTOR_TABLE_STATE_COLUMNS = (
29
+ "state",
30
+ "address",
31
+ "numRestarts",
32
+ "timestamp",
33
+ "pid",
34
+ "exitDetail",
35
+ "startTime",
36
+ "endTime",
37
+ "reprName",
38
+ )
39
+
40
+
41
+ def actor_table_data_to_dict(message):
42
+ orig_message = dashboard_utils.message_to_dict(
43
+ message,
44
+ {
45
+ "actorId",
46
+ "parentId",
47
+ "jobId",
48
+ "workerId",
49
+ "rayletId",
50
+ "callerId",
51
+ "taskId",
52
+ "parentTaskId",
53
+ "sourceActorId",
54
+ "placementGroupId",
55
+ },
56
+ always_print_fields_with_no_presence=True,
57
+ )
58
+ # The complete schema for actor table is here:
59
+ # src/ray/protobuf/gcs.proto
60
+ # It is super big and for dashboard, we don't need that much information.
61
+ # Only preserve the necessary ones here for memory usage.
62
+ fields = {
63
+ "actorId",
64
+ "jobId",
65
+ "pid",
66
+ "address",
67
+ "state",
68
+ "name",
69
+ "numRestarts",
70
+ "timestamp",
71
+ "className",
72
+ "startTime",
73
+ "endTime",
74
+ "reprName",
75
+ "placementGroupId",
76
+ "callSite",
77
+ }
78
+ light_message = {k: v for (k, v) in orig_message.items() if k in fields}
79
+ light_message["actorClass"] = orig_message["className"]
80
+ exit_detail = "-"
81
+ if "deathCause" in orig_message:
82
+ context = orig_message["deathCause"]
83
+ if "actorDiedErrorContext" in context:
84
+ exit_detail = context["actorDiedErrorContext"]["errorMessage"] # noqa
85
+ elif "runtimeEnvFailedContext" in context:
86
+ exit_detail = context["runtimeEnvFailedContext"]["errorMessage"] # noqa
87
+ elif "actorUnschedulableContext" in context:
88
+ exit_detail = context["actorUnschedulableContext"]["errorMessage"] # noqa
89
+ elif "creationTaskFailureContext" in context:
90
+ exit_detail = context["creationTaskFailureContext"][
91
+ "formattedExceptionString"
92
+ ] # noqa
93
+ light_message["exitDetail"] = exit_detail
94
+ light_message["startTime"] = int(light_message["startTime"])
95
+ light_message["endTime"] = int(light_message["endTime"])
96
+ light_message["requiredResources"] = dict(message.required_resources)
97
+
98
+ return light_message
99
+
100
+
101
+ class ActorHead(dashboard_utils.DashboardHeadModule):
102
+ def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig):
103
+ super().__init__(config)
104
+
105
+ self._gcs_actor_channel_subscriber = None
106
+ # A queue of dead actors in order of when they died
107
+ self.destroyed_actors_queue = deque()
108
+
109
+ # -- Internal state --
110
+ self._loop = get_or_create_event_loop()
111
+ # NOTE: This executor is intentionally constrained to just 1 thread to
112
+ # limit its concurrency, therefore reducing potential for GIL contention
113
+ self._executor = ThreadPoolExecutor(
114
+ max_workers=1, thread_name_prefix="actor_head_executor"
115
+ )
116
+
117
+ async def _update_actors(self):
118
+ """
119
+ Processes actor info. First gets all actors from GCS, then subscribes to
120
+ actor updates. For each actor update, updates DataSource.node_actors and
121
+ DataSource.actors.
122
+ """
123
+
124
+ # To prevent Time-of-check to time-of-use issue [1], the get-all-actor-info
125
+ # happens after the subscription. That is, an update between get-all-actor-info
126
+ # and the subscription is not missed.
127
+ #
128
+ # [1] https://en.wikipedia.org/wiki/Time-of-check_to_time-of-use
129
+ gcs_addr = self.gcs_address
130
+ actor_channel_subscriber = GcsAioActorSubscriber(address=gcs_addr)
131
+ await actor_channel_subscriber.subscribe()
132
+
133
+ # Get all actor info.
134
+ while True:
135
+ try:
136
+ logger.info("Getting all actor info from GCS.")
137
+
138
+ actor_dicts = await self._get_all_actors()
139
+ # Update actors
140
+ DataSource.actors.reset(actor_dicts)
141
+
142
+ # Update node actors and job actors.
143
+ node_actors = defaultdict(dict)
144
+ for actor_id_bytes, updated_actor_table in actor_dicts.items():
145
+ node_id = updated_actor_table["address"]["rayletId"]
146
+ # Update only when node_id is not Nil.
147
+ if node_id != actor_consts.NIL_NODE_ID:
148
+ node_actors[node_id][actor_id_bytes] = updated_actor_table
149
+
150
+ # Update node's actor info
151
+ DataSource.node_actors.reset(node_actors)
152
+
153
+ logger.info("Received %d actor info from GCS.", len(actor_dicts))
154
+
155
+ # Break, once all initial actors are successfully fetched
156
+ break
157
+ except Exception as e:
158
+ logger.exception("Error Getting all actor info from GCS", exc_info=e)
159
+ await asyncio.sleep(
160
+ actor_consts.RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS
161
+ )
162
+
163
+ # Pull incremental updates from the GCS channel
164
+ while True:
165
+ try:
166
+ updated_actor_table_entries = await self._poll_updated_actor_table_data(
167
+ actor_channel_subscriber
168
+ )
169
+
170
+ for (
171
+ actor_id,
172
+ updated_actor_table,
173
+ ) in updated_actor_table_entries.items():
174
+ self._process_updated_actor_table(actor_id, updated_actor_table)
175
+
176
+ # TODO emit metrics
177
+ logger.debug(
178
+ f"Total events processed: {len(updated_actor_table_entries)}, "
179
+ f"queue size: {actor_channel_subscriber.queue_size}"
180
+ )
181
+
182
+ except Exception as e:
183
+ logger.exception("Error processing actor info from GCS.", exc_info=e)
184
+
185
+ async def _poll_updated_actor_table_data(
186
+ self, actor_channel_subscriber: GcsAioActorSubscriber
187
+ ) -> Dict[str, Dict[str, Any]]:
188
+ # TODO make batch size configurable
189
+ batch = await actor_channel_subscriber.poll(batch_size=200)
190
+
191
+ # NOTE: We're offloading conversion to a TPE to make sure we're not
192
+ # blocking the event-loop for prolonged period of time irrespective
193
+ # of the batch size
194
+ def _convert_to_dict():
195
+ return {
196
+ actor_id_bytes.hex(): actor_table_data_to_dict(actor_table_data_message)
197
+ for actor_id_bytes, actor_table_data_message in batch
198
+ if actor_id_bytes is not None
199
+ }
200
+
201
+ return await self._loop.run_in_executor(self._executor, _convert_to_dict)
202
+
203
+ def _process_updated_actor_table(
204
+ self, actor_id: str, actor_table_data: Dict[str, Any]
205
+ ):
206
+ """NOTE: This method has to be executed on the event-loop, provided that it
207
+ accesses DataSource data structures (to follow its thread-safety model)"""
208
+
209
+ # If actor is not new registered but updated, we only update
210
+ # states related fields.
211
+ actor = DataSource.actors.get(actor_id)
212
+
213
+ if actor and actor_table_data["state"] != "DEPENDENCIES_UNREADY":
214
+ for k in ACTOR_TABLE_STATE_COLUMNS:
215
+ if k in actor_table_data:
216
+ actor[k] = actor_table_data[k]
217
+ actor_table_data = actor
218
+
219
+ actor_id = actor_table_data["actorId"]
220
+ node_id = actor_table_data["address"]["rayletId"]
221
+
222
+ if actor_table_data["state"] == "DEAD":
223
+ self.destroyed_actors_queue.append(actor_id)
224
+
225
+ # Update actors.
226
+ DataSource.actors[actor_id] = actor_table_data
227
+ # Update node actors (only when node_id is not Nil).
228
+ if node_id != actor_consts.NIL_NODE_ID:
229
+ node_actors = DataSource.node_actors.get(node_id, {})
230
+ node_actors[actor_id] = actor_table_data
231
+ DataSource.node_actors[node_id] = node_actors
232
+
233
+ async def _get_all_actors(self) -> Dict[str, dict]:
234
+ actors = await self.gcs_aio_client.get_all_actor_info(
235
+ timeout=GCS_RPC_TIMEOUT_SECONDS
236
+ )
237
+
238
+ # NOTE: We're offloading conversion to a TPE to make sure we're not
239
+ # blocking the event-loop for prolonged period of time for large clusters
240
+ def _convert_to_dict():
241
+ return {
242
+ actor_id.hex(): actor_table_data_to_dict(actor_table_data)
243
+ for actor_id, actor_table_data in actors.items()
244
+ }
245
+
246
+ return await self._loop.run_in_executor(self._executor, _convert_to_dict)
247
+
248
+ async def _cleanup_actors(self):
249
+ while True:
250
+ try:
251
+ while len(self.destroyed_actors_queue) > MAX_DESTROYED_ACTORS_TO_CACHE:
252
+ actor_id = self.destroyed_actors_queue.popleft()
253
+ if actor_id in DataSource.actors:
254
+ actor = DataSource.actors.pop(actor_id)
255
+ node_id = actor["address"].get("rayletId")
256
+ if node_id and node_id != actor_consts.NIL_NODE_ID:
257
+ del DataSource.node_actors[node_id][actor_id]
258
+ await asyncio.sleep(ACTOR_CLEANUP_FREQUENCY)
259
+ except Exception:
260
+ logger.exception("Error cleaning up actor info from GCS.")
261
+
262
+ @routes.get("/logical/actors")
263
+ @dashboard_optional_utils.aiohttp_cache
264
+ async def get_all_actors(self, req) -> aiohttp.web.Response:
265
+ actors = await DataOrganizer.get_actor_infos()
266
+ return dashboard_optional_utils.rest_response(
267
+ success=True,
268
+ message="All actors fetched.",
269
+ actors=actors,
270
+ # False to avoid converting Ray resource name to google style.
271
+ # It's not necessary here because the fields are already
272
+ # google formatted when protobuf was converted into dict.
273
+ convert_google_style=False,
274
+ )
275
+
276
+ @routes.get("/logical/actors/{actor_id}")
277
+ @dashboard_optional_utils.aiohttp_cache
278
+ async def get_actor(self, req) -> aiohttp.web.Response:
279
+ actor_id = req.match_info.get("actor_id")
280
+ actors = await DataOrganizer.get_actor_infos(actor_ids=[actor_id])
281
+ return dashboard_optional_utils.rest_response(
282
+ success=True, message="Actor details fetched.", detail=actors[actor_id]
283
+ )
284
+
285
+ async def run(self, server):
286
+ await asyncio.gather(self._update_actors(), self._cleanup_actors())
287
+
288
+ @staticmethod
289
+ def is_minimal_module():
290
+ return False
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (200 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_agent.cpython-311.pyc ADDED
Binary file (8.09 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_consts.cpython-311.pyc ADDED
Binary file (1.09 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_head.cpython-311.pyc ADDED
Binary file (12.2 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_utils.cpython-311.pyc ADDED
Binary file (11.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_agent.py ADDED
@@ -0,0 +1,133 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import logging
3
+ import os
4
+ import time
5
+ from concurrent.futures import ThreadPoolExecutor
6
+ from typing import Union
7
+
8
+ import ray._private.ray_constants as ray_constants
9
+ import ray._private.utils as utils
10
+ import ray.dashboard.consts as dashboard_consts
11
+ import ray.dashboard.utils as dashboard_utils
12
+ from ray.core.generated import event_pb2, event_pb2_grpc
13
+ from ray.dashboard.modules.event import event_consts
14
+ from ray.dashboard.modules.event.event_utils import monitor_events
15
+ from ray.dashboard.utils import async_loop_forever, create_task
16
+
17
+ logger = logging.getLogger(__name__)
18
+
19
+
20
+ # NOTE: Executor in this head is intentionally constrained to just 1 thread by
21
+ # default to limit its concurrency, therefore reducing potential for
22
+ # GIL contention
23
+ RAY_DASHBOARD_EVENT_AGENT_TPE_MAX_WORKERS = ray_constants.env_integer(
24
+ "RAY_DASHBOARD_EVENT_AGENT_TPE_MAX_WORKERS", 1
25
+ )
26
+
27
+
28
+ class EventAgent(dashboard_utils.DashboardAgentModule):
29
+ def __init__(self, dashboard_agent):
30
+ super().__init__(dashboard_agent)
31
+ self._event_dir = os.path.join(self._dashboard_agent.log_dir, "events")
32
+ os.makedirs(self._event_dir, exist_ok=True)
33
+ self._monitor: Union[asyncio.Task, None] = None
34
+ self._stub: Union[event_pb2_grpc.ReportEventServiceStub, None] = None
35
+ self._cached_events = asyncio.Queue(event_consts.EVENT_AGENT_CACHE_SIZE)
36
+ self._gcs_aio_client = dashboard_agent.gcs_aio_client
37
+ # Total number of event created from this agent.
38
+ self.total_event_reported = 0
39
+ # Total number of event report request sent.
40
+ self.total_request_sent = 0
41
+ self.module_started = time.monotonic()
42
+
43
+ self._executor = ThreadPoolExecutor(
44
+ max_workers=RAY_DASHBOARD_EVENT_AGENT_TPE_MAX_WORKERS,
45
+ thread_name_prefix="event_agent_executor",
46
+ )
47
+
48
+ logger.info("Event agent cache buffer size: %s", self._cached_events.maxsize)
49
+
50
+ async def _connect_to_dashboard(self):
51
+ """Connect to the dashboard. If the dashboard is not started, then
52
+ this method will never returns.
53
+
54
+ Returns:
55
+ The ReportEventServiceStub object.
56
+ """
57
+ while True:
58
+ try:
59
+ dashboard_rpc_address = await self._gcs_aio_client.internal_kv_get(
60
+ dashboard_consts.DASHBOARD_RPC_ADDRESS.encode(),
61
+ namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
62
+ timeout=1,
63
+ )
64
+ dashboard_rpc_address = dashboard_rpc_address.decode()
65
+ if dashboard_rpc_address:
66
+ logger.info("Report events to %s", dashboard_rpc_address)
67
+ options = ray_constants.GLOBAL_GRPC_OPTIONS
68
+ channel = utils.init_grpc_channel(
69
+ dashboard_rpc_address, options=options, asynchronous=True
70
+ )
71
+ return event_pb2_grpc.ReportEventServiceStub(channel)
72
+ except Exception:
73
+ logger.exception("Connect to dashboard failed.")
74
+ await asyncio.sleep(
75
+ event_consts.RETRY_CONNECT_TO_DASHBOARD_INTERVAL_SECONDS
76
+ )
77
+
78
+ @async_loop_forever(event_consts.EVENT_AGENT_REPORT_INTERVAL_SECONDS)
79
+ async def report_events(self):
80
+ """Report events from cached events queue. Reconnect to dashboard if
81
+ report failed. Log error after retry EVENT_AGENT_RETRY_TIMES.
82
+
83
+ This method will never returns.
84
+ """
85
+ data = await self._cached_events.get()
86
+ self.total_event_reported += len(data)
87
+ for _ in range(event_consts.EVENT_AGENT_RETRY_TIMES):
88
+ try:
89
+ logger.debug("Report %s events.", len(data))
90
+ request = event_pb2.ReportEventsRequest(event_strings=data)
91
+ await self._stub.ReportEvents(request)
92
+ self.total_request_sent += 1
93
+ break
94
+ except Exception:
95
+ logger.exception("Report event failed, reconnect to the " "dashboard.")
96
+ self._stub = await self._connect_to_dashboard()
97
+ else:
98
+ data_str = str(data)
99
+ limit = event_consts.LOG_ERROR_EVENT_STRING_LENGTH_LIMIT
100
+ logger.error(
101
+ "Report event failed: %s",
102
+ data_str[:limit] + (data_str[limit:] and "..."),
103
+ )
104
+
105
+ async def get_internal_states(self):
106
+ if self.total_event_reported <= 0 or self.total_request_sent <= 0:
107
+ return
108
+
109
+ elapsed = time.monotonic() - self.module_started
110
+ return {
111
+ "total_events_reported": self.total_event_reported,
112
+ "Total_report_request": self.total_request_sent,
113
+ "queue_size": self._cached_events.qsize(),
114
+ "total_uptime": elapsed,
115
+ }
116
+
117
+ async def run(self, server):
118
+ # Connect to dashboard.
119
+ self._stub = await self._connect_to_dashboard()
120
+ # Start monitor task.
121
+ self._monitor = monitor_events(
122
+ self._event_dir,
123
+ lambda data: create_task(self._cached_events.put(data)),
124
+ self._executor,
125
+ )
126
+
127
+ await asyncio.gather(
128
+ self.report_events(),
129
+ )
130
+
131
+ @staticmethod
132
+ def is_minimal_module():
133
+ return False
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_consts.py ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ray._private.ray_constants import env_float, env_integer
2
+ from ray.core.generated import event_pb2
3
+
4
+ LOG_ERROR_EVENT_STRING_LENGTH_LIMIT = 1000
5
+ RETRY_CONNECT_TO_DASHBOARD_INTERVAL_SECONDS = 2
6
+ # Monitor events
7
+ SCAN_EVENT_DIR_INTERVAL_SECONDS = env_integer("SCAN_EVENT_DIR_INTERVAL_SECONDS", 2)
8
+ SCAN_EVENT_START_OFFSET_SECONDS = -30 * 60
9
+ CONCURRENT_READ_LIMIT = 50
10
+ EVENT_READ_LINE_COUNT_LIMIT = 200
11
+ EVENT_READ_LINE_LENGTH_LIMIT = env_integer(
12
+ "EVENT_READ_LINE_LENGTH_LIMIT", 2 * 1024 * 1024
13
+ ) # 2MB
14
+ # Report events
15
+ EVENT_AGENT_REPORT_INTERVAL_SECONDS = env_float(
16
+ "EVENT_AGENT_REPORT_INTERVAL_SECONDS", 0.1
17
+ )
18
+ EVENT_AGENT_RETRY_TIMES = 10
19
+ EVENT_AGENT_CACHE_SIZE = 10240
20
+ # Event sources
21
+ EVENT_SOURCE_ALL = event_pb2.Event.SourceType.keys()
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_head.py ADDED
@@ -0,0 +1,212 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import asyncio
2
+ import logging
3
+ import os
4
+ import time
5
+ from collections import OrderedDict, defaultdict
6
+ from concurrent.futures import ThreadPoolExecutor
7
+ from datetime import datetime
8
+ from itertools import islice
9
+ from typing import Dict, Union
10
+
11
+ import aiohttp.web
12
+
13
+ import ray.dashboard.optional_utils as dashboard_optional_utils
14
+ import ray.dashboard.utils as dashboard_utils
15
+ from ray._private.ray_constants import env_integer
16
+ from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag
17
+ from ray._private.utils import get_or_create_event_loop
18
+ from ray.core.generated import event_pb2, event_pb2_grpc
19
+ from ray.dashboard.consts import (
20
+ RAY_STATE_SERVER_MAX_HTTP_REQUEST,
21
+ RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED,
22
+ RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME,
23
+ )
24
+ from ray.dashboard.modules.event.event_utils import monitor_events, parse_event_strings
25
+ from ray.dashboard.state_api_utils import do_filter, handle_list_api
26
+ from ray.util.state.common import ClusterEventState, ListApiOptions, ListApiResponse
27
+
28
+ logger = logging.getLogger(__name__)
29
+ routes = dashboard_optional_utils.DashboardHeadRouteTable
30
+
31
+ JobEvents = OrderedDict
32
+ dashboard_utils._json_compatible_types.add(JobEvents)
33
+
34
+ MAX_EVENTS_TO_CACHE = int(os.environ.get("RAY_DASHBOARD_MAX_EVENTS_TO_CACHE", 10000))
35
+
36
+ # NOTE: Executor in this head is intentionally constrained to just 1 thread by
37
+ # default to limit its concurrency, therefore reducing potential for
38
+ # GIL contention
39
+ RAY_DASHBOARD_EVENT_HEAD_TPE_MAX_WORKERS = env_integer(
40
+ "RAY_DASHBOARD_EVENT_HEAD_TPE_MAX_WORKERS", 1
41
+ )
42
+
43
+
44
+ async def _list_cluster_events_impl(
45
+ *, all_events, executor: ThreadPoolExecutor, option: ListApiOptions
46
+ ) -> ListApiResponse:
47
+ """
48
+ List all cluster events from the cluster. Made a free function to allow unit tests.
49
+
50
+ Returns:
51
+ A list of cluster events in the cluster.
52
+ The schema of returned "dict" is equivalent to the
53
+ `ClusterEventState` protobuf message.
54
+ """
55
+
56
+ def transform(all_events) -> ListApiResponse:
57
+ result = []
58
+ for _, events in all_events.items():
59
+ for _, event in events.items():
60
+ event["time"] = str(datetime.fromtimestamp(int(event["timestamp"])))
61
+ result.append(event)
62
+
63
+ num_after_truncation = len(result)
64
+ result.sort(key=lambda entry: entry["timestamp"])
65
+ total = len(result)
66
+ result = do_filter(result, option.filters, ClusterEventState, option.detail)
67
+ num_filtered = len(result)
68
+ # Sort to make the output deterministic.
69
+ result = list(islice(result, option.limit))
70
+ return ListApiResponse(
71
+ result=result,
72
+ total=total,
73
+ num_after_truncation=num_after_truncation,
74
+ num_filtered=num_filtered,
75
+ )
76
+
77
+ return await get_or_create_event_loop().run_in_executor(
78
+ executor, transform, all_events
79
+ )
80
+
81
+
82
class EventHead(
    dashboard_utils.DashboardHeadModule,
    dashboard_utils.RateLimitedModule,
    event_pb2_grpc.ReportEventServiceServicer,
):
    """Dashboard head module that collects Ray events and serves them over HTTP.

    Events arrive via the gRPC ``ReportEvents`` endpoint and from the event
    directory monitor started in ``run()``. They are cached in memory, grouped
    by job id, and exposed through the ``/events`` and
    ``/api/v0/cluster_events`` routes.
    """

    def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig):
        dashboard_utils.DashboardHeadModule.__init__(self, config)
        # Cap concurrent HTTP calls at the smaller of the configured limit and
        # the hard allowed maximum.
        dashboard_utils.RateLimitedModule.__init__(
            self,
            min(
                RAY_STATE_SERVER_MAX_HTTP_REQUEST,
                RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED,
            ),
        )
        self._event_dir = os.path.join(self.log_dir, "events")
        os.makedirs(self._event_dir, exist_ok=True)
        # Set by run() to the task returned by monitor_events(); None until then.
        self._monitor: Union[asyncio.Task, None] = None
        # Counters reported by _periodic_state_print().
        self.total_report_events_count = 0
        self.total_events_received = 0
        self.module_started = time.monotonic()
        # {job_id hex(str): {event_id (str): event (dict)}}
        self.events: Dict[str, JobEvents] = defaultdict(JobEvents)

        # Single worker by default (RAY_DASHBOARD_EVENT_HEAD_TPE_MAX_WORKERS)
        # to limit GIL contention from the background transformations.
        self._executor = ThreadPoolExecutor(
            max_workers=RAY_DASHBOARD_EVENT_HEAD_TPE_MAX_WORKERS,
            thread_name_prefix="event_head_executor",
        )

    async def limit_handler_(self):
        # Response returned when the max number of in-progress requests is
        # reached (see RateLimitedModule).
        return dashboard_optional_utils.rest_response(
            success=False,
            error_message=(
                "Max number of in-progress requests="
                f"{self.max_num_call_} reached. "
                "To set a higher limit, set environment variable: "
                f"export {RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME}='xxx'. "
                f"Max allowed = {RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED}"
            ),
            result=None,
        )

    def _update_events(self, event_list):
        """Merge parsed events into the per-job cache, evicting oldest entries."""
        # {job_id: {event_id: event}}
        all_job_events = defaultdict(JobEvents)
        for event in event_list:
            event_id = event["event_id"]
            custom_fields = event.get("custom_fields")
            # NOTE(review): `system_event` is never set to True, so the check
            # below always passes — looks like a placeholder for future
            # system-event filtering.
            system_event = False
            if custom_fields:
                # Events without a job_id custom field fall into "global".
                job_id = custom_fields.get("job_id", "global") or "global"
            else:
                job_id = "global"
            if system_event is False:
                all_job_events[job_id][event_id] = event

        for job_id, new_job_events in all_job_events.items():
            job_events = self.events[job_id]
            job_events.update(new_job_events)

            # Limit the # of events cached if it exceeds the threshold.
            # The 1.1 factor adds hysteresis so eviction isn't retriggered on
            # every insert once a job is at capacity.
            if len(job_events) > MAX_EVENTS_TO_CACHE * 1.1:
                while len(job_events) > MAX_EVENTS_TO_CACHE:
                    # Evict oldest first — relies on JobEvents preserving
                    # insertion order (popitem(last=False), OrderedDict-style).
                    job_events.popitem(last=False)

    async def ReportEvents(self, request, context):
        """gRPC handler: ingest a batch of serialized events into the cache."""
        received_events = []
        if request.event_strings:
            received_events.extend(parse_event_strings(request.event_strings))
        logger.debug("Received %d events", len(received_events))
        self._update_events(received_events)
        self.total_report_events_count += 1
        self.total_events_received += len(received_events)
        return event_pb2.ReportEventsReply(send_success=True)

    async def _periodic_state_print(self):
        """Return ingestion stats, or None when nothing has been received yet."""
        if self.total_events_received <= 0 or self.total_report_events_count <= 0:
            return

        elapsed = time.monotonic() - self.module_started
        return {
            "total_events_received": self.total_events_received,
            # NOTE(review): inconsistent capitalization ("Total_...") — kept
            # as-is since consumers may depend on the exact key.
            "Total_requests_received": self.total_report_events_count,
            "total_uptime": elapsed,
        }

    @routes.get("/events")
    @dashboard_optional_utils.aiohttp_cache
    async def get_event(self, req) -> aiohttp.web.Response:
        """Return cached events; all jobs, or one job when ?job_id= is given."""
        job_id = req.query.get("job_id")
        if job_id is None:
            all_events = {
                job_id: list(job_events.values())
                for job_id, job_events in self.events.items()
            }
            return dashboard_optional_utils.rest_response(
                success=True, message="All events fetched.", events=all_events
            )

        # defaultdict: an unknown job_id yields an empty event mapping.
        job_events = self.events[job_id]
        return dashboard_optional_utils.rest_response(
            success=True,
            message="Job events fetched.",
            job_id=job_id,
            events=list(job_events.values()),
        )

    @routes.get("/api/v0/cluster_events")
    @dashboard_utils.RateLimitedModule.enforce_max_concurrent_calls
    async def list_cluster_events(
        self, req: aiohttp.web.Request
    ) -> aiohttp.web.Response:
        """State API endpoint listing cluster events (filter/limit via query)."""
        record_extra_usage_tag(TagKey.CORE_STATE_API_LIST_CLUSTER_EVENTS, "1")

        async def list_api_fn(option: ListApiOptions):
            return await _list_cluster_events_impl(
                all_events=self.events, executor=self._executor, option=option
            )

        return await handle_list_api(list_api_fn, req)

    async def run(self, server):
        """Register the gRPC servicer and start watching the event directory."""
        event_pb2_grpc.add_ReportEventServiceServicer_to_server(self, server)
        # Feed events parsed from files under the event dir into the cache.
        self._monitor = monitor_events(
            self._event_dir,
            lambda data: self._update_events(parse_event_strings(data)),
            self._executor,
        )

    @staticmethod
    def is_minimal_module():
        # This module requires the non-minimal dashboard dependencies.
        return False
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__init__.py ADDED
File without changes
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (202 Bytes). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/grafana_dashboard_factory.cpython-311.pyc ADDED
Binary file (10.6 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/metrics_head.cpython-311.pyc ADDED
Binary file (24.4 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/templates.cpython-311.pyc ADDED
Binary file (1.59 kB). View file
 
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/common.py ADDED
@@ -0,0 +1,70 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from dataclasses import dataclass
2
+ from typing import List, Optional
3
+
4
+
5
@dataclass
class GridPos:
    """Position and size of a panel on the Grafana dashboard grid."""

    # Column (x) / row (y) offset of the panel's top-left corner, in grid units.
    x: int
    y: int
    # Panel width / height, in grid units.
    w: int
    h: int
11
+
12
+
13
@dataclass
class Target:
    """Defines a Grafana target (time-series query) within a panel.

    A panel will have one or more targets. By default, all targets are rendered as
    stacked area charts, with the exception of legend="MAX", which is rendered as
    a blue dotted line. Any legend="FINISHED|FAILED|DEAD|REMOVED" series will also be
    rendered hidden by default.

    Attributes:
        expr: The prometheus query to evaluate.
        legend: The legend string to format for each time-series.
    """

    # PromQL expression; typically contains a `{global_filters}` placeholder
    # that is substituted when the dashboard JSON is generated.
    expr: str
    legend: str
29
+
30
+
31
@dataclass
class Panel:
    """Defines a Grafana panel (graph) for the Ray dashboard page.

    A panel contains one or more targets (time-series queries).

    Attributes:
        title: Short name of the graph. Note: please keep this in sync with the title
            definitions in Metrics.tsx.
        description: Long form description of the graph.
        id: Integer id used to reference the graph from Metrics.tsx.
        unit: The unit to display on the y-axis of the graph.
        targets: List of query targets.
        fill: Whether or not the graph will be filled by a color.
        stack: Whether or not the lines in the graph will be stacked.
        linewidth: Width of the plotted lines.
        grid_pos: Optional explicit position/size of the panel on the grid.
    """

    title: str
    description: str
    id: int
    unit: str
    targets: List[Target]
    fill: int = 10
    stack: bool = True
    linewidth: int = 1
    grid_pos: Optional[GridPos] = None
57
+
58
+
59
@dataclass
class DashboardConfig:
    """Top-level description of one generated Grafana dashboard."""

    # This dashboard name is an internal key used to determine which env vars
    # to check for customization
    name: str
    # The uid of the dashboard json if not overridden by a user
    default_uid: str
    # All panels shown on this dashboard.
    panels: List[Panel]
    # The global filters applied to all graphs in this dashboard. Users can
    # add additional global_filters on top of this.
    standard_global_filters: List[str]
    # File name of the base dashboard JSON template this dashboard builds on.
    base_json_file_name: str
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/data_dashboard_panels.py ADDED
@@ -0,0 +1,551 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ruff: noqa: E501
2
+
3
+ from ray.dashboard.modules.metrics.dashboards.common import (
4
+ DashboardConfig,
5
+ Panel,
6
+ Target,
7
+ )
8
+
9
+ # When adding a new panels for an OpRuntimeMetric, follow this format:
10
+ # Panel(
11
+ # title=title,
12
+ # description=metric.metadata.get("description"),
13
+ # id=panel_id,
14
+ # unit=unit,
15
+ # targets=[
16
+ # Target(
17
+ # expr=f"sum(ray_data_{metric.name}"
18
+ # + "{{{global_filters}}}) by (dataset, operator)",
19
+ # legend=legend,
20
+ # )
21
+ # ],
22
+ # fill=fill,
23
+ # stack=stack,
24
+ # )
25
+
26
+
27
# Panels for the Ray Data Grafana dashboard. Panel `id`s are referenced from
# the dashboard frontend (Metrics.tsx) and must be unique (checked below).
DATA_GRAFANA_PANELS = [
    # Ray Data Metrics (Overview)
    Panel(
        id=1,
        title="Bytes Spilled",
        description="Amount spilled by dataset operators. DataContext.enable_get_object_locations_for_metrics must be set to True to report this metric",
        unit="bytes",
        targets=[
            Target(
                expr="sum(ray_data_spilled_bytes{{{global_filters}}}) by (dataset, operator)",
                legend="Bytes Spilled: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=2,
        title="Bytes Allocated",
        description="Amount allocated by dataset operators.",
        unit="bytes",
        targets=[
            Target(
                expr="sum(ray_data_allocated_bytes{{{global_filters}}}) by (dataset, operator)",
                legend="Bytes Allocated: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=3,
        title="Bytes Freed",
        description="Amount freed by dataset operators.",
        unit="bytes",
        targets=[
            Target(
                expr="sum(ray_data_freed_bytes{{{global_filters}}}) by (dataset, operator)",
                legend="Bytes Freed: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=4,
        title="Object Store Memory",
        description="Amount of memory store used by dataset operators.",
        unit="bytes",
        targets=[
            Target(
                expr="sum(ray_data_current_bytes{{{global_filters}}}) by (dataset, operator)",
                legend="Current Usage: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=5,
        title="CPUs (logical slots)",
        description="Logical CPUs allocated to dataset operators.",
        unit="cores",
        targets=[
            Target(
                expr="sum(ray_data_cpu_usage_cores{{{global_filters}}}) by (dataset, operator)",
                legend="CPU Usage: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=6,
        title="GPUs (logical slots)",
        description="Logical GPUs allocated to dataset operators.",
        unit="cores",
        targets=[
            Target(
                expr="sum(ray_data_gpu_usage_cores{{{global_filters}}}) by (dataset, operator)",
                legend="GPU Usage: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=7,
        title="Bytes Output / Second",
        description="Bytes output per second by dataset operators.",
        unit="Bps",
        targets=[
            Target(
                expr="sum(rate(ray_data_output_bytes{{{global_filters}}}[1m])) by (dataset, operator)",
                legend="Bytes Output / Second: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=11,
        title="Rows Output / Second",
        description="Total rows output per second by dataset operators.",
        unit="rows/sec",
        targets=[
            Target(
                expr="sum(rate(ray_data_output_rows{{{global_filters}}}[1m])) by (dataset, operator)",
                legend="Rows Output / Second: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    # Ray Data Metrics (Inputs)
    Panel(
        id=17,
        title="Input Blocks Received by Operator / Second",
        description="Number of input blocks received by operator per second.",
        unit="blocks/sec",
        targets=[
            Target(
                expr="sum(rate(ray_data_num_inputs_received{{{global_filters}}}[1m])) by (dataset, operator)",
                legend="Blocks Received / Second: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=18,
        title="Input Bytes Received by Operator / Second",
        description="Byte size of input blocks received by operator per second.",
        unit="Bps",
        targets=[
            Target(
                expr="sum(rate(ray_data_bytes_inputs_received{{{global_filters}}}[1m])) by (dataset, operator)",
                legend="Bytes Received / Second: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=19,
        title="Input Blocks Processed by Tasks / Second",
        description=(
            "Number of input blocks that operator's tasks have finished processing per second."
        ),
        unit="blocks/sec",
        targets=[
            Target(
                expr="sum(rate(ray_data_num_task_inputs_processed{{{global_filters}}}[1m])) by (dataset, operator)",
                legend="Blocks Processed / Second: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=20,
        title="Input Bytes Processed by Tasks / Second",
        description=(
            "Byte size of input blocks that operator's tasks have finished processing per second."
        ),
        unit="Bps",
        targets=[
            Target(
                expr="sum(rate(ray_data_bytes_task_inputs_processed{{{global_filters}}}[1m])) by (dataset, operator)",
                legend="Bytes Processed / Second: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=21,
        title="Input Bytes Submitted to Tasks / Second",
        description="Byte size of input blocks passed to submitted tasks per second.",
        unit="Bps",
        targets=[
            Target(
                expr="sum(rate(ray_data_bytes_inputs_of_submitted_tasks{{{global_filters}}}[1m])) by (dataset, operator)",
                legend="Bytes Submitted / Second: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=22,
        title="Blocks Generated by Tasks / Second",
        description="Number of output blocks generated by tasks per second.",
        unit="blocks/sec",
        targets=[
            Target(
                expr="sum(rate(ray_data_num_task_outputs_generated{{{global_filters}}}[1m])) by (dataset, operator)",
                legend="Blocks Generated / Second: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=23,
        title="Bytes Generated by Tasks / Second",
        description="Byte size of output blocks generated by tasks per second.",
        unit="Bps",
        targets=[
            Target(
                expr="sum(rate(ray_data_bytes_task_outputs_generated{{{global_filters}}}[1m])) by (dataset, operator)",
                legend="Bytes Generated / Second: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=24,
        title="Rows Generated by Tasks / Second",
        description="Number of rows in generated output blocks from finished tasks per second.",
        unit="rows/sec",
        targets=[
            Target(
                expr="sum(rate(ray_data_rows_task_outputs_generated{{{global_filters}}}[1m])) by (dataset, operator)",
                legend="Rows Generated / Second: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=25,
        title="Output Blocks Taken by Downstream Operators / Second",
        description="Number of output blocks taken by downstream operators per second.",
        unit="blocks/sec",
        targets=[
            Target(
                expr="sum(rate(ray_data_num_outputs_taken{{{global_filters}}}[1m])) by (dataset, operator)",
                legend="Blocks Taken / Second: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=26,
        title="Output Bytes Taken by Downstream Operators / Second",
        description=(
            "Byte size of output blocks taken by downstream operators per second."
        ),
        unit="Bps",
        targets=[
            Target(
                expr="sum(rate(ray_data_bytes_outputs_taken{{{global_filters}}}[1m])) by (dataset, operator)",
                legend="Bytes Taken / Second: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    # Ray Data Metrics (Tasks)
    Panel(
        id=29,
        title="Submitted Tasks",
        description="Number of submitted tasks.",
        unit="tasks",
        targets=[
            Target(
                expr="sum(ray_data_num_tasks_submitted{{{global_filters}}}) by (dataset, operator)",
                legend="Submitted Tasks: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=30,
        title="Running Tasks",
        description="Number of running tasks.",
        unit="tasks",
        targets=[
            Target(
                expr="sum(ray_data_num_tasks_running{{{global_filters}}}) by (dataset, operator)",
                legend="Running Tasks: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=31,
        title="Tasks with output blocks",
        description="Number of tasks that already have output.",
        unit="tasks",
        targets=[
            Target(
                expr="sum(ray_data_num_tasks_have_outputs{{{global_filters}}}) by (dataset, operator)",
                legend="Tasks with output blocks: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=32,
        title="Finished Tasks",
        description="Number of finished tasks.",
        unit="tasks",
        targets=[
            Target(
                expr="sum(ray_data_num_tasks_finished{{{global_filters}}}) by (dataset, operator)",
                legend="Finished Tasks: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=33,
        title="Failed Tasks",
        description="Number of failed tasks.",
        unit="tasks",
        targets=[
            Target(
                expr="sum(ray_data_num_tasks_failed{{{global_filters}}}) by (dataset, operator)",
                legend="Failed Tasks: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=8,
        title="Block Generation Time",
        description="Time spent generating blocks in tasks.",
        unit="seconds",
        targets=[
            Target(
                expr="sum(ray_data_block_generation_time{{{global_filters}}}) by (dataset, operator)",
                legend="Block Generation Time: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=37,
        title="Task Submission Backpressure Time",
        description="Time spent in task submission backpressure.",
        unit="seconds",
        targets=[
            Target(
                expr="sum(ray_data_task_submission_backpressure_time{{{global_filters}}}) by (dataset, operator)",
                legend="Backpressure Time: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=True,
    ),
    # Ray Data Metrics (Object Store Memory)
    Panel(
        id=13,
        title="Operator Internal Inqueue Size (Blocks)",
        description="Number of blocks in operator's internal input queue",
        unit="blocks",
        targets=[
            Target(
                expr="sum(ray_data_obj_store_mem_internal_inqueue_blocks{{{global_filters}}}) by (dataset, operator)",
                legend="Number of Blocks: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=14,
        title="Operator Internal Inqueue Size (Bytes)",
        description="Byte size of input blocks in the operator's internal input queue.",
        unit="bytes",
        targets=[
            Target(
                expr="sum(ray_data_obj_store_mem_internal_inqueue{{{global_filters}}}) by (dataset, operator)",
                legend="Bytes Size: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=True,
    ),
    Panel(
        id=15,
        title="Operator Internal Outqueue Size (Blocks)",
        description="Number of blocks in operator's internal output queue",
        unit="blocks",
        targets=[
            Target(
                expr="sum(ray_data_obj_store_mem_internal_outqueue_blocks{{{global_filters}}}) by (dataset, operator)",
                legend="Number of Blocks: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=16,
        title="Operator Internal Outqueue Size (Bytes)",
        description=(
            "Byte size of output blocks in the operator's internal output queue."
        ),
        unit="bytes",
        targets=[
            Target(
                expr="sum(ray_data_obj_store_mem_internal_outqueue{{{global_filters}}}) by (dataset, operator)",
                legend="Bytes Size: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=True,
    ),
    Panel(
        id=34,
        title="Size of Blocks used in Pending Tasks (Bytes)",
        description="Byte size of input blocks used by pending tasks.",
        unit="bytes",
        targets=[
            Target(
                expr="sum(ray_data_obj_store_mem_pending_task_inputs{{{global_filters}}}) by (dataset, operator)",
                legend="Bytes Size: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=True,
    ),
    Panel(
        id=35,
        title="Freed Memory in Object Store (Bytes)",
        description="Byte size of freed memory in object store.",
        unit="bytes",
        targets=[
            Target(
                expr="sum(ray_data_obj_store_mem_freed{{{global_filters}}}) by (dataset, operator)",
                legend="Bytes Size: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=True,
    ),
    Panel(
        id=36,
        title="Spilled Memory in Object Store (Bytes)",
        description="Byte size of spilled memory in object store.",
        unit="bytes",
        targets=[
            Target(
                expr="sum(ray_data_obj_store_mem_spilled{{{global_filters}}}) by (dataset, operator)",
                legend="Bytes Size: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=True,
    ),
    # Ray Data Metrics (Iteration)
    Panel(
        id=12,
        title="Iteration Initialization Time",
        description="Seconds spent in iterator initialization code",
        unit="seconds",
        targets=[
            Target(
                expr="sum(ray_data_iter_initialize_seconds{{{global_filters}}}) by (dataset)",
                # NOTE(review): the query groups only by (dataset), so the
                # {{operator}} placeholder in this legend renders empty —
                # confirm whether it should read "Seconds: {{dataset}}".
                legend="Seconds: {{dataset}}, {{operator}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=9,
        title="Iteration Blocked Time",
        description="Seconds user thread is blocked by iter_batches()",
        unit="seconds",
        targets=[
            Target(
                expr="sum(ray_data_iter_total_blocked_seconds{{{global_filters}}}) by (dataset)",
                legend="Seconds: {{dataset}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    Panel(
        id=10,
        title="Iteration User Time",
        description="Seconds spent in user code",
        unit="seconds",
        targets=[
            Target(
                expr="sum(ray_data_iter_user_seconds{{{global_filters}}}) by (dataset)",
                legend="Seconds: {{dataset}}",
            )
        ],
        fill=0,
        stack=False,
    ),
    # Ray Data Metrics (Miscellaneous)
]
533
+
534
# Panel ids are referenced by the dashboard frontend (Metrics.tsx), so each
# one must be unique within this dashboard; fail fast at import time.
ids = [panel.id for panel in DATA_GRAFANA_PANELS]
assert len(ids) == len(
    set(ids)
), f"Duplicated id found. Use unique id for each panel. {ids}"
540
+
541
# Dashboard-level configuration for the Ray Data Grafana dashboard.
data_dashboard_config = DashboardConfig(
    name="DATA",
    default_uid="rayDataDashboard",
    panels=DATA_GRAFANA_PANELS,
    # Filters applied to every panel query; users can add more on top.
    standard_global_filters=[
        'dataset=~"$DatasetID"',
        'SessionName=~"$SessionName"',
        'ray_io_cluster=~"$Cluster"',
    ],
    base_json_file_name="data_grafana_dashboard_base.json",
)
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py ADDED
@@ -0,0 +1,478 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ruff: noqa: E501
2
+
3
+ from ray.dashboard.modules.metrics.dashboards.common import (
4
+ DashboardConfig,
5
+ Panel,
6
+ Target,
7
+ )
8
+
9
+ """
10
+ Queries for autoscaler resources.
11
+ """
12
+ # Note: MAX & USED resources are reported from raylet to provide the most up to date information.
13
+ # But MAX + PENDING data is coming from the autoscaler. That said, MAX + PENDING can be
14
+ # more outdated. it is harmless because the actual MAX will catch up with MAX + PENDING
15
+ # eventually.
16
# PromQL snippets for autoscaler-reported CPU/GPU capacity and pending
# resources. The doubled braces survive a later `.format()`-style substitution
# of `{global_filters}` when the dashboard JSON is rendered.
MAX_CPUS = 'sum(autoscaler_cluster_resources{{resource="CPU",{global_filters}}})'
PENDING_CPUS = 'sum(autoscaler_pending_resources{{resource="CPU",{global_filters}}})'
MAX_GPUS = 'sum(autoscaler_cluster_resources{{resource="GPU",{global_filters}}})'
PENDING_GPUS = 'sum(autoscaler_pending_resources{{resource="GPU",{global_filters}}})'
20
+
21
+
22
def max_plus_pending(max_resource, pending_resource):
    """Build a PromQL expression adding *max_resource* and *pending_resource*.

    Each operand is wrapped in ``(<expr> or vector(0))`` so that an absent
    series (e.g. no pending resources reported) counts as zero instead of
    making the whole sum empty.
    """
    wrapped = [f"({expr} or vector(0))" for expr in (max_resource, pending_resource)]
    return " + ".join(wrapped)
24
+
25
+
26
# Precomputed "capacity + pending" expressions used by the CPU/GPU panels below.
MAX_PLUS_PENDING_CPUS = max_plus_pending(MAX_CPUS, PENDING_CPUS)
MAX_PLUS_PENDING_GPUS = max_plus_pending(MAX_GPUS, PENDING_GPUS)
28
+
29
+
30
+ # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
31
+ # IMPORTANT: Please keep this in sync with Metrics.tsx and ray-metrics.rst
32
+ # !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
33
+ DEFAULT_GRAFANA_PANELS = [
34
+ Panel(
35
+ id=26,
36
+ title="Scheduler Task State",
37
+ description="Current number of tasks in a particular state.\n\nState: the task state, as described by rpc::TaskState proto in common.proto. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.",
38
+ unit="tasks",
39
+ targets=[
40
+ Target(
41
+ expr='sum(max_over_time(ray_tasks{{IsRetry="0",State=~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (State), 0)',
42
+ legend="{{State}}",
43
+ ),
44
+ Target(
45
+ expr='sum(max_over_time(ray_tasks{{IsRetry!="0",State=~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry!="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (State), 0)',
46
+ legend="{{State}} (retry)",
47
+ ),
48
+ ],
49
+ fill=0,
50
+ stack=False,
51
+ ),
52
+ Panel(
53
+ id=35,
54
+ title="Requested Live Tasks by Name",
55
+ description="Current number of (live) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.",
56
+ unit="tasks",
57
+ targets=[
58
+ Target(
59
+ expr='clamp_min(sum(ray_tasks{{IsRetry="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (Name), 0)',
60
+ legend="{{Name}}",
61
+ ),
62
+ Target(
63
+ expr='clamp_min(sum(ray_tasks{{IsRetry!="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (Name), 0)',
64
+ legend="{{Name}} (retry)",
65
+ ),
66
+ ],
67
+ fill=0,
68
+ stack=False,
69
+ ),
70
+ Panel(
71
+ id=38,
72
+ title="Running Tasks by Name",
73
+ description="Current number of (running) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.",
74
+ unit="tasks",
75
+ targets=[
76
+ Target(
77
+ expr='clamp_min(sum(ray_tasks{{IsRetry="0",State=~"RUNNING*",instance=~"$Instance",{global_filters}}}) by (Name), 0)',
78
+ legend="{{Name}}",
79
+ ),
80
+ Target(
81
+ expr='clamp_min(sum(ray_tasks{{IsRetry!="0",State=~"RUNNING*",instance=~"$Instance",{global_filters}}}) by (Name), 0)',
82
+ legend="{{Name}} (retry)",
83
+ ),
84
+ ],
85
+ fill=0,
86
+ stack=False,
87
+ ),
88
+ Panel(
89
+ id=33,
90
+ title="Scheduler Actor State",
91
+ description='Note: not impacted by "Instance" variable.\n\nCurrent number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.',
92
+ unit="actors",
93
+ targets=[
94
+ Target(
95
+ expr='sum(ray_actors{{Source="gcs",{global_filters}}}) by (State)',
96
+ legend="{{State}}",
97
+ )
98
+ ],
99
+ ),
100
+ Panel(
101
+ id=42,
102
+ title="Live Actor State",
103
+ description="Current number of alive actors in a particular state.\n\nState: IDLE, RUNNING_TASK, RUNNING_IN_RAY_GET, RUNNING_IN_RAY_WAIT",
104
+ unit="actors",
105
+ targets=[
106
+ Target(
107
+ expr='sum(ray_actors{{Source="executor",NodeAddress=~"$Instance",{global_filters}}}) by (State)',
108
+ legend="{{State}}",
109
+ )
110
+ ],
111
+ ),
112
+ Panel(
113
+ id=36,
114
+ title="Live Actors by Name",
115
+ description="Current number of alive actors with a particular name.",
116
+ unit="actors",
117
+ targets=[
118
+ Target(
119
+ expr='sum(ray_actors{{State!="DEAD",Source="executor",NodeAddress=~"$Instance",{global_filters}}}) by (Name)',
120
+ legend="{{Name}}",
121
+ )
122
+ ],
123
+ ),
124
+ Panel(
125
+ id=27,
126
+ title="Scheduler CPUs (logical slots)",
127
+ description="Logical CPU usage of Ray. The dotted line indicates the total number of CPUs. The logical CPU is allocated by `num_cpus` arguments from tasks and actors. PENDING means the number of CPUs that will be available when new nodes are up after the autoscaler scales up.\n\nNOTE: Ray's logical CPU is different from physical CPU usage. Ray's logical CPU is allocated by `num_cpus` arguments.",
128
+ unit="cores",
129
+ targets=[
130
+ Target(
131
+ expr='sum(ray_resources{{Name="CPU",State="USED",instance=~"$Instance",{global_filters}}}) by (instance)',
132
+ legend="CPU Usage: {{instance}}",
133
+ ),
134
+ Target(
135
+ expr='sum(ray_resources{{Name="CPU",instance=~"$Instance",{global_filters}}})',
136
+ legend="MAX",
137
+ ),
138
+ # If max + pending > max, we display this value.
139
+ # (A and predicate) means to return A when the predicate satisfies in PromSql.
140
+ Target(
141
+ expr=f"({MAX_PLUS_PENDING_CPUS} and {MAX_PLUS_PENDING_CPUS} > ({MAX_CPUS} or vector(0)))",
142
+ legend="MAX + PENDING",
143
+ ),
144
+ ],
145
+ ),
146
+ Panel(
147
+ id=29,
148
+ title="Object Store Memory",
149
+ description="Object store memory usage by location. The dotted line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate memory-mapped page, SPILLED to indicate spillage to disk, and WORKER_HEAP for objects small enough to be inlined in worker memory. Refer to metric_defs.cc for more information.",
150
+ unit="bytes",
151
+ targets=[
152
+ Target(
153
+ expr='sum(ray_object_store_memory{{instance=~"$Instance",{global_filters}}}) by (Location)',
154
+ legend="{{Location}}",
155
+ ),
156
+ Target(
157
+ expr='sum(ray_resources{{Name="object_store_memory",instance=~"$Instance",{global_filters}}})',
158
+ legend="MAX",
159
+ ),
160
+ ],
161
+ ),
162
+ Panel(
163
+ id=28,
164
+ title="Scheduler GPUs (logical slots)",
165
+ description="Logical GPU usage of Ray. The dotted line indicates the total number of GPUs. The logical GPU is allocated by `num_gpus` arguments from tasks and actors. PENDING means the number of GPUs that will be available when new nodes are up after the autoscaler scales up.",
166
+ unit="GPUs",
167
+ targets=[
168
+ Target(
169
+ expr='ray_resources{{Name="GPU",State="USED",instance=~"$Instance",{global_filters}}}',
170
+ legend="GPU Usage: {{instance}}",
171
+ ),
172
+ Target(
173
+ expr='sum(ray_resources{{Name="GPU",instance=~"$Instance",{global_filters}}})',
174
+ legend="MAX",
175
+ ),
176
+ # If max + pending > max, we display this value.
177
+ # (A and predicate) means to return A when the predicate satisfies in PromSql.
178
+ Target(
179
+ expr=f"({MAX_PLUS_PENDING_GPUS} and {MAX_PLUS_PENDING_GPUS} > ({MAX_GPUS} or vector(0)))",
180
+ legend="MAX + PENDING",
181
+ ),
182
+ ],
183
+ ),
184
+ Panel(
185
+ id=40,
186
+ title="Scheduler Placement Groups",
187
+ description='Note: not impacted by "Instance" variable.\n\nCurrent number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.',
188
+ unit="placement groups",
189
+ targets=[
190
+ Target(
191
+ expr="sum(ray_placement_groups{{{global_filters}}}) by (State)",
192
+ legend="{{State}}",
193
+ )
194
+ ],
195
+ ),
196
+ Panel(
197
+ id=2,
198
+ title="Node CPU (hardware utilization)",
199
+ description="",
200
+ unit="cores",
201
+ targets=[
202
+ Target(
203
+ expr='ray_node_cpu_utilization{{instance=~"$Instance", IsHeadNode="false", {global_filters}}} * ray_node_cpu_count{{instance=~"$Instance",{global_filters}}} / 100',
204
+ legend="CPU Usage: {{instance}}",
205
+ ),
206
+ Target(
207
+ expr='ray_node_cpu_utilization{{instance=~"$Instance", IsHeadNode="true", {global_filters}}} * ray_node_cpu_count{{instance=~"$Instance",{global_filters}}} / 100',
208
+ legend="CPU Usage: {{instance}} (head)",
209
+ ),
210
+ Target(
211
+ expr='sum(ray_node_cpu_count{{instance=~"$Instance",{global_filters}}})',
212
+ legend="MAX",
213
+ ),
214
+ ],
215
+ ),
216
+ Panel(
217
+ id=8,
218
+ title="Node GPU (hardware utilization)",
219
+ description="Node's physical (hardware) GPU usage. The dotted line means the total number of hardware GPUs from the cluster. ",
220
+ unit="GPUs",
221
+ targets=[
222
+ Target(
223
+ expr='ray_node_gpus_utilization{{instance=~"$Instance", IsHeadNode="false", {global_filters}}} / 100',
224
+ legend="GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}",
225
+ ),
226
+ Target(
227
+ expr='ray_node_gpus_utilization{{instance=~"$Instance", IsHeadNode="true", {global_filters}}} / 100',
228
+ legend="GPU Usage: {{instance}} (head), gpu.{{GpuIndex}}, {{GpuDeviceName}}",
229
+ ),
230
+ Target(
231
+ expr='sum(ray_node_gpus_available{{instance=~"$Instance",{global_filters}}})',
232
+ legend="MAX",
233
+ ),
234
+ ],
235
+ ),
236
+ Panel(
237
+ id=6,
238
+ title="Node Disk",
239
+ description="Node's physical (hardware) disk usage. The dotted line means the total amount of disk space from the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage from the host machine. ",
240
+ unit="bytes",
241
+ targets=[
242
+ Target(
243
+ expr='ray_node_disk_usage{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}',
244
+ legend="Disk Used: {{instance}}",
245
+ ),
246
+ Target(
247
+ expr='ray_node_disk_usage{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}',
248
+ legend="Disk Used: {{instance}} (head)",
249
+ ),
250
+ Target(
251
+ expr='sum(ray_node_disk_free{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}})',
252
+ legend="MAX",
253
+ ),
254
+ ],
255
+ ),
256
+ Panel(
257
+ id=32,
258
+ title="Node Disk IO Speed",
259
+ description="Disk IO per node.",
260
+ unit="Bps",
261
+ targets=[
262
+ Target(
263
+ expr='ray_node_disk_io_write_speed{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}',
264
+ legend="Write: {{instance}}",
265
+ ),
266
+ Target(
267
+ expr='ray_node_disk_io_write_speed{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}',
268
+ legend="Write: {{instance}} (head)",
269
+ ),
270
+ Target(
271
+ expr='ray_node_disk_io_read_speed{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}',
272
+ legend="Read: {{instance}}",
273
+ ),
274
+ Target(
275
+ expr='ray_node_disk_io_read_speed{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}',
276
+ legend="Read: {{instance}} (head)",
277
+ ),
278
+ ],
279
+ ),
280
+ Panel(
281
+ id=4,
282
+ title="Node Memory (heap + object store)",
283
+ description="The physical (hardware) memory usage for each node. The dotted line means the total amount of memory from the cluster. Node memory is a sum of object store memory (shared memory) and heap memory.\n\nNote: If Ray is deployed within a container, the total memory could be lower than the host machine because Ray may reserve some additional memory space outside the container.",
284
+ unit="bytes",
285
+ targets=[
286
+ Target(
287
+ expr='ray_node_mem_used{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}',
288
+ legend="Memory Used: {{instance}}",
289
+ ),
290
+ Target(
291
+ expr='ray_node_mem_used{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}',
292
+ legend="Memory Used: {{instance}} (head)",
293
+ ),
294
+ Target(
295
+ expr='sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})',
296
+ legend="MAX",
297
+ ),
298
+ ],
299
+ ),
300
+ Panel(
301
+ id=48,
302
+ title="Node Memory Percentage (heap + object store)",
303
+ description="The percentage of physical (hardware) memory usage for each node.",
304
+ unit="%",
305
+ targets=[
306
+ Target(
307
+ expr='ray_node_mem_used{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}/ray_node_mem_total{{instance=~"$Instance", IsHeadNode="false", {global_filters}}} * 100',
308
+ legend="Memory Used: {{instance}}",
309
+ ),
310
+ Target(
311
+ expr='ray_node_mem_used{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}/ray_node_mem_total{{instance=~"$Instance", IsHeadNode="true", {global_filters}}} * 100',
312
+ legend="Memory Used: {{instance}} (head)",
313
+ ),
314
+ ],
315
+ fill=0,
316
+ stack=False,
317
+ ),
318
+ Panel(
319
+ id=44,
320
+ title="Node Out of Memory Failures by Name",
321
+ description="The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and the name. https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html.",
322
+ unit="failures",
323
+ targets=[
324
+ Target(
325
+ expr='ray_memory_manager_worker_eviction_total{{instance=~"$Instance",{global_filters}}}',
326
+ legend="OOM Killed: {{Name}}, {{instance}}",
327
+ ),
328
+ ],
329
+ ),
330
+ Panel(
331
+ id=34,
332
+ title="Node Memory by Component",
333
+ description="The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS-SHM per Ray component, which corresponds to an approximate memory usage per proc. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.",
334
+ unit="bytes",
335
+ targets=[
336
+ Target(
337
+ expr='(sum(ray_component_rss_mb{{instance=~"$Instance",{global_filters}}} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{{instance=~"$Instance",{global_filters}}}) by (Component))',
338
+ legend="{{Component}}",
339
+ ),
340
+ Target(
341
+ expr='sum(ray_node_mem_shared_bytes{{instance=~"$Instance",{global_filters}}})',
342
+ legend="shared_memory",
343
+ ),
344
+ Target(
345
+ expr='sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})',
346
+ legend="MAX",
347
+ ),
348
+ ],
349
+ ),
350
+ Panel(
351
+ id=37,
352
+ title="Node CPU by Component",
353
+ description="The physical (hardware) CPU usage across the cluster, broken down by component. This reports the summed CPU usage per Ray component. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.",
354
+ unit="cores",
355
+ targets=[
356
+ Target(
357
+ # ray_component_cpu_percentage returns a percentage that can be > 100. It means that it uses more than 1 CPU.
358
+ expr='sum(ray_component_cpu_percentage{{instance=~"$Instance",{global_filters}}}) by (Component) / 100',
359
+ legend="{{Component}}",
360
+ ),
361
+ Target(
362
+ expr='sum(ray_node_cpu_count{{instance=~"$Instance",{global_filters}}})',
363
+ legend="MAX",
364
+ ),
365
+ ],
366
+ ),
367
+ Panel(
368
+ id=18,
369
+ title="Node GPU Memory (GRAM)",
370
+ description="The physical (hardware) GPU memory usage for each node. The dotted line means the total amount of GPU memory from the cluster.",
371
+ unit="bytes",
372
+ targets=[
373
+ Target(
374
+ expr='ray_node_gram_used{{instance=~"$Instance",{global_filters}}} * 1024 * 1024',
375
+ legend="Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}",
376
+ ),
377
+ Target(
378
+ expr='(sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}})) * 1024 * 1024',
379
+ legend="MAX",
380
+ ),
381
+ ],
382
+ ),
383
+ Panel(
384
+ id=20,
385
+ title="Node Network",
386
+ description="Network speed per node",
387
+ unit="Bps",
388
+ targets=[
389
+ Target(
390
+ expr='ray_node_network_receive_speed{{instance=~"$Instance",{global_filters}}}',
391
+ legend="Recv: {{instance}}",
392
+ ),
393
+ Target(
394
+ expr='ray_node_network_send_speed{{instance=~"$Instance",{global_filters}}}',
395
+ legend="Send: {{instance}}",
396
+ ),
397
+ ],
398
+ ),
399
+ Panel(
400
+ id=24,
401
+ title="Node Count",
402
+ description='Note: not impacted by "Instance" variable.\n\nA total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there\'s no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. The node is unavailable now because it is being provisioned and initialized.',
403
+ unit="nodes",
404
+ targets=[
405
+ Target(
406
+ expr="sum(autoscaler_active_nodes{{{global_filters}}}) by (NodeType)",
407
+ legend="Active Nodes: {{NodeType}}",
408
+ ),
409
+ Target(
410
+ expr="sum(autoscaler_recently_failed_nodes{{{global_filters}}}) by (NodeType)",
411
+ legend="Failed Nodes: {{NodeType}}",
412
+ ),
413
+ Target(
414
+ expr="sum(autoscaler_pending_nodes{{{global_filters}}}) by (NodeType)",
415
+ legend="Pending Nodes: {{NodeType}}",
416
+ ),
417
+ ],
418
+ ),
419
+ Panel(
420
+ id=41,
421
+ title="Cluster Utilization",
422
+ description="Aggregated utilization of all physical resources (CPU, GPU, memory, disk, or etc.) across the cluster.",
423
+ unit="%",
424
+ targets=[
425
+ # CPU
426
+ Target(
427
+ expr='avg(ray_node_cpu_utilization{{instance=~"$Instance",{global_filters}}})',
428
+ legend="CPU (physical)",
429
+ ),
430
+ # GPU
431
+ Target(
432
+ expr='sum(ray_node_gpus_utilization{{instance=~"$Instance",{global_filters}}}) / on() (sum(autoscaler_cluster_resources{{resource="GPU",instance=~"$Instance",{global_filters}}}) or vector(0))',
433
+ legend="GPU (physical)",
434
+ ),
435
+ # Memory
436
+ Target(
437
+ expr='sum(ray_node_mem_used{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})) * 100',
438
+ legend="Memory (RAM)",
439
+ ),
440
+ # GRAM
441
+ Target(
442
+ expr='sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}})) * 100',
443
+ legend="GRAM",
444
+ ),
445
+ # Object Store
446
+ Target(
447
+ expr='sum(ray_object_store_memory{{instance=~"$Instance",{global_filters}}}) / on() sum(ray_resources{{Name="object_store_memory",instance=~"$Instance",{global_filters}}}) * 100',
448
+ legend="Object Store Memory",
449
+ ),
450
+ # Disk
451
+ Target(
452
+ expr='sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_disk_free{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}})) * 100',
453
+ legend="Disk",
454
+ ),
455
+ ],
456
+ fill=0,
457
+ stack=False,
458
+ ),
459
+ ]
460
+
461
+
462
+ ids = []
463
+ for panel in DEFAULT_GRAFANA_PANELS:
464
+ ids.append(panel.id)
465
+ assert len(ids) == len(
466
+ set(ids)
467
+ ), f"Duplicated id found. Use unique id for each panel. {ids}"
468
+
469
+ default_dashboard_config = DashboardConfig(
470
+ name="DEFAULT",
471
+ default_uid="rayDefaultDashboard",
472
+ panels=DEFAULT_GRAFANA_PANELS,
473
+ standard_global_filters=[
474
+ 'SessionName=~"$SessionName"',
475
+ 'ray_io_cluster=~"$Cluster"',
476
+ ],
477
+ base_json_file_name="default_grafana_dashboard_base.json",
478
+ )
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_dashboard_panels.py ADDED
@@ -0,0 +1,420 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ruff: noqa: E501
2
+
3
+ from ray.dashboard.modules.metrics.dashboards.common import (
4
+ DashboardConfig,
5
+ GridPos,
6
+ Panel,
7
+ Target,
8
+ )
9
+
10
+ SERVE_GRAFANA_PANELS = [
11
+ Panel(
12
+ id=5,
13
+ title="Cluster Utilization",
14
+ description="Aggregated utilization of all physical resources (CPU, GPU, memory, disk, or etc.) across the cluster. Ignores application variable.",
15
+ unit="%",
16
+ targets=[
17
+ # CPU
18
+ Target(
19
+ expr="avg(ray_node_cpu_utilization{{{global_filters}}})",
20
+ legend="CPU (physical)",
21
+ ),
22
+ # GPU
23
+ Target(
24
+ expr="sum(ray_node_gpus_utilization{{{global_filters}}}) / on() (sum(autoscaler_cluster_resources{{resource='GPU',{global_filters}}}) or vector(0))",
25
+ legend="GPU (physical)",
26
+ ),
27
+ # Memory
28
+ Target(
29
+ expr="sum(ray_node_mem_used{{{global_filters}}}) / on() (sum(ray_node_mem_total{{{global_filters}}})) * 100",
30
+ legend="Memory (RAM)",
31
+ ),
32
+ # GRAM
33
+ Target(
34
+ expr="sum(ray_node_gram_used{{{global_filters}}}) / on() (sum(ray_node_gram_available{{{global_filters}}}) + sum(ray_node_gram_used{{{global_filters}}})) * 100",
35
+ legend="GRAM",
36
+ ),
37
+ # Object Store
38
+ Target(
39
+ expr='sum(ray_object_store_memory{{{global_filters}}}) / on() sum(ray_resources{{Name="object_store_memory",{global_filters}}}) * 100',
40
+ legend="Object Store Memory",
41
+ ),
42
+ # Disk
43
+ Target(
44
+ expr="sum(ray_node_disk_usage{{{global_filters}}}) / on() (sum(ray_node_disk_free{{{global_filters}}}) + sum(ray_node_disk_usage{{{global_filters}}})) * 100",
45
+ legend="Disk",
46
+ ),
47
+ ],
48
+ fill=0,
49
+ stack=False,
50
+ grid_pos=GridPos(0, 0, 12, 8),
51
+ ),
52
+ Panel(
53
+ id=7,
54
+ title="QPS per application",
55
+ description="QPS for each selected application.",
56
+ unit="qps",
57
+ targets=[
58
+ Target(
59
+ expr='sum(rate(ray_serve_num_http_requests_total{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route)',
60
+ legend="{{application, route}}",
61
+ ),
62
+ Target(
63
+ expr='sum(rate(ray_serve_num_grpc_requests_total{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method)',
64
+ legend="{{application, method}}",
65
+ ),
66
+ ],
67
+ grid_pos=GridPos(12, 0, 12, 8),
68
+ ),
69
+ Panel(
70
+ id=8,
71
+ title="Error QPS per application",
72
+ description="Error QPS for each selected application.",
73
+ unit="qps",
74
+ targets=[
75
+ Target(
76
+ expr='sum(rate(ray_serve_num_http_error_requests_total{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route)',
77
+ legend="{{application, route}}",
78
+ ),
79
+ Target(
80
+ expr='sum(rate(ray_serve_num_grpc_error_requests_total{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method)',
81
+ legend="{{application, method}}",
82
+ ),
83
+ ],
84
+ grid_pos=GridPos(0, 1, 12, 8),
85
+ ),
86
+ Panel(
87
+ id=17,
88
+ title="Error QPS per application per error code",
89
+ description="Error QPS for each selected application.",
90
+ unit="qps",
91
+ targets=[
92
+ Target(
93
+ expr='sum(rate(ray_serve_num_http_error_requests_total{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route, error_code)',
94
+ legend="{{application, route, error_code}}",
95
+ ),
96
+ Target(
97
+ expr='sum(rate(ray_serve_num_grpc_error_requests_total{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method, error_code)',
98
+ legend="{{application, method, error_code}}",
99
+ ),
100
+ ],
101
+ grid_pos=GridPos(12, 1, 12, 8),
102
+ ),
103
+ Panel(
104
+ id=12,
105
+ title="P50 latency per application",
106
+ description="P50 latency for selected applications.",
107
+ unit="ms",
108
+ targets=[
109
+ Target(
110
+ expr='histogram_quantile(0.5, sum(rate(ray_serve_http_request_latency_ms_bucket{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route, le))',
111
+ legend="{{application, route}}",
112
+ ),
113
+ Target(
114
+ expr='histogram_quantile(0.5, sum(rate(ray_serve_grpc_request_latency_ms_bucket{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method, le))',
115
+ legend="{{application, method}}",
116
+ ),
117
+ Target(
118
+ expr='histogram_quantile(0.5, sum(rate({{__name__=~ "ray_serve_(http|grpc)_request_latency_ms_bucket",application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))',
119
+ legend="Total",
120
+ ),
121
+ ],
122
+ fill=0,
123
+ stack=False,
124
+ grid_pos=GridPos(0, 2, 8, 8),
125
+ ),
126
+ Panel(
127
+ id=15,
128
+ title="P90 latency per application",
129
+ description="P90 latency for selected applications.",
130
+ unit="ms",
131
+ targets=[
132
+ Target(
133
+ expr='histogram_quantile(0.9, sum(rate(ray_serve_http_request_latency_ms_bucket{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route, le))',
134
+ legend="{{application, route}}",
135
+ ),
136
+ Target(
137
+ expr='histogram_quantile(0.9, sum(rate(ray_serve_grpc_request_latency_ms_bucket{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method, le))',
138
+ legend="{{application, method}}",
139
+ ),
140
+ Target(
141
+ expr='histogram_quantile(0.9, sum(rate({{__name__=~ "ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket",application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))',
142
+ legend="Total",
143
+ ),
144
+ ],
145
+ fill=0,
146
+ stack=False,
147
+ grid_pos=GridPos(8, 2, 8, 8),
148
+ ),
149
+ Panel(
150
+ id=16,
151
+ title="P99 latency per application",
152
+ description="P99 latency for selected applications.",
153
+ unit="ms",
154
+ targets=[
155
+ Target(
156
+ expr='histogram_quantile(0.99, sum(rate(ray_serve_http_request_latency_ms_bucket{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route, le))',
157
+ legend="{{application, route}}",
158
+ ),
159
+ Target(
160
+ expr='histogram_quantile(0.99, sum(rate(ray_serve_grpc_request_latency_ms_bucket{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method, le))',
161
+ legend="{{application, method}}",
162
+ ),
163
+ Target(
164
+ expr='histogram_quantile(0.99, sum(rate({{__name__=~ "ray_serve_(http|grpc)_request_latency_ms_bucket|ray_serve_grpc_request_latency_ms_bucket",application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))',
165
+ legend="Total",
166
+ ),
167
+ ],
168
+ fill=0,
169
+ stack=False,
170
+ grid_pos=GridPos(16, 2, 8, 8),
171
+ ),
172
+ Panel(
173
+ id=2,
174
+ title="Replicas per deployment",
175
+ description='Number of replicas per deployment. Ignores "Application" variable.',
176
+ unit="replicas",
177
+ targets=[
178
+ Target(
179
+ expr="sum(ray_serve_deployment_replica_healthy{{{global_filters}}}) by (application, deployment)",
180
+ legend="{{application, deployment}}",
181
+ ),
182
+ ],
183
+ grid_pos=GridPos(0, 3, 8, 8),
184
+ ),
185
+ Panel(
186
+ id=13,
187
+ title="QPS per deployment",
188
+ description="QPS for each deployment.",
189
+ unit="qps",
190
+ targets=[
191
+ Target(
192
+ expr='sum(rate(ray_serve_deployment_request_counter_total{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (application, deployment)',
193
+ legend="{{application, deployment}}",
194
+ ),
195
+ ],
196
+ grid_pos=GridPos(8, 3, 8, 8),
197
+ ),
198
+ Panel(
199
+ id=14,
200
+ title="Error QPS per deployment",
201
+ description="Error QPS for each deplyoment.",
202
+ unit="qps",
203
+ targets=[
204
+ Target(
205
+ expr='sum(rate(ray_serve_deployment_error_counter_total{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (application, deployment)',
206
+ legend="{{application, deployment}}",
207
+ ),
208
+ ],
209
+ grid_pos=GridPos(16, 3, 8, 8),
210
+ ),
211
+ Panel(
212
+ id=9,
213
+ title="P50 latency per deployment",
214
+ description="P50 latency per deployment.",
215
+ unit="ms",
216
+ targets=[
217
+ Target(
218
+ expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (application, deployment, le))',
219
+ legend="{{application, deployment}}",
220
+ ),
221
+ Target(
222
+ expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))',
223
+ legend="Total",
224
+ ),
225
+ ],
226
+ fill=0,
227
+ stack=False,
228
+ grid_pos=GridPos(0, 4, 8, 8),
229
+ ),
230
+ Panel(
231
+ id=10,
232
+ title="P90 latency per deployment",
233
+ description="P90 latency per deployment.",
234
+ unit="ms",
235
+ targets=[
236
+ Target(
237
+ expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (application, deployment, le))',
238
+ legend="{{application, deployment}}",
239
+ ),
240
+ Target(
241
+ expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))',
242
+ legend="Total",
243
+ ),
244
+ ],
245
+ fill=0,
246
+ stack=False,
247
+ grid_pos=GridPos(8, 4, 8, 8),
248
+ ),
249
+ Panel(
250
+ id=11,
251
+ title="P99 latency per deployment",
252
+ description="P99 latency per deployment.",
253
+ unit="ms",
254
+ targets=[
255
+ Target(
256
+ expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (application, deployment, le))',
257
+ legend="{{application, deployment}}",
258
+ ),
259
+ Target(
260
+ expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))',
261
+ legend="Total",
262
+ ),
263
+ ],
264
+ fill=0,
265
+ stack=False,
266
+ grid_pos=GridPos(16, 4, 8, 8),
267
+ ),
268
+ Panel(
269
+ id=3,
270
+ title="Queue size per deployment",
271
+ description='Number of requests queued per deployment. Ignores "Application" variable.',
272
+ unit="requests",
273
+ targets=[
274
+ Target(
275
+ expr="sum(ray_serve_deployment_queued_queries{{{global_filters}}}) by (application, deployment)",
276
+ legend="{{application, deployment}}",
277
+ ),
278
+ ],
279
+ fill=0,
280
+ stack=False,
281
+ grid_pos=GridPos(0, 5, 8, 8),
282
+ ),
283
+ Panel(
284
+ id=4,
285
+ title="Node count",
286
+ description='Number of nodes in this cluster. Ignores "Application" variable.',
287
+ unit="nodes",
288
+ targets=[
289
+ # TODO(aguo): Update this to use autoscaler metrics instead
290
+ Target(
291
+ expr="sum(autoscaler_active_nodes{{{global_filters}}}) by (NodeType)",
292
+ legend="Active Nodes: {{NodeType}}",
293
+ ),
294
+ Target(
295
+ expr="sum(autoscaler_recently_failed_nodes{{{global_filters}}}) by (NodeType)",
296
+ legend="Failed Nodes: {{NodeType}}",
297
+ ),
298
+ Target(
299
+ expr="sum(autoscaler_pending_nodes{{{global_filters}}}) by (NodeType)",
300
+ legend="Pending Nodes: {{NodeType}}",
301
+ ),
302
+ ],
303
+ grid_pos=GridPos(8, 5, 8, 8),
304
+ ),
305
+ Panel(
306
+ id=6,
307
+ title="Node network",
308
+ description='Network speed per node. Ignores "Application" variable.',
309
+ unit="Bps",
310
+ targets=[
311
+ Target(
312
+ expr="sum(ray_node_network_receive_speed{{{global_filters}}}) by (instance)",
313
+ legend="Recv: {{instance}}",
314
+ ),
315
+ Target(
316
+ expr="sum(ray_node_network_send_speed{{{global_filters}}}) by (instance)",
317
+ legend="Send: {{instance}}",
318
+ ),
319
+ ],
320
+ fill=1,
321
+ linewidth=2,
322
+ stack=False,
323
+ grid_pos=GridPos(16, 5, 8, 8),
324
+ ),
325
+ Panel(
326
+ id=20,
327
+ title="Ongoing HTTP Requests",
328
+ description="The number of ongoing requests in the HTTP Proxy.",
329
+ unit="requests",
330
+ targets=[
331
+ Target(
332
+ expr="ray_serve_num_ongoing_http_requests{{{global_filters}}}",
333
+ legend="Ongoing HTTP Requests",
334
+ ),
335
+ ],
336
+ grid_pos=GridPos(0, 6, 8, 8),
337
+ ),
338
+ Panel(
339
+ id=21,
340
+ title="Ongoing gRPC Requests",
341
+ description="The number of ongoing requests in the gRPC Proxy.",
342
+ unit="requests",
343
+ targets=[
344
+ Target(
345
+ expr="ray_serve_num_ongoing_grpc_requests{{{global_filters}}}",
346
+ legend="Ongoing gRPC Requests",
347
+ ),
348
+ ],
349
+ grid_pos=GridPos(8, 6, 8, 8),
350
+ ),
351
+ Panel(
352
+ id=22,
353
+ title="Scheduling Tasks",
354
+ description="The number of request scheduling tasks in the router.",
355
+ unit="tasks",
356
+ targets=[
357
+ Target(
358
+ expr="ray_serve_num_scheduling_tasks{{{global_filters}}}",
359
+ legend="Scheduling Tasks",
360
+ ),
361
+ ],
362
+ grid_pos=GridPos(16, 6, 8, 8),
363
+ ),
364
+ Panel(
365
+ id=23,
366
+ title="Scheduling Tasks in Backoff",
367
+ description="The number of request scheduling tasks in the router that are undergoing backoff.",
368
+ unit="tasks",
369
+ targets=[
370
+ Target(
371
+ expr="ray_serve_num_scheduling_tasks_in_backoff{{{global_filters}}}",
372
+ legend="Scheduling Tasks in Backoff",
373
+ ),
374
+ ],
375
+ grid_pos=GridPos(0, 7, 8, 8),
376
+ ),
377
+ Panel(
378
+ id=24,
379
+ title="Controller Control Loop Duration",
380
+ description="The duration of the last control loop.",
381
+ unit="seconds",
382
+ targets=[
383
+ Target(
384
+ expr="ray_serve_controller_control_loop_duration_s{{{global_filters}}}",
385
+ legend="Control Loop Duration",
386
+ ),
387
+ ],
388
+ grid_pos=GridPos(8, 7, 8, 8),
389
+ ),
390
+ Panel(
391
+ id=25,
392
+ title="Number of Control Loops",
393
+ description="The number of control loops performed by the controller. Increases monotonically over the controller's lifetime.",
394
+ unit="loops",
395
+ targets=[
396
+ Target(
397
+ expr="ray_serve_controller_num_control_loops{{{global_filters}}}",
398
+ legend="Control Loops",
399
+ ),
400
+ ],
401
+ grid_pos=GridPos(16, 7, 8, 8),
402
+ ),
403
+ ]
404
+
405
+ ids = []
406
+ for panel in SERVE_GRAFANA_PANELS:
407
+ ids.append(panel.id)
408
+ assert len(ids) == len(
409
+ set(ids)
410
+ ), f"Duplicated id found. Use unique id for each panel. {ids}"
411
+
412
+ serve_dashboard_config = DashboardConfig(
413
+ name="SERVE",
414
+ default_uid="rayServeDashboard",
415
+ panels=SERVE_GRAFANA_PANELS,
416
+ standard_global_filters=[
417
+ 'ray_io_cluster=~"$Cluster"',
418
+ ],
419
+ base_json_file_name="serve_grafana_dashboard_base.json",
420
+ )
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_deployment_dashboard_panels.py ADDED
@@ -0,0 +1,259 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # ruff: noqa: E501
2
+
3
+ from ray.dashboard.modules.metrics.dashboards.common import (
4
+ DashboardConfig,
5
+ GridPos,
6
+ Panel,
7
+ Target,
8
+ )
9
+
10
# Panels for the per-deployment Serve Grafana dashboard. Each Target expr is a
# Prometheus query template; {global_filters} is substituted at generation time.
SERVE_DEPLOYMENT_GRAFANA_PANELS = [
    Panel(
        id=1,
        title="Replicas per deployment",
        description='Number of replicas per deployment. Ignores "Route" variable.',
        unit="replicas",
        targets=[
            Target(
                expr="sum(ray_serve_deployment_replica_healthy{{{global_filters}}}) by (application, deployment)",
                legend="{{application, deployment}}",
            ),
        ],
        grid_pos=GridPos(0, 0, 8, 8),
    ),
    Panel(
        id=2,
        title="QPS per replica",
        description="QPS for each replica.",
        unit="qps",
        targets=[
            Target(
                expr='sum(rate(ray_serve_deployment_request_counter_total{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica)',
                legend="{{replica}}",
            ),
        ],
        grid_pos=GridPos(8, 0, 8, 8),
    ),
    Panel(
        id=3,
        title="Error QPS per replica",
        description="Error QPS for each replica.",
        unit="qps",
        targets=[
            Target(
                expr='sum(rate(ray_serve_deployment_error_counter_total{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica)',
                legend="{{replica}}",
            ),
        ],
        grid_pos=GridPos(16, 0, 8, 8),
    ),
    Panel(
        id=4,
        title="P50 latency per replica",
        description="P50 latency per replica.",
        unit="ms",
        targets=[
            Target(
                expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica, le))',
                legend="{{replica}}",
            ),
            Target(
                expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (le))',
                legend="Total",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(0, 1, 8, 8),
    ),
    Panel(
        id=5,
        title="P90 latency per replica",
        description="P90 latency per replica.",
        unit="ms",
        targets=[
            Target(
                expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica, le))',
                legend="{{replica}}",
            ),
            Target(
                expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (le))',
                legend="Total",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(8, 1, 8, 8),
    ),
    Panel(
        id=6,
        title="P99 latency per replica",
        description="P99 latency per replica.",
        unit="ms",
        targets=[
            Target(
                expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica, le))',
                legend="{{replica}}",
            ),
            # Fix: exclude system routes ("/-/...") from the Total series, matching
            # the P50/P90 panels and the dashboard's "excludesSystemRoutes" meta.
            Target(
                expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (le))',
                legend="Total",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(16, 1, 8, 8),
    ),
    Panel(
        id=7,
        title="Queue size per deployment",
        description='Number of requests queued per deployment. Ignores "Replica" and "Route" variable.',
        unit="requests",
        targets=[
            Target(
                expr="sum(ray_serve_deployment_queued_queries{{{global_filters}}}) by (application, deployment)",
                legend="{{application, deployment}}",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(0, 2, 12, 8),
    ),
    Panel(
        id=8,
        title="Running requests per replica",
        description="Current running requests for each replica.",
        unit="requests",
        targets=[
            Target(
                expr="sum(ray_serve_replica_processing_queries{{{global_filters}}}) by (application, deployment, replica)",
                legend="{{replica}}",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(12, 2, 12, 8),
    ),
    Panel(
        id=9,
        title="Multiplexed models per replica",
        description="The number of multiplexed models for each replica.",
        unit="models",
        targets=[
            Target(
                expr="sum(ray_serve_num_multiplexed_models{{{global_filters}}}) by (application, deployment, replica)",
                legend="{{replica}}",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(0, 3, 8, 8),
    ),
    Panel(
        id=10,
        title="Multiplexed model loads per replica",
        description="The number of times of multiplexed models loaded for each replica.",
        unit="times",
        targets=[
            Target(
                expr="sum(ray_serve_multiplexed_models_load_counter_total{{{global_filters}}}) by (application, deployment, replica)",
                legend="{{replica}}",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(8, 3, 8, 8),
    ),
    Panel(
        id=11,
        title="Multiplexed model unloads per replica",
        description="The number of times of multiplexed models unloaded for each replica.",
        unit="times",
        targets=[
            Target(
                expr="sum(ray_serve_multiplexed_models_unload_counter_total{{{global_filters}}}) by (application, deployment, replica)",
                legend="{{replica}}",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(16, 3, 8, 8),
    ),
    Panel(
        id=12,
        title="P99 latency of multiplexed model loads per replica",
        # Typo fix: "mutliplexed" -> "multiplexed".
        description="P99 latency of multiplexed model load per replica.",
        unit="ms",
        targets=[
            Target(
                expr="histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{{{global_filters}}}[5m])) by (application, deployment, replica, le))",
                legend="{{replica}}",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(0, 4, 8, 8),
    ),
    Panel(
        id=13,
        title="P99 latency of multiplexed model unloads per replica",
        # Typo fix: "mutliplexed" -> "multiplexed".
        description="P99 latency of multiplexed model unload per replica.",
        unit="ms",
        targets=[
            Target(
                expr="histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{{{global_filters}}}[5m])) by (application, deployment, replica, le))",
                legend="{{replica}}",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(8, 4, 8, 8),
    ),
    Panel(
        id=14,
        title="Multiplexed model ids per replica",
        description="The ids of multiplexed models for each replica.",
        unit="model",
        targets=[
            Target(
                expr="ray_serve_registered_multiplexed_model_id{{{global_filters}}}",
                legend="{{replica}}:{{model_id}}",
            ),
        ],
        grid_pos=GridPos(16, 4, 8, 8),
        stack=False,
    ),
    Panel(
        id=15,
        title="Multiplexed model cache hit rate",
        description="The cache hit rate of multiplexed models for the deployment.",
        unit="%",
        targets=[
            Target(
                # NOTE(review): expression yields a 0-1 fraction while unit is "%";
                # presumably Grafana's percentunit rendering is relied on — confirm.
                expr="(1 - sum(rate(ray_serve_multiplexed_models_load_counter_total{{{global_filters}}}[5m]))/sum(rate(ray_serve_multiplexed_get_model_requests_counter_total{{{global_filters}}}[5m])))",
                legend="{{replica}}",
            ),
        ],
        grid_pos=GridPos(0, 5, 8, 8),
    ),
]
240
+
241
# Sanity check: every panel in the Serve Deployment dashboard must have a
# unique id.
ids = [panel.id for panel in SERVE_DEPLOYMENT_GRAFANA_PANELS]
assert len(ids) == len(
    set(ids)
), f"Duplicated id found. Use unique id for each panel. {ids}"

# Dashboard-level config for the per-deployment Serve Grafana dashboard. The
# standard filters scope every panel query to the selected template variables.
serve_deployment_dashboard_config = DashboardConfig(
    name="SERVE_DEPLOYMENT",
    default_uid="rayServeDeploymentDashboard",
    panels=SERVE_DEPLOYMENT_GRAFANA_PANELS,
    standard_global_filters=[
        'application=~"$Application"',
        'deployment=~"$Deployment"',
        'replica=~"$Replica"',
        'ray_io_cluster=~"$Cluster"',
    ],
    base_json_file_name="serve_deployment_grafana_dashboard_base.json",
)
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_deployment_grafana_dashboard_base.json ADDED
@@ -0,0 +1,223 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "annotations": {
3
+ "list": [
4
+ {
5
+ "builtIn": 1,
6
+ "datasource": "-- Grafana --",
7
+ "enable": true,
8
+ "hide": true,
9
+ "iconColor": "rgba(0, 211, 255, 1)",
10
+ "name": "Annotations & Alerts",
11
+ "type": "dashboard"
12
+ }
13
+ ]
14
+ },
15
+ "editable": true,
16
+ "gnetId": null,
17
+ "graphTooltip": 0,
18
+ "iteration": 1667344411089,
19
+ "links": [],
20
+ "panels": [],
21
+ "refresh": false,
22
+ "schemaVersion": 27,
23
+ "style": "dark",
24
+ "tags": [],
25
+ "templating": {
26
+ "list": [
27
+ {
28
+ "current": {
29
+ "selected": false
30
+ },
31
+ "description": "Filter queries to specific prometheus type.",
32
+ "hide": 2,
33
+ "includeAll": false,
34
+ "multi": false,
35
+ "name": "datasource",
36
+ "options": [],
37
+ "query": "prometheus",
38
+ "refresh": 1,
39
+ "regex": "",
40
+ "skipUrlSync": false,
41
+ "type": "datasource"
42
+ },
43
+ {
44
+ "allValue": ".*",
45
+ "current": {
46
+ "selected": true,
47
+ "text": [
48
+ "All"
49
+ ],
50
+ "value": [
51
+ "$__all"
52
+ ]
53
+ },
54
+ "datasource": "${datasource}",
55
+ "definition": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, application)",
56
+ "description": null,
57
+ "error": null,
58
+ "hide": 0,
59
+ "includeAll": true,
60
+ "label": null,
61
+ "multi": true,
62
+ "name": "Application",
63
+ "options": [],
64
+ "query": {
65
+ "query": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, application)",
66
+ "refId": "Prometheus-Instance-Variable-Query"
67
+ },
68
+ "refresh": 2,
69
+ "regex": "",
70
+ "skipUrlSync": false,
71
+ "sort": 0,
72
+ "tagValuesQuery": "",
73
+ "tags": [],
74
+ "tagsQuery": "",
75
+ "type": "query",
76
+ "useTags": false
77
+ },
78
+ {
79
+ "allValue": ".*",
80
+ "current": {
81
+ "selected": true,
82
+ "text": [
83
+ "All"
84
+ ],
85
+ "value": [
86
+ "$__all"
87
+ ]
88
+ },
89
+ "datasource": "${datasource}",
90
+ "definition": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",{global_filters}}}, deployment)",
91
+ "description": null,
92
+ "error": null,
93
+ "hide": 0,
94
+ "includeAll": true,
95
+ "label": null,
96
+ "multi": true,
97
+ "name": "Deployment",
98
+ "options": [],
99
+ "query": {
100
+ "query": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",{global_filters}}}, deployment)",
101
+ "refId": "Prometheus-Instance-Variable-Query"
102
+ },
103
+ "refresh": 2,
104
+ "regex": "",
105
+ "skipUrlSync": false,
106
+ "sort": 0,
107
+ "tagValuesQuery": "",
108
+ "tags": [],
109
+ "tagsQuery": "",
110
+ "type": "query",
111
+ "useTags": false
112
+ },
113
+ {
114
+ "allValue": ".*",
115
+ "current": {
116
+ "selected": true,
117
+ "text": [
118
+ "All"
119
+ ],
120
+ "value": [
121
+ "$__all"
122
+ ]
123
+ },
124
+ "datasource": "${datasource}",
125
+ "definition": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",deployment=~\"$Deployment\",{global_filters}}}, replica)",
126
+ "description": null,
127
+ "error": null,
128
+ "hide": 0,
129
+ "includeAll": true,
130
+ "label": null,
131
+ "multi": true,
132
+ "name": "Replica",
133
+ "options": [],
134
+ "query": {
135
+ "query": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",deployment=~\"$Deployment\",{global_filters}}}, replica)",
136
+ "refId": "Prometheus-Instance-Variable-Query"
137
+ },
138
+ "refresh": 2,
139
+ "regex": "",
140
+ "skipUrlSync": false,
141
+ "sort": 0,
142
+ "tagValuesQuery": "",
143
+ "tags": [],
144
+ "tagsQuery": "",
145
+ "type": "query",
146
+ "useTags": false
147
+ },
148
+ {
149
+ "allValue": ".*",
150
+ "current": {
151
+ "selected": true,
152
+ "text": [
153
+ "All"
154
+ ],
155
+ "value": [
156
+ "$__all"
157
+ ]
158
+ },
159
+ "datasource": "${datasource}",
160
+ "definition": "label_values(ray_serve_deployment_request_counter{{deployment=~\"$Deployment\",{global_filters}}}, route)",
161
+ "description": null,
162
+ "error": null,
163
+ "hide": 0,
164
+ "includeAll": true,
165
+ "label": null,
166
+ "multi": true,
167
+ "name": "Route",
168
+ "options": [],
169
+ "query": {
170
+ "query": "label_values(ray_serve_deployment_request_counter{{deployment=~\"$Deployment\",{global_filters}}}, route)",
171
+ "refId": "Prometheus-Instance-Variable-Query"
172
+ },
173
+ "refresh": 2,
174
+ "regex": "",
175
+ "skipUrlSync": false,
176
+ "sort": 0,
177
+ "tagValuesQuery": "",
178
+ "tags": [],
179
+ "tagsQuery": "",
180
+ "type": "query",
181
+ "useTags": false
182
+ },
183
+ {
184
+ "current": {
185
+ "selected": false
186
+ },
187
+ "datasource": "${datasource}",
188
+ "definition": "label_values(ray_node_network_receive_speed{{{global_filters}}}, ray_io_cluster)",
189
+ "description": "Filter queries to specific Ray clusters for KubeRay. When ingesting metrics across multiple ray clusters, the ray_io_cluster label should be set per cluster. For KubeRay users, this is done automaticaly with Prometheus PodMonitor.",
190
+ "error": null,
191
+ "hide": 0,
192
+ "includeAll": false,
193
+ "label": null,
194
+ "multi": false,
195
+ "name": "Cluster",
196
+ "options": [],
197
+ "query": {
198
+ "query": "label_values(ray_node_network_receive_speed{{{global_filters}}}, ray_io_cluster)",
199
+ "refId": "StandardVariableQuery"
200
+ },
201
+ "refresh": 2,
202
+ "regex": "",
203
+ "skipUrlSync": false,
204
+ "sort": 2,
205
+ "tagValuesQuery": "",
206
+ "tags": [],
207
+ "tagsQuery": "",
208
+ "type": "query",
209
+ "useTags": false
210
+ }
211
+ ]
212
+ },
213
+ "rayMeta": ["excludesSystemRoutes"],
214
+ "time": {
215
+ "from": "now-30m",
216
+ "to": "now"
217
+ },
218
+ "timepicker": {},
219
+ "timezone": "",
220
+ "title": "Serve Deployment Dashboard",
221
+ "uid": "rayServeDeploymentDashboard",
222
+ "version": 1
223
+ }
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_grafana_dashboard_base.json ADDED
@@ -0,0 +1,188 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "annotations": {
3
+ "list": [
4
+ {
5
+ "builtIn": 1,
6
+ "datasource": "-- Grafana --",
7
+ "enable": true,
8
+ "hide": true,
9
+ "iconColor": "rgba(0, 211, 255, 1)",
10
+ "name": "Annotations & Alerts",
11
+ "type": "dashboard"
12
+ }
13
+ ]
14
+ },
15
+ "editable": true,
16
+ "gnetId": null,
17
+ "graphTooltip": 0,
18
+ "iteration": 1667344411089,
19
+ "links": [],
20
+ "panels": [],
21
+ "refresh": false,
22
+ "schemaVersion": 27,
23
+ "style": "dark",
24
+ "tags": [],
25
+ "templating": {
26
+ "list": [
27
+ {
28
+ "current": {
29
+ "selected": false
30
+ },
31
+ "description": "Filter queries of a specific Prometheus type.",
32
+ "hide": 2,
33
+ "includeAll": false,
34
+ "multi": false,
35
+ "name": "datasource",
36
+ "options": [],
37
+ "query": "prometheus",
38
+ "refresh": 1,
39
+ "regex": "",
40
+ "skipUrlSync": false,
41
+ "type": "datasource"
42
+ },
43
+ {
44
+ "allValue": ".*",
45
+ "current": {
46
+ "selected": true,
47
+ "text": [
48
+ "All"
49
+ ],
50
+ "value": [
51
+ "$__all"
52
+ ]
53
+ },
54
+ "datasource": "${datasource}",
55
+ "definition": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, application)",
56
+ "description": null,
57
+ "error": null,
58
+ "hide": 0,
59
+ "includeAll": true,
60
+ "label": null,
61
+ "multi": true,
62
+ "name": "Application",
63
+ "options": [],
64
+ "query": {
65
+ "query": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, application)",
66
+ "refId": "Prometheus-Instance-Variable-Query"
67
+ },
68
+ "refresh": 2,
69
+ "regex": "",
70
+ "skipUrlSync": false,
71
+ "sort": 0,
72
+ "tagValuesQuery": "",
73
+ "tags": [],
74
+ "tagsQuery": "",
75
+ "type": "query",
76
+ "useTags": false
77
+ },
78
+ {
79
+ "allValue": ".*",
80
+ "current": {
81
+ "selected": true,
82
+ "text": [
83
+ "All"
84
+ ],
85
+ "value": [
86
+ "$__all"
87
+ ]
88
+ },
89
+ "datasource": "${datasource}",
90
+ "definition": "label_values(ray_serve_num_http_requests_total{{{global_filters}}}, route)",
91
+ "description": null,
92
+ "error": null,
93
+ "hide": 0,
94
+ "includeAll": true,
95
+ "label": "HTTP Route",
96
+ "multi": true,
97
+ "name": "HTTP_Route",
98
+ "options": [],
99
+ "query": {
100
+ "query": "label_values(ray_serve_num_http_requests_total{{{global_filters}}}, route)",
101
+ "refId": "Prometheus-Instance-Variable-Query"
102
+ },
103
+ "refresh": 2,
104
+ "regex": "",
105
+ "skipUrlSync": false,
106
+ "sort": 0,
107
+ "tagValuesQuery": "",
108
+ "tags": [],
109
+ "tagsQuery": "",
110
+ "type": "query",
111
+ "useTags": false
112
+ },
113
+ {
114
+ "allValue": ".*",
115
+ "current": {
116
+ "selected": true,
117
+ "text": [
118
+ "All"
119
+ ],
120
+ "value": [
121
+ "$__all"
122
+ ]
123
+ },
124
+ "datasource": "${datasource}",
125
+ "definition": "label_values(ray_serve_num_grpc_requests{{{global_filters}}}, method)",
126
+ "description": null,
127
+ "error": null,
128
+ "hide": 0,
129
+ "includeAll": true,
130
+ "label": "gRPC Service Method",
131
+ "multi": true,
132
+ "name": "gRPC_Method",
133
+ "options": [],
134
+ "query": {
135
+ "query": "label_values(ray_serve_num_grpc_requests{{{global_filters}}}, method)",
136
+ "refId": "Prometheus-Instance-Variable-Query"
137
+ },
138
+ "refresh": 2,
139
+ "regex": "",
140
+ "skipUrlSync": false,
141
+ "sort": 0,
142
+ "tagValuesQuery": "",
143
+ "tags": [],
144
+ "tagsQuery": "",
145
+ "type": "query",
146
+ "useTags": false
147
+ },
148
+ {
149
+ "current": {
150
+ "selected": false
151
+ },
152
+ "datasource": "${datasource}",
153
+ "definition": "label_values(ray_node_network_receive_speed{{{global_filters}}}, ray_io_cluster)",
154
+ "description": "Filter queries to specific Ray clusters for KubeRay. When ingesting metrics across multiple ray clusters, the ray_io_cluster label should be set per cluster. For KubeRay users, this is done automaticaly with Prometheus PodMonitor.",
155
+ "error": null,
156
+ "hide": 0,
157
+ "includeAll": false,
158
+ "label": null,
159
+ "multi": false,
160
+ "name": "Cluster",
161
+ "options": [],
162
+ "query": {
163
+ "query": "label_values(ray_node_network_receive_speed{{{global_filters}}}, ray_io_cluster)",
164
+ "refId": "StandardVariableQuery"
165
+ },
166
+ "refresh": 2,
167
+ "regex": "",
168
+ "skipUrlSync": false,
169
+ "sort": 2,
170
+ "tagValuesQuery": "",
171
+ "tags": [],
172
+ "tagsQuery": "",
173
+ "type": "query",
174
+ "useTags": false
175
+ }
176
+ ]
177
+ },
178
+ "rayMeta": ["excludesSystemRoutes"],
179
+ "time": {
180
+ "from": "now-30m",
181
+ "to": "now"
182
+ },
183
+ "timepicker": {},
184
+ "timezone": "",
185
+ "title": "Serve Dashboard",
186
+ "uid": "rayServeDashboard",
187
+ "version": 1
188
+ }
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/export/prometheus/prometheus.yml ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # my global config
2
+ global:
3
+ scrape_interval: 10s # Set the scrape interval to every 10 seconds. Default is every 1 minute.
4
+ evaluation_interval: 10s # Evaluate rules every 10 seconds. The default is every 1 minute.
5
+ # scrape_timeout is set to the global default (10s).
6
+
7
+ scrape_configs:
8
+ # Scrape from each Ray node as defined in the service_discovery.json provided by Ray.
9
+ - job_name: 'ray'
10
+ file_sd_configs:
11
+ - files:
12
+ - '/tmp/ray/prom_metrics_service_discovery.json'
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/grafana_dashboard_factory.py ADDED
@@ -0,0 +1,301 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import copy
2
+ import json
3
+ import os
4
+ from dataclasses import asdict
5
+ from typing import List, Tuple
6
+
7
+ import ray
8
+ from ray.dashboard.modules.metrics.dashboards.common import DashboardConfig, Panel
9
+ from ray.dashboard.modules.metrics.dashboards.data_dashboard_panels import (
10
+ data_dashboard_config,
11
+ )
12
+ from ray.dashboard.modules.metrics.dashboards.default_dashboard_panels import (
13
+ default_dashboard_config,
14
+ )
15
+ from ray.dashboard.modules.metrics.dashboards.serve_dashboard_panels import (
16
+ serve_dashboard_config,
17
+ )
18
+ from ray.dashboard.modules.metrics.dashboards.serve_deployment_dashboard_panels import (
19
+ serve_deployment_dashboard_config,
20
+ )
21
+
22
+ GRAFANA_DASHBOARD_UID_OVERRIDE_ENV_VAR_TEMPLATE = "RAY_GRAFANA_{name}_DASHBOARD_UID"
23
+ GRAFANA_DASHBOARD_GLOBAL_FILTERS_OVERRIDE_ENV_VAR_TEMPLATE = (
24
+ "RAY_GRAFANA_{name}_DASHBOARD_GLOBAL_FILTERS"
25
+ )
26
+
27
# Skeleton for a single Grafana query target. Per-target fields ("expr",
# "legendFormat", "refId") are overwritten in _generate_targets.
TARGET_TEMPLATE = {
    "exemplar": True,
    "expr": "0",
    "interval": "",
    "legendFormat": "",
    "queryType": "randomWalk",
    "refId": "A",
}
35
+
36
+
37
# Skeleton for a Grafana "graph" panel. _generate_grafana_panels deep-copies
# this and overwrites the per-panel fields: "title", "description", "id",
# "targets", "gridPos", yaxes[0]["format"], "fill", "stack", "linewidth".
PANEL_TEMPLATE = {
    "aliasColors": {},
    "bars": False,
    "dashLength": 10,
    "dashes": False,
    "datasource": r"${datasource}",
    "description": "<Description>",
    "fieldConfig": {"defaults": {}, "overrides": []},
    "fill": 10,
    "fillGradient": 0,
    "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
    "hiddenSeries": False,
    "id": 26,
    "legend": {
        "alignAsTable": True,
        "avg": False,
        "current": True,
        "hideEmpty": False,
        "hideZero": True,
        "max": False,
        "min": False,
        "rightSide": False,
        "show": True,
        "sort": "current",
        "sortDesc": True,
        "total": False,
        "values": True,
    },
    "lines": True,
    "linewidth": 1,
    "nullPointMode": "null",
    "options": {"alertThreshold": True},
    "percentage": False,
    "pluginVersion": "7.5.17",
    "pointradius": 2,
    "points": False,
    "renderer": "flot",
    # Style overrides matched by series alias; e.g. MAX / MAX + PENDING are
    # drawn as dashed, unstacked reference lines.
    "seriesOverrides": [
        {
            "$$hashKey": "object:2987",
            "alias": "MAX",
            "dashes": True,
            "color": "#1F60C4",
            "fill": 0,
            "stack": False,
        },
        {
            "$$hashKey": "object:78",
            "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
            "hiddenSeries": True,
        },
        {
            "$$hashKey": "object:2987",
            "alias": "MAX + PENDING",
            "dashes": True,
            "color": "#777777",
            "fill": 0,
            "stack": False,
        },
    ],
    "spaceLength": 10,
    "stack": True,
    "steppedLine": False,
    "targets": [],
    "thresholds": [],
    "timeFrom": None,
    "timeRegions": [],
    "timeShift": None,
    "title": "<Title>",
    "tooltip": {"shared": True, "sort": 0, "value_type": "individual"},
    "type": "graph",
    "xaxis": {
        "buckets": None,
        "mode": "time",
        "name": None,
        "show": True,
        "values": [],
    },
    "yaxes": [
        {
            "$$hashKey": "object:628",
            "format": "units",
            "label": "",
            "logBase": 1,
            "max": None,
            "min": "0",
            "show": True,
        },
        {
            "$$hashKey": "object:629",
            "format": "short",
            "label": None,
            "logBase": 1,
            "max": None,
            "min": None,
            "show": True,
        },
    ],
    "yaxis": {"align": False, "alignLevel": None},
}
137
+
138
+
139
def _read_configs_for_dashboard(
    dashboard_config: DashboardConfig,
) -> Tuple[str, List[str]]:
    """
    Reads environment variable configs for overriding uid or global_filters for a given
    dashboard.

    Returns:
        Tuple with format uid, global_filters
    """
    uid_env_var = GRAFANA_DASHBOARD_UID_OVERRIDE_ENV_VAR_TEMPLATE.format(
        name=dashboard_config.name
    )
    uid = os.environ.get(uid_env_var) or dashboard_config.default_uid

    filters_env_var = GRAFANA_DASHBOARD_GLOBAL_FILTERS_OVERRIDE_ENV_VAR_TEMPLATE.format(
        name=dashboard_config.name
    )
    # NOTE(review): an unset/empty override yields [""] (one empty filter
    # entry); downstream ",".join() makes this a no-op — confirm before
    # changing to [].
    global_filters = (os.environ.get(filters_env_var) or "").split(",")

    return uid, global_filters
168
+
169
+
170
def generate_default_grafana_dashboard() -> Tuple[str, str]:
    """
    Generates the dashboard output for the default dashboard and returns
    both the content and the uid.

    Returns:
        Tuple with format content, uid
    """
    return _generate_grafana_dashboard(default_dashboard_config)
179
+
180
+
181
def generate_serve_grafana_dashboard() -> Tuple[str, str]:
    """
    Generates the dashboard output for the serve dashboard and returns
    both the content and the uid.

    Returns:
        Tuple with format content, uid
    """
    return _generate_grafana_dashboard(serve_dashboard_config)
190
+
191
+
192
def generate_serve_deployment_grafana_dashboard() -> Tuple[str, str]:
    """
    Generates the dashboard output for the serve deployment dashboard and
    returns both the content and the uid.

    Returns:
        Tuple with format content, uid
    """
    return _generate_grafana_dashboard(serve_deployment_dashboard_config)
201
+
202
+
203
def generate_data_grafana_dashboard() -> Tuple[str, str]:
    """
    Generates the dashboard output for the data dashboard and returns
    both the content and the uid.

    Returns:
        Tuple with format content, uid
    """
    return _generate_grafana_dashboard(data_dashboard_config)
212
+
213
+
214
def _generate_grafana_dashboard(
    dashboard_config: DashboardConfig,
) -> Tuple[str, str]:
    """
    Generate the complete Grafana dashboard JSON for the given config.

    Returns:
        Tuple with format dashboard_content, uid

    Note: the return annotation was corrected from ``str`` to
    ``Tuple[str, str]`` to match the actual return value.
    """
    uid, global_filters = _read_configs_for_dashboard(dashboard_config)
    panels = _generate_grafana_panels(dashboard_config, global_filters)
    base_file_name = dashboard_config.base_json_file_name

    # Use a context manager so the file handle is closed deterministically;
    # the previous json.load(open(...)) leaked the handle.
    base_json_path = os.path.join(
        os.path.dirname(__file__), "dashboards", base_file_name
    )
    with open(base_json_path) as f:
        base_json = json.load(f)
    base_json["panels"] = panels
    # Update variables to use global_filters, so template variables (e.g.
    # $Application) are scoped the same way as the panel queries.
    global_filters_str = ",".join(global_filters)
    variables = base_json.get("templating", {}).get("list", [])
    for variable in variables:
        if "definition" not in variable:
            continue
        variable["definition"] = variable["definition"].format(
            global_filters=global_filters_str
        )
        variable["query"]["query"] = variable["query"]["query"].format(
            global_filters=global_filters_str
        )

    # Tag the dashboard with the Ray version that generated it.
    tags = base_json.get("tags", []) or []
    tags.append(f"rayVersion:{ray.__version__}")
    base_json["tags"] = tags
    base_json["uid"] = uid
    # Ray metadata can be used to put arbitrary metadata
    ray_meta = base_json.get("rayMeta", []) or []
    ray_meta.append("supportsGlobalFilterOverride")
    base_json["rayMeta"] = ray_meta
    return json.dumps(base_json, indent=4), uid
249
+
250
+
251
def _generate_grafana_panels(
    config: DashboardConfig, global_filters: List[str]
) -> List[dict]:
    """Render every panel in ``config`` as a Grafana panel dict.

    ``global_filters`` are appended to the dashboard's standard filters and
    substituted into each target expression.
    """
    combined_filters = [*config.standard_global_filters, *global_filters]
    rendered = []
    for index, panel in enumerate(config.panels):
        panel_json = copy.deepcopy(PANEL_TEMPLATE)
        panel_json["title"] = panel.title
        panel_json["description"] = panel.description
        panel_json["id"] = panel.id
        panel_json["targets"] = _generate_targets(panel, combined_filters)
        if panel.grid_pos:
            panel_json["gridPos"] = asdict(panel.grid_pos)
        else:
            # Default layout: two panels per row, 12 grid units wide each.
            panel_json["gridPos"]["y"] = index // 2
            panel_json["gridPos"]["x"] = 12 * (index % 2)
        panel_json["yaxes"][0]["format"] = panel.unit
        panel_json["fill"] = panel.fill
        panel_json["stack"] = panel.stack
        panel_json["linewidth"] = panel.linewidth
        rendered.append(panel_json)
    return rendered
277
+
278
+
279
def gen_incrementing_alphabets(length):
    """Return the first ``length`` uppercase letters: ["A", "B", ...].

    Used to assign Grafana target refIds, one letter per target.
    """
    # Bug fix: the previous check (65 + length < 96) allowed lengths up to 30,
    # producing characters past 'Z' ('[', '\\', ...), contradicting the message.
    assert length <= 26, "we only support up to 26 targets at a time."
    # 65: ascii code of 'A'.
    return list(map(chr, range(65, 65 + length)))
283
+
284
+
285
def _generate_targets(panel: Panel, panel_global_filters: List[str]) -> List[dict]:
    """Render each of ``panel``'s targets as a Grafana target dict.

    Targets receive refIds "A", "B", ... in order, and the joined global
    filters are substituted into each expression's {global_filters} slot.
    """
    # The joined filter string is loop-invariant, so build it once.
    filters_str = ",".join(panel_global_filters)
    ref_ids = gen_incrementing_alphabets(len(panel.targets))
    rendered = []
    for target, ref_id in zip(panel.targets, ref_ids):
        entry = copy.deepcopy(TARGET_TEMPLATE)
        entry["expr"] = target.expr.format(global_filters=filters_str)
        entry["legendFormat"] = target.legend
        entry["refId"] = ref_id
        rendered.append(entry)
    return rendered