Add files using upload-large-folder tool
Browse filesThis view is limited to 50 files because it contains too many changes.
See raw diff
- .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/consts.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/dashboard.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/dashboard_metrics.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/datacenter.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/http_server_agent.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/http_server_head.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/k8s_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/memory_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/optional_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/routes.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/state_api_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/timezone_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-100.c2aa4ab115bf9c6057cb.woff2 +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-100italic.7f839a8652da29745ce4.woff2 +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300.37a7069dc30fc663c878.woff2 +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300italic.bd5b7a13f2c52b531a2a.woff +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300italic.c64e7e354c88e613c77c.woff2 +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-500.f5b74d7ffcdf85b9dd60.woff2 +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-500italic.0d8bb5b3ee5f5dac9e44.woff2 +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-700.c18ee39fb002ad58b6dc.woff2 +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-700italic.7d8125ff7f707231fd89.woff2 +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/actor_consts.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/actor_head.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/actor_consts.py +5 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/actor_head.py +290 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_agent.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_consts.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_head.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_utils.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_agent.py +133 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_consts.py +21 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_head.py +212 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__init__.py +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/__init__.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/grafana_dashboard_factory.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/metrics_head.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/templates.cpython-311.pyc +0 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/common.py +70 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/data_dashboard_panels.py +551 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py +478 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_dashboard_panels.py +420 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_deployment_dashboard_panels.py +259 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_deployment_grafana_dashboard_base.json +223 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_grafana_dashboard_base.json +188 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/export/prometheus/prometheus.yml +12 -0
- .venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/grafana_dashboard_factory.py +301 -0
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/consts.cpython-311.pyc
ADDED
|
Binary file (3.41 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/dashboard.cpython-311.pyc
ADDED
|
Binary file (10.2 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/dashboard_metrics.cpython-311.pyc
ADDED
|
Binary file (4.28 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/datacenter.cpython-311.pyc
ADDED
|
Binary file (14 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/http_server_agent.cpython-311.pyc
ADDED
|
Binary file (4.98 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/http_server_head.cpython-311.pyc
ADDED
|
Binary file (16.5 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/k8s_utils.cpython-311.pyc
ADDED
|
Binary file (5.35 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/memory_utils.cpython-311.pyc
ADDED
|
Binary file (24.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/optional_utils.cpython-311.pyc
ADDED
|
Binary file (9.41 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/routes.cpython-311.pyc
ADDED
|
Binary file (10.7 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/state_api_utils.cpython-311.pyc
ADDED
|
Binary file (11.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/timezone_utils.cpython-311.pyc
ADDED
|
Binary file (3.34 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-100.c2aa4ab115bf9c6057cb.woff2
ADDED
|
Binary file (15.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-100italic.7f839a8652da29745ce4.woff2
ADDED
|
Binary file (17 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300.37a7069dc30fc663c878.woff2
ADDED
|
Binary file (15.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300italic.bd5b7a13f2c52b531a2a.woff
ADDED
|
Binary file (22.2 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300italic.c64e7e354c88e613c77c.woff2
ADDED
|
Binary file (17.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-500.f5b74d7ffcdf85b9dd60.woff2
ADDED
|
Binary file (15.9 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-500italic.0d8bb5b3ee5f5dac9e44.woff2
ADDED
|
Binary file (17.3 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-700.c18ee39fb002ad58b6dc.woff2
ADDED
|
Binary file (15.8 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-700italic.7d8125ff7f707231fd89.woff2
ADDED
|
Binary file (17 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (200 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/actor_consts.cpython-311.pyc
ADDED
|
Binary file (459 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/actor_head.cpython-311.pyc
ADDED
|
Binary file (13.9 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/actor_consts.py
ADDED
|
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import ray
|
| 2 |
+
|
| 3 |
+
ACTOR_CHANNEL = "ACTOR"
|
| 4 |
+
NIL_NODE_ID = ray.NodeID.nil().hex()
|
| 5 |
+
RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS = 1
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/actor_head.py
ADDED
|
@@ -0,0 +1,290 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import logging
|
| 3 |
+
from collections import defaultdict, deque
|
| 4 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 5 |
+
from typing import Any, Dict
|
| 6 |
+
|
| 7 |
+
import aiohttp.web
|
| 8 |
+
|
| 9 |
+
import ray
|
| 10 |
+
import ray.dashboard.optional_utils as dashboard_optional_utils
|
| 11 |
+
import ray.dashboard.utils as dashboard_utils
|
| 12 |
+
from ray._private.gcs_pubsub import GcsAioActorSubscriber
|
| 13 |
+
from ray._private.utils import get_or_create_event_loop
|
| 14 |
+
from ray.dashboard.consts import GCS_RPC_TIMEOUT_SECONDS
|
| 15 |
+
from ray.dashboard.datacenter import DataOrganizer, DataSource
|
| 16 |
+
from ray.dashboard.modules.actor import actor_consts
|
| 17 |
+
|
| 18 |
+
logger = logging.getLogger(__name__)
|
| 19 |
+
routes = dashboard_optional_utils.DashboardHeadRouteTable
|
| 20 |
+
|
| 21 |
+
MAX_DESTROYED_ACTORS_TO_CACHE = max(
|
| 22 |
+
0, ray._config.maximum_gcs_destroyed_actor_cached_count()
|
| 23 |
+
)
|
| 24 |
+
|
| 25 |
+
ACTOR_CLEANUP_FREQUENCY = 1 # seconds
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
ACTOR_TABLE_STATE_COLUMNS = (
|
| 29 |
+
"state",
|
| 30 |
+
"address",
|
| 31 |
+
"numRestarts",
|
| 32 |
+
"timestamp",
|
| 33 |
+
"pid",
|
| 34 |
+
"exitDetail",
|
| 35 |
+
"startTime",
|
| 36 |
+
"endTime",
|
| 37 |
+
"reprName",
|
| 38 |
+
)
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def actor_table_data_to_dict(message):
|
| 42 |
+
orig_message = dashboard_utils.message_to_dict(
|
| 43 |
+
message,
|
| 44 |
+
{
|
| 45 |
+
"actorId",
|
| 46 |
+
"parentId",
|
| 47 |
+
"jobId",
|
| 48 |
+
"workerId",
|
| 49 |
+
"rayletId",
|
| 50 |
+
"callerId",
|
| 51 |
+
"taskId",
|
| 52 |
+
"parentTaskId",
|
| 53 |
+
"sourceActorId",
|
| 54 |
+
"placementGroupId",
|
| 55 |
+
},
|
| 56 |
+
always_print_fields_with_no_presence=True,
|
| 57 |
+
)
|
| 58 |
+
# The complete schema for actor table is here:
|
| 59 |
+
# src/ray/protobuf/gcs.proto
|
| 60 |
+
# It is super big and for dashboard, we don't need that much information.
|
| 61 |
+
# Only preserve the necessary ones here for memory usage.
|
| 62 |
+
fields = {
|
| 63 |
+
"actorId",
|
| 64 |
+
"jobId",
|
| 65 |
+
"pid",
|
| 66 |
+
"address",
|
| 67 |
+
"state",
|
| 68 |
+
"name",
|
| 69 |
+
"numRestarts",
|
| 70 |
+
"timestamp",
|
| 71 |
+
"className",
|
| 72 |
+
"startTime",
|
| 73 |
+
"endTime",
|
| 74 |
+
"reprName",
|
| 75 |
+
"placementGroupId",
|
| 76 |
+
"callSite",
|
| 77 |
+
}
|
| 78 |
+
light_message = {k: v for (k, v) in orig_message.items() if k in fields}
|
| 79 |
+
light_message["actorClass"] = orig_message["className"]
|
| 80 |
+
exit_detail = "-"
|
| 81 |
+
if "deathCause" in orig_message:
|
| 82 |
+
context = orig_message["deathCause"]
|
| 83 |
+
if "actorDiedErrorContext" in context:
|
| 84 |
+
exit_detail = context["actorDiedErrorContext"]["errorMessage"] # noqa
|
| 85 |
+
elif "runtimeEnvFailedContext" in context:
|
| 86 |
+
exit_detail = context["runtimeEnvFailedContext"]["errorMessage"] # noqa
|
| 87 |
+
elif "actorUnschedulableContext" in context:
|
| 88 |
+
exit_detail = context["actorUnschedulableContext"]["errorMessage"] # noqa
|
| 89 |
+
elif "creationTaskFailureContext" in context:
|
| 90 |
+
exit_detail = context["creationTaskFailureContext"][
|
| 91 |
+
"formattedExceptionString"
|
| 92 |
+
] # noqa
|
| 93 |
+
light_message["exitDetail"] = exit_detail
|
| 94 |
+
light_message["startTime"] = int(light_message["startTime"])
|
| 95 |
+
light_message["endTime"] = int(light_message["endTime"])
|
| 96 |
+
light_message["requiredResources"] = dict(message.required_resources)
|
| 97 |
+
|
| 98 |
+
return light_message
|
| 99 |
+
|
| 100 |
+
|
| 101 |
+
class ActorHead(dashboard_utils.DashboardHeadModule):
|
| 102 |
+
def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig):
|
| 103 |
+
super().__init__(config)
|
| 104 |
+
|
| 105 |
+
self._gcs_actor_channel_subscriber = None
|
| 106 |
+
# A queue of dead actors in order of when they died
|
| 107 |
+
self.destroyed_actors_queue = deque()
|
| 108 |
+
|
| 109 |
+
# -- Internal state --
|
| 110 |
+
self._loop = get_or_create_event_loop()
|
| 111 |
+
# NOTE: This executor is intentionally constrained to just 1 thread to
|
| 112 |
+
# limit its concurrency, therefore reducing potential for GIL contention
|
| 113 |
+
self._executor = ThreadPoolExecutor(
|
| 114 |
+
max_workers=1, thread_name_prefix="actor_head_executor"
|
| 115 |
+
)
|
| 116 |
+
|
| 117 |
+
async def _update_actors(self):
|
| 118 |
+
"""
|
| 119 |
+
Processes actor info. First gets all actors from GCS, then subscribes to
|
| 120 |
+
actor updates. For each actor update, updates DataSource.node_actors and
|
| 121 |
+
DataSource.actors.
|
| 122 |
+
"""
|
| 123 |
+
|
| 124 |
+
# To prevent Time-of-check to time-of-use issue [1], the get-all-actor-info
|
| 125 |
+
# happens after the subscription. That is, an update between get-all-actor-info
|
| 126 |
+
# and the subscription is not missed.
|
| 127 |
+
#
|
| 128 |
+
# [1] https://en.wikipedia.org/wiki/Time-of-check_to_time-of-use
|
| 129 |
+
gcs_addr = self.gcs_address
|
| 130 |
+
actor_channel_subscriber = GcsAioActorSubscriber(address=gcs_addr)
|
| 131 |
+
await actor_channel_subscriber.subscribe()
|
| 132 |
+
|
| 133 |
+
# Get all actor info.
|
| 134 |
+
while True:
|
| 135 |
+
try:
|
| 136 |
+
logger.info("Getting all actor info from GCS.")
|
| 137 |
+
|
| 138 |
+
actor_dicts = await self._get_all_actors()
|
| 139 |
+
# Update actors
|
| 140 |
+
DataSource.actors.reset(actor_dicts)
|
| 141 |
+
|
| 142 |
+
# Update node actors and job actors.
|
| 143 |
+
node_actors = defaultdict(dict)
|
| 144 |
+
for actor_id_bytes, updated_actor_table in actor_dicts.items():
|
| 145 |
+
node_id = updated_actor_table["address"]["rayletId"]
|
| 146 |
+
# Update only when node_id is not Nil.
|
| 147 |
+
if node_id != actor_consts.NIL_NODE_ID:
|
| 148 |
+
node_actors[node_id][actor_id_bytes] = updated_actor_table
|
| 149 |
+
|
| 150 |
+
# Update node's actor info
|
| 151 |
+
DataSource.node_actors.reset(node_actors)
|
| 152 |
+
|
| 153 |
+
logger.info("Received %d actor info from GCS.", len(actor_dicts))
|
| 154 |
+
|
| 155 |
+
# Break, once all initial actors are successfully fetched
|
| 156 |
+
break
|
| 157 |
+
except Exception as e:
|
| 158 |
+
logger.exception("Error Getting all actor info from GCS", exc_info=e)
|
| 159 |
+
await asyncio.sleep(
|
| 160 |
+
actor_consts.RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS
|
| 161 |
+
)
|
| 162 |
+
|
| 163 |
+
# Pull incremental updates from the GCS channel
|
| 164 |
+
while True:
|
| 165 |
+
try:
|
| 166 |
+
updated_actor_table_entries = await self._poll_updated_actor_table_data(
|
| 167 |
+
actor_channel_subscriber
|
| 168 |
+
)
|
| 169 |
+
|
| 170 |
+
for (
|
| 171 |
+
actor_id,
|
| 172 |
+
updated_actor_table,
|
| 173 |
+
) in updated_actor_table_entries.items():
|
| 174 |
+
self._process_updated_actor_table(actor_id, updated_actor_table)
|
| 175 |
+
|
| 176 |
+
# TODO emit metrics
|
| 177 |
+
logger.debug(
|
| 178 |
+
f"Total events processed: {len(updated_actor_table_entries)}, "
|
| 179 |
+
f"queue size: {actor_channel_subscriber.queue_size}"
|
| 180 |
+
)
|
| 181 |
+
|
| 182 |
+
except Exception as e:
|
| 183 |
+
logger.exception("Error processing actor info from GCS.", exc_info=e)
|
| 184 |
+
|
| 185 |
+
async def _poll_updated_actor_table_data(
|
| 186 |
+
self, actor_channel_subscriber: GcsAioActorSubscriber
|
| 187 |
+
) -> Dict[str, Dict[str, Any]]:
|
| 188 |
+
# TODO make batch size configurable
|
| 189 |
+
batch = await actor_channel_subscriber.poll(batch_size=200)
|
| 190 |
+
|
| 191 |
+
# NOTE: We're offloading conversion to a TPE to make sure we're not
|
| 192 |
+
# blocking the event-loop for prolonged period of time irrespective
|
| 193 |
+
# of the batch size
|
| 194 |
+
def _convert_to_dict():
|
| 195 |
+
return {
|
| 196 |
+
actor_id_bytes.hex(): actor_table_data_to_dict(actor_table_data_message)
|
| 197 |
+
for actor_id_bytes, actor_table_data_message in batch
|
| 198 |
+
if actor_id_bytes is not None
|
| 199 |
+
}
|
| 200 |
+
|
| 201 |
+
return await self._loop.run_in_executor(self._executor, _convert_to_dict)
|
| 202 |
+
|
| 203 |
+
def _process_updated_actor_table(
|
| 204 |
+
self, actor_id: str, actor_table_data: Dict[str, Any]
|
| 205 |
+
):
|
| 206 |
+
"""NOTE: This method has to be executed on the event-loop, provided that it
|
| 207 |
+
accesses DataSource data structures (to follow its thread-safety model)"""
|
| 208 |
+
|
| 209 |
+
# If actor is not new registered but updated, we only update
|
| 210 |
+
# states related fields.
|
| 211 |
+
actor = DataSource.actors.get(actor_id)
|
| 212 |
+
|
| 213 |
+
if actor and actor_table_data["state"] != "DEPENDENCIES_UNREADY":
|
| 214 |
+
for k in ACTOR_TABLE_STATE_COLUMNS:
|
| 215 |
+
if k in actor_table_data:
|
| 216 |
+
actor[k] = actor_table_data[k]
|
| 217 |
+
actor_table_data = actor
|
| 218 |
+
|
| 219 |
+
actor_id = actor_table_data["actorId"]
|
| 220 |
+
node_id = actor_table_data["address"]["rayletId"]
|
| 221 |
+
|
| 222 |
+
if actor_table_data["state"] == "DEAD":
|
| 223 |
+
self.destroyed_actors_queue.append(actor_id)
|
| 224 |
+
|
| 225 |
+
# Update actors.
|
| 226 |
+
DataSource.actors[actor_id] = actor_table_data
|
| 227 |
+
# Update node actors (only when node_id is not Nil).
|
| 228 |
+
if node_id != actor_consts.NIL_NODE_ID:
|
| 229 |
+
node_actors = DataSource.node_actors.get(node_id, {})
|
| 230 |
+
node_actors[actor_id] = actor_table_data
|
| 231 |
+
DataSource.node_actors[node_id] = node_actors
|
| 232 |
+
|
| 233 |
+
async def _get_all_actors(self) -> Dict[str, dict]:
|
| 234 |
+
actors = await self.gcs_aio_client.get_all_actor_info(
|
| 235 |
+
timeout=GCS_RPC_TIMEOUT_SECONDS
|
| 236 |
+
)
|
| 237 |
+
|
| 238 |
+
# NOTE: We're offloading conversion to a TPE to make sure we're not
|
| 239 |
+
# blocking the event-loop for prolonged period of time for large clusters
|
| 240 |
+
def _convert_to_dict():
|
| 241 |
+
return {
|
| 242 |
+
actor_id.hex(): actor_table_data_to_dict(actor_table_data)
|
| 243 |
+
for actor_id, actor_table_data in actors.items()
|
| 244 |
+
}
|
| 245 |
+
|
| 246 |
+
return await self._loop.run_in_executor(self._executor, _convert_to_dict)
|
| 247 |
+
|
| 248 |
+
async def _cleanup_actors(self):
|
| 249 |
+
while True:
|
| 250 |
+
try:
|
| 251 |
+
while len(self.destroyed_actors_queue) > MAX_DESTROYED_ACTORS_TO_CACHE:
|
| 252 |
+
actor_id = self.destroyed_actors_queue.popleft()
|
| 253 |
+
if actor_id in DataSource.actors:
|
| 254 |
+
actor = DataSource.actors.pop(actor_id)
|
| 255 |
+
node_id = actor["address"].get("rayletId")
|
| 256 |
+
if node_id and node_id != actor_consts.NIL_NODE_ID:
|
| 257 |
+
del DataSource.node_actors[node_id][actor_id]
|
| 258 |
+
await asyncio.sleep(ACTOR_CLEANUP_FREQUENCY)
|
| 259 |
+
except Exception:
|
| 260 |
+
logger.exception("Error cleaning up actor info from GCS.")
|
| 261 |
+
|
| 262 |
+
@routes.get("/logical/actors")
|
| 263 |
+
@dashboard_optional_utils.aiohttp_cache
|
| 264 |
+
async def get_all_actors(self, req) -> aiohttp.web.Response:
|
| 265 |
+
actors = await DataOrganizer.get_actor_infos()
|
| 266 |
+
return dashboard_optional_utils.rest_response(
|
| 267 |
+
success=True,
|
| 268 |
+
message="All actors fetched.",
|
| 269 |
+
actors=actors,
|
| 270 |
+
# False to avoid converting Ray resource name to google style.
|
| 271 |
+
# It's not necessary here because the fields are already
|
| 272 |
+
# google formatted when protobuf was converted into dict.
|
| 273 |
+
convert_google_style=False,
|
| 274 |
+
)
|
| 275 |
+
|
| 276 |
+
@routes.get("/logical/actors/{actor_id}")
|
| 277 |
+
@dashboard_optional_utils.aiohttp_cache
|
| 278 |
+
async def get_actor(self, req) -> aiohttp.web.Response:
|
| 279 |
+
actor_id = req.match_info.get("actor_id")
|
| 280 |
+
actors = await DataOrganizer.get_actor_infos(actor_ids=[actor_id])
|
| 281 |
+
return dashboard_optional_utils.rest_response(
|
| 282 |
+
success=True, message="Actor details fetched.", detail=actors[actor_id]
|
| 283 |
+
)
|
| 284 |
+
|
| 285 |
+
async def run(self, server):
|
| 286 |
+
await asyncio.gather(self._update_actors(), self._cleanup_actors())
|
| 287 |
+
|
| 288 |
+
@staticmethod
|
| 289 |
+
def is_minimal_module():
|
| 290 |
+
return False
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (200 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_agent.cpython-311.pyc
ADDED
|
Binary file (8.09 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_consts.cpython-311.pyc
ADDED
|
Binary file (1.09 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_head.cpython-311.pyc
ADDED
|
Binary file (12.2 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_utils.cpython-311.pyc
ADDED
|
Binary file (11.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_agent.py
ADDED
|
@@ -0,0 +1,133 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 6 |
+
from typing import Union
|
| 7 |
+
|
| 8 |
+
import ray._private.ray_constants as ray_constants
|
| 9 |
+
import ray._private.utils as utils
|
| 10 |
+
import ray.dashboard.consts as dashboard_consts
|
| 11 |
+
import ray.dashboard.utils as dashboard_utils
|
| 12 |
+
from ray.core.generated import event_pb2, event_pb2_grpc
|
| 13 |
+
from ray.dashboard.modules.event import event_consts
|
| 14 |
+
from ray.dashboard.modules.event.event_utils import monitor_events
|
| 15 |
+
from ray.dashboard.utils import async_loop_forever, create_task
|
| 16 |
+
|
| 17 |
+
logger = logging.getLogger(__name__)
|
| 18 |
+
|
| 19 |
+
|
| 20 |
+
# NOTE: Executor in this head is intentionally constrained to just 1 thread by
|
| 21 |
+
# default to limit its concurrency, therefore reducing potential for
|
| 22 |
+
# GIL contention
|
| 23 |
+
RAY_DASHBOARD_EVENT_AGENT_TPE_MAX_WORKERS = ray_constants.env_integer(
|
| 24 |
+
"RAY_DASHBOARD_EVENT_AGENT_TPE_MAX_WORKERS", 1
|
| 25 |
+
)
|
| 26 |
+
|
| 27 |
+
|
| 28 |
+
class EventAgent(dashboard_utils.DashboardAgentModule):
|
| 29 |
+
def __init__(self, dashboard_agent):
|
| 30 |
+
super().__init__(dashboard_agent)
|
| 31 |
+
self._event_dir = os.path.join(self._dashboard_agent.log_dir, "events")
|
| 32 |
+
os.makedirs(self._event_dir, exist_ok=True)
|
| 33 |
+
self._monitor: Union[asyncio.Task, None] = None
|
| 34 |
+
self._stub: Union[event_pb2_grpc.ReportEventServiceStub, None] = None
|
| 35 |
+
self._cached_events = asyncio.Queue(event_consts.EVENT_AGENT_CACHE_SIZE)
|
| 36 |
+
self._gcs_aio_client = dashboard_agent.gcs_aio_client
|
| 37 |
+
# Total number of event created from this agent.
|
| 38 |
+
self.total_event_reported = 0
|
| 39 |
+
# Total number of event report request sent.
|
| 40 |
+
self.total_request_sent = 0
|
| 41 |
+
self.module_started = time.monotonic()
|
| 42 |
+
|
| 43 |
+
self._executor = ThreadPoolExecutor(
|
| 44 |
+
max_workers=RAY_DASHBOARD_EVENT_AGENT_TPE_MAX_WORKERS,
|
| 45 |
+
thread_name_prefix="event_agent_executor",
|
| 46 |
+
)
|
| 47 |
+
|
| 48 |
+
logger.info("Event agent cache buffer size: %s", self._cached_events.maxsize)
|
| 49 |
+
|
| 50 |
+
async def _connect_to_dashboard(self):
|
| 51 |
+
"""Connect to the dashboard. If the dashboard is not started, then
|
| 52 |
+
this method will never returns.
|
| 53 |
+
|
| 54 |
+
Returns:
|
| 55 |
+
The ReportEventServiceStub object.
|
| 56 |
+
"""
|
| 57 |
+
while True:
|
| 58 |
+
try:
|
| 59 |
+
dashboard_rpc_address = await self._gcs_aio_client.internal_kv_get(
|
| 60 |
+
dashboard_consts.DASHBOARD_RPC_ADDRESS.encode(),
|
| 61 |
+
namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
|
| 62 |
+
timeout=1,
|
| 63 |
+
)
|
| 64 |
+
dashboard_rpc_address = dashboard_rpc_address.decode()
|
| 65 |
+
if dashboard_rpc_address:
|
| 66 |
+
logger.info("Report events to %s", dashboard_rpc_address)
|
| 67 |
+
options = ray_constants.GLOBAL_GRPC_OPTIONS
|
| 68 |
+
channel = utils.init_grpc_channel(
|
| 69 |
+
dashboard_rpc_address, options=options, asynchronous=True
|
| 70 |
+
)
|
| 71 |
+
return event_pb2_grpc.ReportEventServiceStub(channel)
|
| 72 |
+
except Exception:
|
| 73 |
+
logger.exception("Connect to dashboard failed.")
|
| 74 |
+
await asyncio.sleep(
|
| 75 |
+
event_consts.RETRY_CONNECT_TO_DASHBOARD_INTERVAL_SECONDS
|
| 76 |
+
)
|
| 77 |
+
|
| 78 |
+
@async_loop_forever(event_consts.EVENT_AGENT_REPORT_INTERVAL_SECONDS)
|
| 79 |
+
async def report_events(self):
|
| 80 |
+
"""Report events from cached events queue. Reconnect to dashboard if
|
| 81 |
+
report failed. Log error after retry EVENT_AGENT_RETRY_TIMES.
|
| 82 |
+
|
| 83 |
+
This method will never returns.
|
| 84 |
+
"""
|
| 85 |
+
data = await self._cached_events.get()
|
| 86 |
+
self.total_event_reported += len(data)
|
| 87 |
+
for _ in range(event_consts.EVENT_AGENT_RETRY_TIMES):
|
| 88 |
+
try:
|
| 89 |
+
logger.debug("Report %s events.", len(data))
|
| 90 |
+
request = event_pb2.ReportEventsRequest(event_strings=data)
|
| 91 |
+
await self._stub.ReportEvents(request)
|
| 92 |
+
self.total_request_sent += 1
|
| 93 |
+
break
|
| 94 |
+
except Exception:
|
| 95 |
+
logger.exception("Report event failed, reconnect to the " "dashboard.")
|
| 96 |
+
self._stub = await self._connect_to_dashboard()
|
| 97 |
+
else:
|
| 98 |
+
data_str = str(data)
|
| 99 |
+
limit = event_consts.LOG_ERROR_EVENT_STRING_LENGTH_LIMIT
|
| 100 |
+
logger.error(
|
| 101 |
+
"Report event failed: %s",
|
| 102 |
+
data_str[:limit] + (data_str[limit:] and "..."),
|
| 103 |
+
)
|
| 104 |
+
|
| 105 |
+
async def get_internal_states(self):
|
| 106 |
+
if self.total_event_reported <= 0 or self.total_request_sent <= 0:
|
| 107 |
+
return
|
| 108 |
+
|
| 109 |
+
elapsed = time.monotonic() - self.module_started
|
| 110 |
+
return {
|
| 111 |
+
"total_events_reported": self.total_event_reported,
|
| 112 |
+
"Total_report_request": self.total_request_sent,
|
| 113 |
+
"queue_size": self._cached_events.qsize(),
|
| 114 |
+
"total_uptime": elapsed,
|
| 115 |
+
}
|
| 116 |
+
|
| 117 |
+
async def run(self, server):
|
| 118 |
+
# Connect to dashboard.
|
| 119 |
+
self._stub = await self._connect_to_dashboard()
|
| 120 |
+
# Start monitor task.
|
| 121 |
+
self._monitor = monitor_events(
|
| 122 |
+
self._event_dir,
|
| 123 |
+
lambda data: create_task(self._cached_events.put(data)),
|
| 124 |
+
self._executor,
|
| 125 |
+
)
|
| 126 |
+
|
| 127 |
+
await asyncio.gather(
|
| 128 |
+
self.report_events(),
|
| 129 |
+
)
|
| 130 |
+
|
| 131 |
+
@staticmethod
|
| 132 |
+
def is_minimal_module():
|
| 133 |
+
return False
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_consts.py
ADDED
|
@@ -0,0 +1,21 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from ray._private.ray_constants import env_float, env_integer
|
| 2 |
+
from ray.core.generated import event_pb2
|
| 3 |
+
|
| 4 |
+
LOG_ERROR_EVENT_STRING_LENGTH_LIMIT = 1000
|
| 5 |
+
RETRY_CONNECT_TO_DASHBOARD_INTERVAL_SECONDS = 2
|
| 6 |
+
# Monitor events
|
| 7 |
+
SCAN_EVENT_DIR_INTERVAL_SECONDS = env_integer("SCAN_EVENT_DIR_INTERVAL_SECONDS", 2)
|
| 8 |
+
SCAN_EVENT_START_OFFSET_SECONDS = -30 * 60
|
| 9 |
+
CONCURRENT_READ_LIMIT = 50
|
| 10 |
+
EVENT_READ_LINE_COUNT_LIMIT = 200
|
| 11 |
+
EVENT_READ_LINE_LENGTH_LIMIT = env_integer(
|
| 12 |
+
"EVENT_READ_LINE_LENGTH_LIMIT", 2 * 1024 * 1024
|
| 13 |
+
) # 2MB
|
| 14 |
+
# Report events
|
| 15 |
+
EVENT_AGENT_REPORT_INTERVAL_SECONDS = env_float(
|
| 16 |
+
"EVENT_AGENT_REPORT_INTERVAL_SECONDS", 0.1
|
| 17 |
+
)
|
| 18 |
+
EVENT_AGENT_RETRY_TIMES = 10
|
| 19 |
+
EVENT_AGENT_CACHE_SIZE = 10240
|
| 20 |
+
# Event sources
|
| 21 |
+
EVENT_SOURCE_ALL = event_pb2.Event.SourceType.keys()
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_head.py
ADDED
|
@@ -0,0 +1,212 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import asyncio
|
| 2 |
+
import logging
|
| 3 |
+
import os
|
| 4 |
+
import time
|
| 5 |
+
from collections import OrderedDict, defaultdict
|
| 6 |
+
from concurrent.futures import ThreadPoolExecutor
|
| 7 |
+
from datetime import datetime
|
| 8 |
+
from itertools import islice
|
| 9 |
+
from typing import Dict, Union
|
| 10 |
+
|
| 11 |
+
import aiohttp.web
|
| 12 |
+
|
| 13 |
+
import ray.dashboard.optional_utils as dashboard_optional_utils
|
| 14 |
+
import ray.dashboard.utils as dashboard_utils
|
| 15 |
+
from ray._private.ray_constants import env_integer
|
| 16 |
+
from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag
|
| 17 |
+
from ray._private.utils import get_or_create_event_loop
|
| 18 |
+
from ray.core.generated import event_pb2, event_pb2_grpc
|
| 19 |
+
from ray.dashboard.consts import (
|
| 20 |
+
RAY_STATE_SERVER_MAX_HTTP_REQUEST,
|
| 21 |
+
RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED,
|
| 22 |
+
RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME,
|
| 23 |
+
)
|
| 24 |
+
from ray.dashboard.modules.event.event_utils import monitor_events, parse_event_strings
|
| 25 |
+
from ray.dashboard.state_api_utils import do_filter, handle_list_api
|
| 26 |
+
from ray.util.state.common import ClusterEventState, ListApiOptions, ListApiResponse
|
| 27 |
+
|
| 28 |
+
logger = logging.getLogger(__name__)
routes = dashboard_optional_utils.DashboardHeadRouteTable

# Per-job event container: an insertion-ordered {event_id: event} mapping, so
# the oldest entries can be evicted first when the cache fills up.
JobEvents = OrderedDict
dashboard_utils._json_compatible_types.add(JobEvents)

# Upper bound on cached events per job; overridable via environment variable.
MAX_EVENTS_TO_CACHE = int(os.environ.get("RAY_DASHBOARD_MAX_EVENTS_TO_CACHE", 10000))

# NOTE: Executor in this head is intentionally constrained to just 1 thread by
# default to limit its concurrency, therefore reducing potential for
# GIL contention
RAY_DASHBOARD_EVENT_HEAD_TPE_MAX_WORKERS = env_integer(
    "RAY_DASHBOARD_EVENT_HEAD_TPE_MAX_WORKERS", 1
)
|
| 42 |
+
|
| 43 |
+
|
| 44 |
+
async def _list_cluster_events_impl(
    *, all_events, executor: ThreadPoolExecutor, option: ListApiOptions
) -> ListApiResponse:
    """List all cluster events from the cluster.

    Kept as a free function (rather than an ``EventHead`` method) so it can
    be unit-tested in isolation.

    Args:
        all_events: Nested mapping of {job_id: {event_id: event dict}}.
        executor: Thread pool the transformation is offloaded to, keeping the
            event loop responsive for large event sets.
        option: Listing options (filters, limit, detail flag).

    Returns:
        A list of cluster events in the cluster. The schema of each returned
        dict is equivalent to the ``ClusterEventState`` protobuf message.
    """

    def _assemble(events_by_job) -> ListApiResponse:
        # Flatten the two-level mapping while stamping each event with a
        # human-readable "time" string derived from its epoch timestamp.
        flattened = []
        for per_job_events in events_by_job.values():
            for event in per_job_events.values():
                event["time"] = str(datetime.fromtimestamp(int(event["timestamp"])))
                flattened.append(event)

        num_after_truncation = len(flattened)
        # Sort by timestamp to make the output deterministic.
        flattened.sort(key=lambda entry: entry["timestamp"])
        total = len(flattened)
        filtered = do_filter(
            flattened, option.filters, ClusterEventState, option.detail
        )
        num_filtered = len(filtered)
        limited = list(islice(filtered, option.limit))
        return ListApiResponse(
            result=limited,
            total=total,
            num_after_truncation=num_after_truncation,
            num_filtered=num_filtered,
        )

    return await get_or_create_event_loop().run_in_executor(
        executor, _assemble, all_events
    )
|
| 80 |
+
|
| 81 |
+
|
| 82 |
+
class EventHead(
    dashboard_utils.DashboardHeadModule,
    dashboard_utils.RateLimitedModule,
    event_pb2_grpc.ReportEventServiceServicer,
):
    """Dashboard head module that collects and serves Ray events.

    Events arrive via two paths: agents push them through the ``ReportEvents``
    gRPC endpoint, and ``monitor_events`` tails the local event log directory
    (started in :meth:`run`). Events are cached in memory, bounded per job by
    ``MAX_EVENTS_TO_CACHE``, and exposed through the ``/events`` and
    ``/api/v0/cluster_events`` HTTP endpoints.
    """

    def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig):
        dashboard_utils.DashboardHeadModule.__init__(self, config)
        # Cap concurrent state-API requests at the smaller of the configured
        # limit and the hard maximum.
        dashboard_utils.RateLimitedModule.__init__(
            self,
            min(
                RAY_STATE_SERVER_MAX_HTTP_REQUEST,
                RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED,
            ),
        )
        self._event_dir = os.path.join(self.log_dir, "events")
        os.makedirs(self._event_dir, exist_ok=True)
        # Handle returned by monitor_events() in run().
        self._monitor: Union[asyncio.Task, None] = None
        # Introspection counters; reported by _periodic_state_print().
        self.total_report_events_count = 0
        self.total_events_received = 0
        self.module_started = time.monotonic()
        # {job_id hex(str): {event_id (str): event (dict)}}
        self.events: Dict[str, JobEvents] = defaultdict(JobEvents)

        # Single worker by default to reduce GIL contention; see
        # RAY_DASHBOARD_EVENT_HEAD_TPE_MAX_WORKERS above.
        self._executor = ThreadPoolExecutor(
            max_workers=RAY_DASHBOARD_EVENT_HEAD_TPE_MAX_WORKERS,
            thread_name_prefix="event_head_executor",
        )

    async def limit_handler_(self):
        """Error response returned when the concurrent-request limit is hit."""
        return dashboard_optional_utils.rest_response(
            success=False,
            error_message=(
                "Max number of in-progress requests="
                f"{self.max_num_call_} reached. "
                "To set a higher limit, set environment variable: "
                f"export {RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME}='xxx'. "
                f"Max allowed = {RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED}"
            ),
            result=None,
        )

    def _update_events(self, event_list):
        """Merge parsed events into the per-job cache, evicting oldest first.

        Args:
            event_list: Parsed event dicts; each has an "event_id" and may
                carry "custom_fields" with a "job_id".
        """
        # {job_id: {event_id: event}}
        all_job_events = defaultdict(JobEvents)
        for event in event_list:
            event_id = event["event_id"]
            custom_fields = event.get("custom_fields")
            # NOTE(review): system_event is never set to True, so the guard
            # below always passes — this looks like vestigial filtering logic;
            # confirm before removing.
            system_event = False
            if custom_fields:
                # Fall back to "global" when job_id is absent or falsy.
                job_id = custom_fields.get("job_id", "global") or "global"
            else:
                job_id = "global"
            if system_event is False:
                all_job_events[job_id][event_id] = event

        for job_id, new_job_events in all_job_events.items():
            job_events = self.events[job_id]
            job_events.update(new_job_events)

            # Limit the # of events cached if it exceeds the threshold.
            # The 1.1 factor adds hysteresis so eviction runs in batches
            # instead of on every insert past the limit. JobEvents is an
            # OrderedDict, so popitem(last=False) drops the oldest entry.
            if len(job_events) > MAX_EVENTS_TO_CACHE * 1.1:
                while len(job_events) > MAX_EVENTS_TO_CACHE:
                    job_events.popitem(last=False)

    async def ReportEvents(self, request, context):
        """gRPC handler: accept a batch of event strings from an agent."""
        received_events = []
        if request.event_strings:
            received_events.extend(parse_event_strings(request.event_strings))
        logger.debug("Received %d events", len(received_events))
        self._update_events(received_events)
        self.total_report_events_count += 1
        self.total_events_received += len(received_events)
        return event_pb2.ReportEventsReply(send_success=True)

    async def _periodic_state_print(self):
        """Return a snapshot of the module's counters, or None if idle.

        NOTE(review): the "Total_requests_received" key's capitalization is
        inconsistent with the other keys; kept as-is since consumers may
        depend on the exact key name.
        """
        if self.total_events_received <= 0 or self.total_report_events_count <= 0:
            return

        elapsed = time.monotonic() - self.module_started
        return {
            "total_events_received": self.total_events_received,
            "Total_requests_received": self.total_report_events_count,
            "total_uptime": elapsed,
        }

    @routes.get("/events")
    @dashboard_optional_utils.aiohttp_cache
    async def get_event(self, req) -> aiohttp.web.Response:
        """HTTP handler: return cached events, optionally for a single job.

        Query params:
            job_id: If omitted, events for all jobs are returned keyed by job.
        """
        job_id = req.query.get("job_id")
        if job_id is None:
            all_events = {
                job_id: list(job_events.values())
                for job_id, job_events in self.events.items()
            }
            return dashboard_optional_utils.rest_response(
                success=True, message="All events fetched.", events=all_events
            )

        # self.events is a defaultdict, so an unknown job_id yields an empty
        # container rather than a 404.
        job_events = self.events[job_id]
        return dashboard_optional_utils.rest_response(
            success=True,
            message="Job events fetched.",
            job_id=job_id,
            events=list(job_events.values()),
        )

    @routes.get("/api/v0/cluster_events")
    @dashboard_utils.RateLimitedModule.enforce_max_concurrent_calls
    async def list_cluster_events(
        self, req: aiohttp.web.Request
    ) -> aiohttp.web.Response:
        """State-API handler: list cluster events with filter/limit options."""
        record_extra_usage_tag(TagKey.CORE_STATE_API_LIST_CLUSTER_EVENTS, "1")

        async def list_api_fn(option: ListApiOptions):
            return await _list_cluster_events_impl(
                all_events=self.events, executor=self._executor, option=option
            )

        return await handle_list_api(list_api_fn, req)

    async def run(self, server):
        """Register the gRPC servicer and start tailing the event directory."""
        event_pb2_grpc.add_ReportEventServiceServicer_to_server(self, server)
        self._monitor = monitor_events(
            self._event_dir,
            lambda data: self._update_events(parse_event_strings(data)),
            self._executor,
        )

    @staticmethod
    def is_minimal_module():
        """Return False: this module is not available in Ray's minimal install."""
        return False
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__init__.py
ADDED
|
File without changes
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/__init__.cpython-311.pyc
ADDED
|
Binary file (202 Bytes). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/grafana_dashboard_factory.cpython-311.pyc
ADDED
|
Binary file (10.6 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/metrics_head.cpython-311.pyc
ADDED
|
Binary file (24.4 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/templates.cpython-311.pyc
ADDED
|
Binary file (1.59 kB). View file
|
|
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/common.py
ADDED
|
@@ -0,0 +1,70 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
from dataclasses import dataclass
|
| 2 |
+
from typing import List, Optional
|
| 3 |
+
|
| 4 |
+
|
| 5 |
+
@dataclass
class GridPos:
    """Position and size of a panel on the Grafana dashboard grid.

    Attributes:
        x: Horizontal position, in Grafana grid units.
        y: Vertical position, in Grafana grid units.
        w: Panel width, in Grafana grid units.
        h: Panel height, in Grafana grid units.
    """

    x: int
    y: int
    w: int
    h: int
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
@dataclass
class Target:
    """Defines a Grafana target (time-series query) within a panel.

    A panel will have one or more targets. By default, all targets are rendered as
    stacked area charts, with the exception of legend="MAX", which is rendered as
    a blue dotted line. Any legend="FINISHED|FAILED|DEAD|REMOVED" series will also be
    rendered hidden by default.

    Attributes:
        expr: The prometheus query to evaluate. Typically embeds a
            "{global_filters}" placeholder that is substituted at dashboard
            generation time.
        legend: The legend string to format for each time-series.
    """

    expr: str
    legend: str
|
| 29 |
+
|
| 30 |
+
|
| 31 |
+
@dataclass
class Panel:
    """Defines a Grafana panel (graph) for the Ray dashboard page.

    A panel contains one or more targets (time-series queries).

    Attributes:
        title: Short name of the graph. Note: please keep this in sync with the title
            definitions in Metrics.tsx.
        description: Long form description of the graph.
        id: Integer id used to reference the graph from Metrics.tsx.
        unit: The unit to display on the y-axis of the graph.
        targets: List of query targets.
        fill: Whether or not the graph will be filled by a color.
        stack: Whether or not the lines in the graph will be stacked.
        linewidth: Width of the plotted lines.
        grid_pos: Optional explicit position/size on the dashboard grid; when
            None, placement is left to the dashboard generator.
    """

    title: str
    description: str
    id: int
    unit: str
    targets: List[Target]
    fill: int = 10
    stack: bool = True
    linewidth: int = 1
    grid_pos: Optional[GridPos] = None
|
| 57 |
+
|
| 58 |
+
|
| 59 |
+
@dataclass
class DashboardConfig:
    """Top-level description of one generated Grafana dashboard."""

    # This dashboard name is an internal key used to determine which env vars
    # to check for customization
    name: str
    # The uid of the dashboard json if not overridden by a user
    default_uid: str
    panels: List[Panel]
    # The global filters applied to all graphs in this dashboard. Users can
    # add additional global_filters on top of this.
    standard_global_filters: List[str]
    # Name of the base dashboard JSON template this config is merged into.
    base_json_file_name: str
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/data_dashboard_panels.py
ADDED
|
@@ -0,0 +1,551 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ruff: noqa: E501
|
| 2 |
+
|
| 3 |
+
from ray.dashboard.modules.metrics.dashboards.common import (
|
| 4 |
+
DashboardConfig,
|
| 5 |
+
Panel,
|
| 6 |
+
Target,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
# When adding a new panels for an OpRuntimeMetric, follow this format:
|
| 10 |
+
# Panel(
|
| 11 |
+
# title=title,
|
| 12 |
+
# description=metric.metadata.get("description"),
|
| 13 |
+
# id=panel_id,
|
| 14 |
+
# unit=unit,
|
| 15 |
+
# targets=[
|
| 16 |
+
# Target(
|
| 17 |
+
# expr=f"sum(ray_data_{metric.name}"
|
| 18 |
+
# + "{{{global_filters}}}) by (dataset, operator)",
|
| 19 |
+
# legend=legend,
|
| 20 |
+
# )
|
| 21 |
+
# ],
|
| 22 |
+
# fill=fill,
|
| 23 |
+
# stack=stack,
|
| 24 |
+
# )
|
| 25 |
+
|
| 26 |
+
|
| 27 |
+
DATA_GRAFANA_PANELS = [
|
| 28 |
+
# Ray Data Metrics (Overview)
|
| 29 |
+
Panel(
|
| 30 |
+
id=1,
|
| 31 |
+
title="Bytes Spilled",
|
| 32 |
+
description="Amount spilled by dataset operators. DataContext.enable_get_object_locations_for_metrics must be set to True to report this metric",
|
| 33 |
+
unit="bytes",
|
| 34 |
+
targets=[
|
| 35 |
+
Target(
|
| 36 |
+
expr="sum(ray_data_spilled_bytes{{{global_filters}}}) by (dataset, operator)",
|
| 37 |
+
legend="Bytes Spilled: {{dataset}}, {{operator}}",
|
| 38 |
+
)
|
| 39 |
+
],
|
| 40 |
+
fill=0,
|
| 41 |
+
stack=False,
|
| 42 |
+
),
|
| 43 |
+
Panel(
|
| 44 |
+
id=2,
|
| 45 |
+
title="Bytes Allocated",
|
| 46 |
+
description="Amount allocated by dataset operators.",
|
| 47 |
+
unit="bytes",
|
| 48 |
+
targets=[
|
| 49 |
+
Target(
|
| 50 |
+
expr="sum(ray_data_allocated_bytes{{{global_filters}}}) by (dataset, operator)",
|
| 51 |
+
legend="Bytes Allocated: {{dataset}}, {{operator}}",
|
| 52 |
+
)
|
| 53 |
+
],
|
| 54 |
+
fill=0,
|
| 55 |
+
stack=False,
|
| 56 |
+
),
|
| 57 |
+
Panel(
|
| 58 |
+
id=3,
|
| 59 |
+
title="Bytes Freed",
|
| 60 |
+
description="Amount freed by dataset operators.",
|
| 61 |
+
unit="bytes",
|
| 62 |
+
targets=[
|
| 63 |
+
Target(
|
| 64 |
+
expr="sum(ray_data_freed_bytes{{{global_filters}}}) by (dataset, operator)",
|
| 65 |
+
legend="Bytes Freed: {{dataset}}, {{operator}}",
|
| 66 |
+
)
|
| 67 |
+
],
|
| 68 |
+
fill=0,
|
| 69 |
+
stack=False,
|
| 70 |
+
),
|
| 71 |
+
Panel(
|
| 72 |
+
id=4,
|
| 73 |
+
title="Object Store Memory",
|
| 74 |
+
description="Amount of memory store used by dataset operators.",
|
| 75 |
+
unit="bytes",
|
| 76 |
+
targets=[
|
| 77 |
+
Target(
|
| 78 |
+
expr="sum(ray_data_current_bytes{{{global_filters}}}) by (dataset, operator)",
|
| 79 |
+
legend="Current Usage: {{dataset}}, {{operator}}",
|
| 80 |
+
)
|
| 81 |
+
],
|
| 82 |
+
fill=0,
|
| 83 |
+
stack=False,
|
| 84 |
+
),
|
| 85 |
+
Panel(
|
| 86 |
+
id=5,
|
| 87 |
+
title="CPUs (logical slots)",
|
| 88 |
+
description="Logical CPUs allocated to dataset operators.",
|
| 89 |
+
unit="cores",
|
| 90 |
+
targets=[
|
| 91 |
+
Target(
|
| 92 |
+
expr="sum(ray_data_cpu_usage_cores{{{global_filters}}}) by (dataset, operator)",
|
| 93 |
+
legend="CPU Usage: {{dataset}}, {{operator}}",
|
| 94 |
+
)
|
| 95 |
+
],
|
| 96 |
+
fill=0,
|
| 97 |
+
stack=False,
|
| 98 |
+
),
|
| 99 |
+
Panel(
|
| 100 |
+
id=6,
|
| 101 |
+
title="GPUs (logical slots)",
|
| 102 |
+
description="Logical GPUs allocated to dataset operators.",
|
| 103 |
+
unit="cores",
|
| 104 |
+
targets=[
|
| 105 |
+
Target(
|
| 106 |
+
expr="sum(ray_data_gpu_usage_cores{{{global_filters}}}) by (dataset, operator)",
|
| 107 |
+
legend="GPU Usage: {{dataset}}, {{operator}}",
|
| 108 |
+
)
|
| 109 |
+
],
|
| 110 |
+
fill=0,
|
| 111 |
+
stack=False,
|
| 112 |
+
),
|
| 113 |
+
Panel(
|
| 114 |
+
id=7,
|
| 115 |
+
title="Bytes Output / Second",
|
| 116 |
+
description="Bytes output per second by dataset operators.",
|
| 117 |
+
unit="Bps",
|
| 118 |
+
targets=[
|
| 119 |
+
Target(
|
| 120 |
+
expr="sum(rate(ray_data_output_bytes{{{global_filters}}}[1m])) by (dataset, operator)",
|
| 121 |
+
legend="Bytes Output / Second: {{dataset}}, {{operator}}",
|
| 122 |
+
)
|
| 123 |
+
],
|
| 124 |
+
fill=0,
|
| 125 |
+
stack=False,
|
| 126 |
+
),
|
| 127 |
+
Panel(
|
| 128 |
+
id=11,
|
| 129 |
+
title="Rows Output / Second",
|
| 130 |
+
description="Total rows output per second by dataset operators.",
|
| 131 |
+
unit="rows/sec",
|
| 132 |
+
targets=[
|
| 133 |
+
Target(
|
| 134 |
+
expr="sum(rate(ray_data_output_rows{{{global_filters}}}[1m])) by (dataset, operator)",
|
| 135 |
+
legend="Rows Output / Second: {{dataset}}, {{operator}}",
|
| 136 |
+
)
|
| 137 |
+
],
|
| 138 |
+
fill=0,
|
| 139 |
+
stack=False,
|
| 140 |
+
),
|
| 141 |
+
# Ray Data Metrics (Inputs)
|
| 142 |
+
Panel(
|
| 143 |
+
id=17,
|
| 144 |
+
title="Input Blocks Received by Operator / Second",
|
| 145 |
+
description="Number of input blocks received by operator per second.",
|
| 146 |
+
unit="blocks/sec",
|
| 147 |
+
targets=[
|
| 148 |
+
Target(
|
| 149 |
+
expr="sum(rate(ray_data_num_inputs_received{{{global_filters}}}[1m])) by (dataset, operator)",
|
| 150 |
+
legend="Blocks Received / Second: {{dataset}}, {{operator}}",
|
| 151 |
+
)
|
| 152 |
+
],
|
| 153 |
+
fill=0,
|
| 154 |
+
stack=False,
|
| 155 |
+
),
|
| 156 |
+
Panel(
|
| 157 |
+
id=18,
|
| 158 |
+
title="Input Bytes Received by Operator / Second",
|
| 159 |
+
description="Byte size of input blocks received by operator per second.",
|
| 160 |
+
unit="Bps",
|
| 161 |
+
targets=[
|
| 162 |
+
Target(
|
| 163 |
+
expr="sum(rate(ray_data_bytes_inputs_received{{{global_filters}}}[1m])) by (dataset, operator)",
|
| 164 |
+
legend="Bytes Received / Second: {{dataset}}, {{operator}}",
|
| 165 |
+
)
|
| 166 |
+
],
|
| 167 |
+
fill=0,
|
| 168 |
+
stack=False,
|
| 169 |
+
),
|
| 170 |
+
Panel(
|
| 171 |
+
id=19,
|
| 172 |
+
title="Input Blocks Processed by Tasks / Second",
|
| 173 |
+
description=(
|
| 174 |
+
"Number of input blocks that operator's tasks have finished processing per second."
|
| 175 |
+
),
|
| 176 |
+
unit="blocks/sec",
|
| 177 |
+
targets=[
|
| 178 |
+
Target(
|
| 179 |
+
expr="sum(rate(ray_data_num_task_inputs_processed{{{global_filters}}}[1m])) by (dataset, operator)",
|
| 180 |
+
legend="Blocks Processed / Second: {{dataset}}, {{operator}}",
|
| 181 |
+
)
|
| 182 |
+
],
|
| 183 |
+
fill=0,
|
| 184 |
+
stack=False,
|
| 185 |
+
),
|
| 186 |
+
Panel(
|
| 187 |
+
id=20,
|
| 188 |
+
title="Input Bytes Processed by Tasks / Second",
|
| 189 |
+
description=(
|
| 190 |
+
"Byte size of input blocks that operator's tasks have finished processing per second."
|
| 191 |
+
),
|
| 192 |
+
unit="Bps",
|
| 193 |
+
targets=[
|
| 194 |
+
Target(
|
| 195 |
+
expr="sum(rate(ray_data_bytes_task_inputs_processed{{{global_filters}}}[1m])) by (dataset, operator)",
|
| 196 |
+
legend="Bytes Processed / Second: {{dataset}}, {{operator}}",
|
| 197 |
+
)
|
| 198 |
+
],
|
| 199 |
+
fill=0,
|
| 200 |
+
stack=False,
|
| 201 |
+
),
|
| 202 |
+
Panel(
|
| 203 |
+
id=21,
|
| 204 |
+
title="Input Bytes Submitted to Tasks / Second",
|
| 205 |
+
description="Byte size of input blocks passed to submitted tasks per second.",
|
| 206 |
+
unit="Bps",
|
| 207 |
+
targets=[
|
| 208 |
+
Target(
|
| 209 |
+
expr="sum(rate(ray_data_bytes_inputs_of_submitted_tasks{{{global_filters}}}[1m])) by (dataset, operator)",
|
| 210 |
+
legend="Bytes Submitted / Second: {{dataset}}, {{operator}}",
|
| 211 |
+
)
|
| 212 |
+
],
|
| 213 |
+
fill=0,
|
| 214 |
+
stack=False,
|
| 215 |
+
),
|
| 216 |
+
Panel(
|
| 217 |
+
id=22,
|
| 218 |
+
title="Blocks Generated by Tasks / Second",
|
| 219 |
+
description="Number of output blocks generated by tasks per second.",
|
| 220 |
+
unit="blocks/sec",
|
| 221 |
+
targets=[
|
| 222 |
+
Target(
|
| 223 |
+
expr="sum(rate(ray_data_num_task_outputs_generated{{{global_filters}}}[1m])) by (dataset, operator)",
|
| 224 |
+
legend="Blocks Generated / Second: {{dataset}}, {{operator}}",
|
| 225 |
+
)
|
| 226 |
+
],
|
| 227 |
+
fill=0,
|
| 228 |
+
stack=False,
|
| 229 |
+
),
|
| 230 |
+
Panel(
|
| 231 |
+
id=23,
|
| 232 |
+
title="Bytes Generated by Tasks / Second",
|
| 233 |
+
description="Byte size of output blocks generated by tasks per second.",
|
| 234 |
+
unit="Bps",
|
| 235 |
+
targets=[
|
| 236 |
+
Target(
|
| 237 |
+
expr="sum(rate(ray_data_bytes_task_outputs_generated{{{global_filters}}}[1m])) by (dataset, operator)",
|
| 238 |
+
legend="Bytes Generated / Second: {{dataset}}, {{operator}}",
|
| 239 |
+
)
|
| 240 |
+
],
|
| 241 |
+
fill=0,
|
| 242 |
+
stack=False,
|
| 243 |
+
),
|
| 244 |
+
Panel(
|
| 245 |
+
id=24,
|
| 246 |
+
title="Rows Generated by Tasks / Second",
|
| 247 |
+
description="Number of rows in generated output blocks from finished tasks per second.",
|
| 248 |
+
unit="rows/sec",
|
| 249 |
+
targets=[
|
| 250 |
+
Target(
|
| 251 |
+
expr="sum(rate(ray_data_rows_task_outputs_generated{{{global_filters}}}[1m])) by (dataset, operator)",
|
| 252 |
+
legend="Rows Generated / Second: {{dataset}}, {{operator}}",
|
| 253 |
+
)
|
| 254 |
+
],
|
| 255 |
+
fill=0,
|
| 256 |
+
stack=False,
|
| 257 |
+
),
|
| 258 |
+
Panel(
|
| 259 |
+
id=25,
|
| 260 |
+
title="Output Blocks Taken by Downstream Operators / Second",
|
| 261 |
+
description="Number of output blocks taken by downstream operators per second.",
|
| 262 |
+
unit="blocks/sec",
|
| 263 |
+
targets=[
|
| 264 |
+
Target(
|
| 265 |
+
expr="sum(rate(ray_data_num_outputs_taken{{{global_filters}}}[1m])) by (dataset, operator)",
|
| 266 |
+
legend="Blocks Taken / Second: {{dataset}}, {{operator}}",
|
| 267 |
+
)
|
| 268 |
+
],
|
| 269 |
+
fill=0,
|
| 270 |
+
stack=False,
|
| 271 |
+
),
|
| 272 |
+
Panel(
|
| 273 |
+
id=26,
|
| 274 |
+
title="Output Bytes Taken by Downstream Operators / Second",
|
| 275 |
+
description=(
|
| 276 |
+
"Byte size of output blocks taken by downstream operators per second."
|
| 277 |
+
),
|
| 278 |
+
unit="Bps",
|
| 279 |
+
targets=[
|
| 280 |
+
Target(
|
| 281 |
+
expr="sum(rate(ray_data_bytes_outputs_taken{{{global_filters}}}[1m])) by (dataset, operator)",
|
| 282 |
+
legend="Bytes Taken / Second: {{dataset}}, {{operator}}",
|
| 283 |
+
)
|
| 284 |
+
],
|
| 285 |
+
fill=0,
|
| 286 |
+
stack=False,
|
| 287 |
+
),
|
| 288 |
+
# Ray Data Metrics (Tasks)
|
| 289 |
+
Panel(
|
| 290 |
+
id=29,
|
| 291 |
+
title="Submitted Tasks",
|
| 292 |
+
description="Number of submitted tasks.",
|
| 293 |
+
unit="tasks",
|
| 294 |
+
targets=[
|
| 295 |
+
Target(
|
| 296 |
+
expr="sum(ray_data_num_tasks_submitted{{{global_filters}}}) by (dataset, operator)",
|
| 297 |
+
legend="Submitted Tasks: {{dataset}}, {{operator}}",
|
| 298 |
+
)
|
| 299 |
+
],
|
| 300 |
+
fill=0,
|
| 301 |
+
stack=False,
|
| 302 |
+
),
|
| 303 |
+
Panel(
|
| 304 |
+
id=30,
|
| 305 |
+
title="Running Tasks",
|
| 306 |
+
description="Number of running tasks.",
|
| 307 |
+
unit="tasks",
|
| 308 |
+
targets=[
|
| 309 |
+
Target(
|
| 310 |
+
expr="sum(ray_data_num_tasks_running{{{global_filters}}}) by (dataset, operator)",
|
| 311 |
+
legend="Running Tasks: {{dataset}}, {{operator}}",
|
| 312 |
+
)
|
| 313 |
+
],
|
| 314 |
+
fill=0,
|
| 315 |
+
stack=False,
|
| 316 |
+
),
|
| 317 |
+
Panel(
|
| 318 |
+
id=31,
|
| 319 |
+
title="Tasks with output blocks",
|
| 320 |
+
description="Number of tasks that already have output.",
|
| 321 |
+
unit="tasks",
|
| 322 |
+
targets=[
|
| 323 |
+
Target(
|
| 324 |
+
expr="sum(ray_data_num_tasks_have_outputs{{{global_filters}}}) by (dataset, operator)",
|
| 325 |
+
legend="Tasks with output blocks: {{dataset}}, {{operator}}",
|
| 326 |
+
)
|
| 327 |
+
],
|
| 328 |
+
fill=0,
|
| 329 |
+
stack=False,
|
| 330 |
+
),
|
| 331 |
+
Panel(
|
| 332 |
+
id=32,
|
| 333 |
+
title="Finished Tasks",
|
| 334 |
+
description="Number of finished tasks.",
|
| 335 |
+
unit="tasks",
|
| 336 |
+
targets=[
|
| 337 |
+
Target(
|
| 338 |
+
expr="sum(ray_data_num_tasks_finished{{{global_filters}}}) by (dataset, operator)",
|
| 339 |
+
legend="Finished Tasks: {{dataset}}, {{operator}}",
|
| 340 |
+
)
|
| 341 |
+
],
|
| 342 |
+
fill=0,
|
| 343 |
+
stack=False,
|
| 344 |
+
),
|
| 345 |
+
Panel(
|
| 346 |
+
id=33,
|
| 347 |
+
title="Failed Tasks",
|
| 348 |
+
description="Number of failed tasks.",
|
| 349 |
+
unit="tasks",
|
| 350 |
+
targets=[
|
| 351 |
+
Target(
|
| 352 |
+
expr="sum(ray_data_num_tasks_failed{{{global_filters}}}) by (dataset, operator)",
|
| 353 |
+
legend="Failed Tasks: {{dataset}}, {{operator}}",
|
| 354 |
+
)
|
| 355 |
+
],
|
| 356 |
+
fill=0,
|
| 357 |
+
stack=False,
|
| 358 |
+
),
|
| 359 |
+
Panel(
|
| 360 |
+
id=8,
|
| 361 |
+
title="Block Generation Time",
|
| 362 |
+
description="Time spent generating blocks in tasks.",
|
| 363 |
+
unit="seconds",
|
| 364 |
+
targets=[
|
| 365 |
+
Target(
|
| 366 |
+
expr="sum(ray_data_block_generation_time{{{global_filters}}}) by (dataset, operator)",
|
| 367 |
+
legend="Block Generation Time: {{dataset}}, {{operator}}",
|
| 368 |
+
)
|
| 369 |
+
],
|
| 370 |
+
fill=0,
|
| 371 |
+
stack=False,
|
| 372 |
+
),
|
| 373 |
+
Panel(
|
| 374 |
+
id=37,
|
| 375 |
+
title="Task Submission Backpressure Time",
|
| 376 |
+
description="Time spent in task submission backpressure.",
|
| 377 |
+
unit="seconds",
|
| 378 |
+
targets=[
|
| 379 |
+
Target(
|
| 380 |
+
expr="sum(ray_data_task_submission_backpressure_time{{{global_filters}}}) by (dataset, operator)",
|
| 381 |
+
legend="Backpressure Time: {{dataset}}, {{operator}}",
|
| 382 |
+
)
|
| 383 |
+
],
|
| 384 |
+
fill=0,
|
| 385 |
+
stack=True,
|
| 386 |
+
),
|
| 387 |
+
# Ray Data Metrics (Object Store Memory)
|
| 388 |
+
Panel(
|
| 389 |
+
id=13,
|
| 390 |
+
title="Operator Internal Inqueue Size (Blocks)",
|
| 391 |
+
description="Number of blocks in operator's internal input queue",
|
| 392 |
+
unit="blocks",
|
| 393 |
+
targets=[
|
| 394 |
+
Target(
|
| 395 |
+
expr="sum(ray_data_obj_store_mem_internal_inqueue_blocks{{{global_filters}}}) by (dataset, operator)",
|
| 396 |
+
legend="Number of Blocks: {{dataset}}, {{operator}}",
|
| 397 |
+
)
|
| 398 |
+
],
|
| 399 |
+
fill=0,
|
| 400 |
+
stack=False,
|
| 401 |
+
),
|
| 402 |
+
Panel(
|
| 403 |
+
id=14,
|
| 404 |
+
title="Operator Internal Inqueue Size (Bytes)",
|
| 405 |
+
description="Byte size of input blocks in the operator's internal input queue.",
|
| 406 |
+
unit="bytes",
|
| 407 |
+
targets=[
|
| 408 |
+
Target(
|
| 409 |
+
expr="sum(ray_data_obj_store_mem_internal_inqueue{{{global_filters}}}) by (dataset, operator)",
|
| 410 |
+
legend="Bytes Size: {{dataset}}, {{operator}}",
|
| 411 |
+
)
|
| 412 |
+
],
|
| 413 |
+
fill=0,
|
| 414 |
+
stack=True,
|
| 415 |
+
),
|
| 416 |
+
Panel(
|
| 417 |
+
id=15,
|
| 418 |
+
title="Operator Internal Outqueue Size (Blocks)",
|
| 419 |
+
description="Number of blocks in operator's internal output queue",
|
| 420 |
+
unit="blocks",
|
| 421 |
+
targets=[
|
| 422 |
+
Target(
|
| 423 |
+
expr="sum(ray_data_obj_store_mem_internal_outqueue_blocks{{{global_filters}}}) by (dataset, operator)",
|
| 424 |
+
legend="Number of Blocks: {{dataset}}, {{operator}}",
|
| 425 |
+
)
|
| 426 |
+
],
|
| 427 |
+
fill=0,
|
| 428 |
+
stack=False,
|
| 429 |
+
),
|
| 430 |
+
Panel(
|
| 431 |
+
id=16,
|
| 432 |
+
title="Operator Internal Outqueue Size (Bytes)",
|
| 433 |
+
description=(
|
| 434 |
+
"Byte size of output blocks in the operator's internal output queue."
|
| 435 |
+
),
|
| 436 |
+
unit="bytes",
|
| 437 |
+
targets=[
|
| 438 |
+
Target(
|
| 439 |
+
expr="sum(ray_data_obj_store_mem_internal_outqueue{{{global_filters}}}) by (dataset, operator)",
|
| 440 |
+
legend="Bytes Size: {{dataset}}, {{operator}}",
|
| 441 |
+
)
|
| 442 |
+
],
|
| 443 |
+
fill=0,
|
| 444 |
+
stack=True,
|
| 445 |
+
),
|
| 446 |
+
Panel(
|
| 447 |
+
id=34,
|
| 448 |
+
title="Size of Blocks used in Pending Tasks (Bytes)",
|
| 449 |
+
description="Byte size of input blocks used by pending tasks.",
|
| 450 |
+
unit="bytes",
|
| 451 |
+
targets=[
|
| 452 |
+
Target(
|
| 453 |
+
expr="sum(ray_data_obj_store_mem_pending_task_inputs{{{global_filters}}}) by (dataset, operator)",
|
| 454 |
+
legend="Bytes Size: {{dataset}}, {{operator}}",
|
| 455 |
+
)
|
| 456 |
+
],
|
| 457 |
+
fill=0,
|
| 458 |
+
stack=True,
|
| 459 |
+
),
|
| 460 |
+
Panel(
|
| 461 |
+
id=35,
|
| 462 |
+
title="Freed Memory in Object Store (Bytes)",
|
| 463 |
+
description="Byte size of freed memory in object store.",
|
| 464 |
+
unit="bytes",
|
| 465 |
+
targets=[
|
| 466 |
+
Target(
|
| 467 |
+
expr="sum(ray_data_obj_store_mem_freed{{{global_filters}}}) by (dataset, operator)",
|
| 468 |
+
legend="Bytes Size: {{dataset}}, {{operator}}",
|
| 469 |
+
)
|
| 470 |
+
],
|
| 471 |
+
fill=0,
|
| 472 |
+
stack=True,
|
| 473 |
+
),
|
| 474 |
+
Panel(
|
| 475 |
+
id=36,
|
| 476 |
+
title="Spilled Memory in Object Store (Bytes)",
|
| 477 |
+
description="Byte size of spilled memory in object store.",
|
| 478 |
+
unit="bytes",
|
| 479 |
+
targets=[
|
| 480 |
+
Target(
|
| 481 |
+
expr="sum(ray_data_obj_store_mem_spilled{{{global_filters}}}) by (dataset, operator)",
|
| 482 |
+
legend="Bytes Size: {{dataset}}, {{operator}}",
|
| 483 |
+
)
|
| 484 |
+
],
|
| 485 |
+
fill=0,
|
| 486 |
+
stack=True,
|
| 487 |
+
),
|
| 488 |
+
# Ray Data Metrics (Iteration)
|
| 489 |
+
Panel(
|
| 490 |
+
id=12,
|
| 491 |
+
title="Iteration Initialization Time",
|
| 492 |
+
description="Seconds spent in iterator initialization code",
|
| 493 |
+
unit="seconds",
|
| 494 |
+
targets=[
|
| 495 |
+
Target(
|
| 496 |
+
expr="sum(ray_data_iter_initialize_seconds{{{global_filters}}}) by (dataset)",
|
| 497 |
+
legend="Seconds: {{dataset}}, {{operator}}",
|
| 498 |
+
)
|
| 499 |
+
],
|
| 500 |
+
fill=0,
|
| 501 |
+
stack=False,
|
| 502 |
+
),
|
| 503 |
+
Panel(
|
| 504 |
+
id=9,
|
| 505 |
+
title="Iteration Blocked Time",
|
| 506 |
+
description="Seconds user thread is blocked by iter_batches()",
|
| 507 |
+
unit="seconds",
|
| 508 |
+
targets=[
|
| 509 |
+
Target(
|
| 510 |
+
expr="sum(ray_data_iter_total_blocked_seconds{{{global_filters}}}) by (dataset)",
|
| 511 |
+
legend="Seconds: {{dataset}}",
|
| 512 |
+
)
|
| 513 |
+
],
|
| 514 |
+
fill=0,
|
| 515 |
+
stack=False,
|
| 516 |
+
),
|
| 517 |
+
Panel(
|
| 518 |
+
id=10,
|
| 519 |
+
title="Iteration User Time",
|
| 520 |
+
description="Seconds spent in user code",
|
| 521 |
+
unit="seconds",
|
| 522 |
+
targets=[
|
| 523 |
+
Target(
|
| 524 |
+
expr="sum(ray_data_iter_user_seconds{{{global_filters}}}) by (dataset)",
|
| 525 |
+
legend="Seconds: {{dataset}}",
|
| 526 |
+
)
|
| 527 |
+
],
|
| 528 |
+
fill=0,
|
| 529 |
+
stack=False,
|
| 530 |
+
),
|
| 531 |
+
# Ray Data Metrics (Miscellaneous)
|
| 532 |
+
]
|
| 533 |
+
|
| 534 |
+
ids = []
|
| 535 |
+
for panel in DATA_GRAFANA_PANELS:
|
| 536 |
+
ids.append(panel.id)
|
| 537 |
+
assert len(ids) == len(
|
| 538 |
+
set(ids)
|
| 539 |
+
), f"Duplicated id found. Use unique id for each panel. {ids}"
|
| 540 |
+
|
| 541 |
+
data_dashboard_config = DashboardConfig(
|
| 542 |
+
name="DATA",
|
| 543 |
+
default_uid="rayDataDashboard",
|
| 544 |
+
panels=DATA_GRAFANA_PANELS,
|
| 545 |
+
standard_global_filters=[
|
| 546 |
+
'dataset=~"$DatasetID"',
|
| 547 |
+
'SessionName=~"$SessionName"',
|
| 548 |
+
'ray_io_cluster=~"$Cluster"',
|
| 549 |
+
],
|
| 550 |
+
base_json_file_name="data_grafana_dashboard_base.json",
|
| 551 |
+
)
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py
ADDED
|
@@ -0,0 +1,478 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ruff: noqa: E501
|
| 2 |
+
|
| 3 |
+
from ray.dashboard.modules.metrics.dashboards.common import (
|
| 4 |
+
DashboardConfig,
|
| 5 |
+
Panel,
|
| 6 |
+
Target,
|
| 7 |
+
)
|
| 8 |
+
|
| 9 |
+
"""
|
| 10 |
+
Queries for autoscaler resources.
|
| 11 |
+
"""
|
| 12 |
+
# Note: MAX & USED resources are reported from raylet to provide the most up to date information.
|
| 13 |
+
# But MAX + PENDING data is coming from the autoscaler. That said, MAX + PENDING can be
|
| 14 |
+
# more outdated. it is harmless because the actual MAX will catch up with MAX + PENDING
|
| 15 |
+
# eventually.
|
| 16 |
+
MAX_CPUS = 'sum(autoscaler_cluster_resources{{resource="CPU",{global_filters}}})'
|
| 17 |
+
PENDING_CPUS = 'sum(autoscaler_pending_resources{{resource="CPU",{global_filters}}})'
|
| 18 |
+
MAX_GPUS = 'sum(autoscaler_cluster_resources{{resource="GPU",{global_filters}}})'
|
| 19 |
+
PENDING_GPUS = 'sum(autoscaler_pending_resources{{resource="GPU",{global_filters}}})'
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def max_plus_pending(max_resource, pending_resource):
|
| 23 |
+
return f"({max_resource} or vector(0)) + ({pending_resource} or vector(0))"
|
| 24 |
+
|
| 25 |
+
|
| 26 |
+
MAX_PLUS_PENDING_CPUS = max_plus_pending(MAX_CPUS, PENDING_CPUS)
|
| 27 |
+
MAX_PLUS_PENDING_GPUS = max_plus_pending(MAX_GPUS, PENDING_GPUS)
|
| 28 |
+
|
| 29 |
+
|
| 30 |
+
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
| 31 |
+
# IMPORTANT: Please keep this in sync with Metrics.tsx and ray-metrics.rst
|
| 32 |
+
# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
|
| 33 |
+
DEFAULT_GRAFANA_PANELS = [
|
| 34 |
+
Panel(
|
| 35 |
+
id=26,
|
| 36 |
+
title="Scheduler Task State",
|
| 37 |
+
description="Current number of tasks in a particular state.\n\nState: the task state, as described by rpc::TaskState proto in common.proto. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.",
|
| 38 |
+
unit="tasks",
|
| 39 |
+
targets=[
|
| 40 |
+
Target(
|
| 41 |
+
expr='sum(max_over_time(ray_tasks{{IsRetry="0",State=~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (State), 0)',
|
| 42 |
+
legend="{{State}}",
|
| 43 |
+
),
|
| 44 |
+
Target(
|
| 45 |
+
expr='sum(max_over_time(ray_tasks{{IsRetry!="0",State=~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry!="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (State), 0)',
|
| 46 |
+
legend="{{State}} (retry)",
|
| 47 |
+
),
|
| 48 |
+
],
|
| 49 |
+
fill=0,
|
| 50 |
+
stack=False,
|
| 51 |
+
),
|
| 52 |
+
Panel(
|
| 53 |
+
id=35,
|
| 54 |
+
title="Requested Live Tasks by Name",
|
| 55 |
+
description="Current number of (live) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.",
|
| 56 |
+
unit="tasks",
|
| 57 |
+
targets=[
|
| 58 |
+
Target(
|
| 59 |
+
expr='clamp_min(sum(ray_tasks{{IsRetry="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (Name), 0)',
|
| 60 |
+
legend="{{Name}}",
|
| 61 |
+
),
|
| 62 |
+
Target(
|
| 63 |
+
expr='clamp_min(sum(ray_tasks{{IsRetry!="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (Name), 0)',
|
| 64 |
+
legend="{{Name}} (retry)",
|
| 65 |
+
),
|
| 66 |
+
],
|
| 67 |
+
fill=0,
|
| 68 |
+
stack=False,
|
| 69 |
+
),
|
| 70 |
+
Panel(
|
| 71 |
+
id=38,
|
| 72 |
+
title="Running Tasks by Name",
|
| 73 |
+
description="Current number of (running) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.",
|
| 74 |
+
unit="tasks",
|
| 75 |
+
targets=[
|
| 76 |
+
Target(
|
| 77 |
+
expr='clamp_min(sum(ray_tasks{{IsRetry="0",State=~"RUNNING*",instance=~"$Instance",{global_filters}}}) by (Name), 0)',
|
| 78 |
+
legend="{{Name}}",
|
| 79 |
+
),
|
| 80 |
+
Target(
|
| 81 |
+
expr='clamp_min(sum(ray_tasks{{IsRetry!="0",State=~"RUNNING*",instance=~"$Instance",{global_filters}}}) by (Name), 0)',
|
| 82 |
+
legend="{{Name}} (retry)",
|
| 83 |
+
),
|
| 84 |
+
],
|
| 85 |
+
fill=0,
|
| 86 |
+
stack=False,
|
| 87 |
+
),
|
| 88 |
+
Panel(
|
| 89 |
+
id=33,
|
| 90 |
+
title="Scheduler Actor State",
|
| 91 |
+
description='Note: not impacted by "Instance" variable.\n\nCurrent number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.',
|
| 92 |
+
unit="actors",
|
| 93 |
+
targets=[
|
| 94 |
+
Target(
|
| 95 |
+
expr='sum(ray_actors{{Source="gcs",{global_filters}}}) by (State)',
|
| 96 |
+
legend="{{State}}",
|
| 97 |
+
)
|
| 98 |
+
],
|
| 99 |
+
),
|
| 100 |
+
Panel(
|
| 101 |
+
id=42,
|
| 102 |
+
title="Live Actor State",
|
| 103 |
+
description="Current number of alive actors in a particular state.\n\nState: IDLE, RUNNING_TASK, RUNNING_IN_RAY_GET, RUNNING_IN_RAY_WAIT",
|
| 104 |
+
unit="actors",
|
| 105 |
+
targets=[
|
| 106 |
+
Target(
|
| 107 |
+
expr='sum(ray_actors{{Source="executor",NodeAddress=~"$Instance",{global_filters}}}) by (State)',
|
| 108 |
+
legend="{{State}}",
|
| 109 |
+
)
|
| 110 |
+
],
|
| 111 |
+
),
|
| 112 |
+
Panel(
|
| 113 |
+
id=36,
|
| 114 |
+
title="Live Actors by Name",
|
| 115 |
+
description="Current number of alive actors with a particular name.",
|
| 116 |
+
unit="actors",
|
| 117 |
+
targets=[
|
| 118 |
+
Target(
|
| 119 |
+
expr='sum(ray_actors{{State!="DEAD",Source="executor",NodeAddress=~"$Instance",{global_filters}}}) by (Name)',
|
| 120 |
+
legend="{{Name}}",
|
| 121 |
+
)
|
| 122 |
+
],
|
| 123 |
+
),
|
| 124 |
+
Panel(
|
| 125 |
+
id=27,
|
| 126 |
+
title="Scheduler CPUs (logical slots)",
|
| 127 |
+
description="Logical CPU usage of Ray. The dotted line indicates the total number of CPUs. The logical CPU is allocated by `num_cpus` arguments from tasks and actors. PENDING means the number of CPUs that will be available when new nodes are up after the autoscaler scales up.\n\nNOTE: Ray's logical CPU is different from physical CPU usage. Ray's logical CPU is allocated by `num_cpus` arguments.",
|
| 128 |
+
unit="cores",
|
| 129 |
+
targets=[
|
| 130 |
+
Target(
|
| 131 |
+
expr='sum(ray_resources{{Name="CPU",State="USED",instance=~"$Instance",{global_filters}}}) by (instance)',
|
| 132 |
+
legend="CPU Usage: {{instance}}",
|
| 133 |
+
),
|
| 134 |
+
Target(
|
| 135 |
+
expr='sum(ray_resources{{Name="CPU",instance=~"$Instance",{global_filters}}})',
|
| 136 |
+
legend="MAX",
|
| 137 |
+
),
|
| 138 |
+
# If max + pending > max, we display this value.
|
| 139 |
+
# (A and predicate) means to return A when the predicate satisfies in PromSql.
|
| 140 |
+
Target(
|
| 141 |
+
expr=f"({MAX_PLUS_PENDING_CPUS} and {MAX_PLUS_PENDING_CPUS} > ({MAX_CPUS} or vector(0)))",
|
| 142 |
+
legend="MAX + PENDING",
|
| 143 |
+
),
|
| 144 |
+
],
|
| 145 |
+
),
|
| 146 |
+
Panel(
|
| 147 |
+
id=29,
|
| 148 |
+
title="Object Store Memory",
|
| 149 |
+
description="Object store memory usage by location. The dotted line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate memory-mapped page, SPILLED to indicate spillage to disk, and WORKER_HEAP for objects small enough to be inlined in worker memory. Refer to metric_defs.cc for more information.",
|
| 150 |
+
unit="bytes",
|
| 151 |
+
targets=[
|
| 152 |
+
Target(
|
| 153 |
+
expr='sum(ray_object_store_memory{{instance=~"$Instance",{global_filters}}}) by (Location)',
|
| 154 |
+
legend="{{Location}}",
|
| 155 |
+
),
|
| 156 |
+
Target(
|
| 157 |
+
expr='sum(ray_resources{{Name="object_store_memory",instance=~"$Instance",{global_filters}}})',
|
| 158 |
+
legend="MAX",
|
| 159 |
+
),
|
| 160 |
+
],
|
| 161 |
+
),
|
| 162 |
+
Panel(
|
| 163 |
+
id=28,
|
| 164 |
+
title="Scheduler GPUs (logical slots)",
|
| 165 |
+
description="Logical GPU usage of Ray. The dotted line indicates the total number of GPUs. The logical GPU is allocated by `num_gpus` arguments from tasks and actors. PENDING means the number of GPUs that will be available when new nodes are up after the autoscaler scales up.",
|
| 166 |
+
unit="GPUs",
|
| 167 |
+
targets=[
|
| 168 |
+
Target(
|
| 169 |
+
expr='ray_resources{{Name="GPU",State="USED",instance=~"$Instance",{global_filters}}}',
|
| 170 |
+
legend="GPU Usage: {{instance}}",
|
| 171 |
+
),
|
| 172 |
+
Target(
|
| 173 |
+
expr='sum(ray_resources{{Name="GPU",instance=~"$Instance",{global_filters}}})',
|
| 174 |
+
legend="MAX",
|
| 175 |
+
),
|
| 176 |
+
# If max + pending > max, we display this value.
|
| 177 |
+
# (A and predicate) means to return A when the predicate satisfies in PromSql.
|
| 178 |
+
Target(
|
| 179 |
+
expr=f"({MAX_PLUS_PENDING_GPUS} and {MAX_PLUS_PENDING_GPUS} > ({MAX_GPUS} or vector(0)))",
|
| 180 |
+
legend="MAX + PENDING",
|
| 181 |
+
),
|
| 182 |
+
],
|
| 183 |
+
),
|
| 184 |
+
Panel(
|
| 185 |
+
id=40,
|
| 186 |
+
title="Scheduler Placement Groups",
|
| 187 |
+
description='Note: not impacted by "Instance" variable.\n\nCurrent number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.',
|
| 188 |
+
unit="placement groups",
|
| 189 |
+
targets=[
|
| 190 |
+
Target(
|
| 191 |
+
expr="sum(ray_placement_groups{{{global_filters}}}) by (State)",
|
| 192 |
+
legend="{{State}}",
|
| 193 |
+
)
|
| 194 |
+
],
|
| 195 |
+
),
|
| 196 |
+
Panel(
|
| 197 |
+
id=2,
|
| 198 |
+
title="Node CPU (hardware utilization)",
|
| 199 |
+
description="",
|
| 200 |
+
unit="cores",
|
| 201 |
+
targets=[
|
| 202 |
+
Target(
|
| 203 |
+
expr='ray_node_cpu_utilization{{instance=~"$Instance", IsHeadNode="false", {global_filters}}} * ray_node_cpu_count{{instance=~"$Instance",{global_filters}}} / 100',
|
| 204 |
+
legend="CPU Usage: {{instance}}",
|
| 205 |
+
),
|
| 206 |
+
Target(
|
| 207 |
+
expr='ray_node_cpu_utilization{{instance=~"$Instance", IsHeadNode="true", {global_filters}}} * ray_node_cpu_count{{instance=~"$Instance",{global_filters}}} / 100',
|
| 208 |
+
legend="CPU Usage: {{instance}} (head)",
|
| 209 |
+
),
|
| 210 |
+
Target(
|
| 211 |
+
expr='sum(ray_node_cpu_count{{instance=~"$Instance",{global_filters}}})',
|
| 212 |
+
legend="MAX",
|
| 213 |
+
),
|
| 214 |
+
],
|
| 215 |
+
),
|
| 216 |
+
Panel(
|
| 217 |
+
id=8,
|
| 218 |
+
title="Node GPU (hardware utilization)",
|
| 219 |
+
description="Node's physical (hardware) GPU usage. The dotted line means the total number of hardware GPUs from the cluster. ",
|
| 220 |
+
unit="GPUs",
|
| 221 |
+
targets=[
|
| 222 |
+
Target(
|
| 223 |
+
expr='ray_node_gpus_utilization{{instance=~"$Instance", IsHeadNode="false", {global_filters}}} / 100',
|
| 224 |
+
legend="GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}",
|
| 225 |
+
),
|
| 226 |
+
Target(
|
| 227 |
+
expr='ray_node_gpus_utilization{{instance=~"$Instance", IsHeadNode="true", {global_filters}}} / 100',
|
| 228 |
+
legend="GPU Usage: {{instance}} (head), gpu.{{GpuIndex}}, {{GpuDeviceName}}",
|
| 229 |
+
),
|
| 230 |
+
Target(
|
| 231 |
+
expr='sum(ray_node_gpus_available{{instance=~"$Instance",{global_filters}}})',
|
| 232 |
+
legend="MAX",
|
| 233 |
+
),
|
| 234 |
+
],
|
| 235 |
+
),
|
| 236 |
+
Panel(
|
| 237 |
+
id=6,
|
| 238 |
+
title="Node Disk",
|
| 239 |
+
description="Node's physical (hardware) disk usage. The dotted line means the total amount of disk space from the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage from the host machine. ",
|
| 240 |
+
unit="bytes",
|
| 241 |
+
targets=[
|
| 242 |
+
Target(
|
| 243 |
+
expr='ray_node_disk_usage{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}',
|
| 244 |
+
legend="Disk Used: {{instance}}",
|
| 245 |
+
),
|
| 246 |
+
Target(
|
| 247 |
+
expr='ray_node_disk_usage{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}',
|
| 248 |
+
legend="Disk Used: {{instance}} (head)",
|
| 249 |
+
),
|
| 250 |
+
Target(
|
| 251 |
+
expr='sum(ray_node_disk_free{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}})',
|
| 252 |
+
legend="MAX",
|
| 253 |
+
),
|
| 254 |
+
],
|
| 255 |
+
),
|
| 256 |
+
Panel(
|
| 257 |
+
id=32,
|
| 258 |
+
title="Node Disk IO Speed",
|
| 259 |
+
description="Disk IO per node.",
|
| 260 |
+
unit="Bps",
|
| 261 |
+
targets=[
|
| 262 |
+
Target(
|
| 263 |
+
expr='ray_node_disk_io_write_speed{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}',
|
| 264 |
+
legend="Write: {{instance}}",
|
| 265 |
+
),
|
| 266 |
+
Target(
|
| 267 |
+
expr='ray_node_disk_io_write_speed{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}',
|
| 268 |
+
legend="Write: {{instance}} (head)",
|
| 269 |
+
),
|
| 270 |
+
Target(
|
| 271 |
+
expr='ray_node_disk_io_read_speed{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}',
|
| 272 |
+
legend="Read: {{instance}}",
|
| 273 |
+
),
|
| 274 |
+
Target(
|
| 275 |
+
expr='ray_node_disk_io_read_speed{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}',
|
| 276 |
+
legend="Read: {{instance}} (head)",
|
| 277 |
+
),
|
| 278 |
+
],
|
| 279 |
+
),
|
| 280 |
+
Panel(
|
| 281 |
+
id=4,
|
| 282 |
+
title="Node Memory (heap + object store)",
|
| 283 |
+
description="The physical (hardware) memory usage for each node. The dotted line means the total amount of memory from the cluster. Node memory is a sum of object store memory (shared memory) and heap memory.\n\nNote: If Ray is deployed within a container, the total memory could be lower than the host machine because Ray may reserve some additional memory space outside the container.",
|
| 284 |
+
unit="bytes",
|
| 285 |
+
targets=[
|
| 286 |
+
Target(
|
| 287 |
+
expr='ray_node_mem_used{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}',
|
| 288 |
+
legend="Memory Used: {{instance}}",
|
| 289 |
+
),
|
| 290 |
+
Target(
|
| 291 |
+
expr='ray_node_mem_used{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}',
|
| 292 |
+
legend="Memory Used: {{instance}} (head)",
|
| 293 |
+
),
|
| 294 |
+
Target(
|
| 295 |
+
expr='sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})',
|
| 296 |
+
legend="MAX",
|
| 297 |
+
),
|
| 298 |
+
],
|
| 299 |
+
),
|
| 300 |
+
Panel(
|
| 301 |
+
id=48,
|
| 302 |
+
title="Node Memory Percentage (heap + object store)",
|
| 303 |
+
description="The percentage of physical (hardware) memory usage for each node.",
|
| 304 |
+
unit="%",
|
| 305 |
+
targets=[
|
| 306 |
+
Target(
|
| 307 |
+
expr='ray_node_mem_used{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}/ray_node_mem_total{{instance=~"$Instance", IsHeadNode="false", {global_filters}}} * 100',
|
| 308 |
+
legend="Memory Used: {{instance}}",
|
| 309 |
+
),
|
| 310 |
+
Target(
|
| 311 |
+
expr='ray_node_mem_used{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}/ray_node_mem_total{{instance=~"$Instance", IsHeadNode="true", {global_filters}}} * 100',
|
| 312 |
+
legend="Memory Used: {{instance}} (head)",
|
| 313 |
+
),
|
| 314 |
+
],
|
| 315 |
+
fill=0,
|
| 316 |
+
stack=False,
|
| 317 |
+
),
|
| 318 |
+
Panel(
|
| 319 |
+
id=44,
|
| 320 |
+
title="Node Out of Memory Failures by Name",
|
| 321 |
+
description="The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and the name. https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html.",
|
| 322 |
+
unit="failures",
|
| 323 |
+
targets=[
|
| 324 |
+
Target(
|
| 325 |
+
expr='ray_memory_manager_worker_eviction_total{{instance=~"$Instance",{global_filters}}}',
|
| 326 |
+
legend="OOM Killed: {{Name}}, {{instance}}",
|
| 327 |
+
),
|
| 328 |
+
],
|
| 329 |
+
),
|
| 330 |
+
Panel(
|
| 331 |
+
id=34,
|
| 332 |
+
title="Node Memory by Component",
|
| 333 |
+
description="The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS-SHM per Ray component, which corresponds to an approximate memory usage per proc. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.",
|
| 334 |
+
unit="bytes",
|
| 335 |
+
targets=[
|
| 336 |
+
Target(
|
| 337 |
+
expr='(sum(ray_component_rss_mb{{instance=~"$Instance",{global_filters}}} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{{instance=~"$Instance",{global_filters}}}) by (Component))',
|
| 338 |
+
legend="{{Component}}",
|
| 339 |
+
),
|
| 340 |
+
Target(
|
| 341 |
+
expr='sum(ray_node_mem_shared_bytes{{instance=~"$Instance",{global_filters}}})',
|
| 342 |
+
legend="shared_memory",
|
| 343 |
+
),
|
| 344 |
+
Target(
|
| 345 |
+
expr='sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})',
|
| 346 |
+
legend="MAX",
|
| 347 |
+
),
|
| 348 |
+
],
|
| 349 |
+
),
|
| 350 |
+
Panel(
|
| 351 |
+
id=37,
|
| 352 |
+
title="Node CPU by Component",
|
| 353 |
+
description="The physical (hardware) CPU usage across the cluster, broken down by component. This reports the summed CPU usage per Ray component. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process (that contains method names) names of running tasks/actors.",
|
| 354 |
+
unit="cores",
|
| 355 |
+
targets=[
|
| 356 |
+
Target(
|
| 357 |
+
# ray_component_cpu_percentage returns a percentage that can be > 100. It means that it uses more than 1 CPU.
|
| 358 |
+
expr='sum(ray_component_cpu_percentage{{instance=~"$Instance",{global_filters}}}) by (Component) / 100',
|
| 359 |
+
legend="{{Component}}",
|
| 360 |
+
),
|
| 361 |
+
Target(
|
| 362 |
+
expr='sum(ray_node_cpu_count{{instance=~"$Instance",{global_filters}}})',
|
| 363 |
+
legend="MAX",
|
| 364 |
+
),
|
| 365 |
+
],
|
| 366 |
+
),
|
| 367 |
+
Panel(
|
| 368 |
+
id=18,
|
| 369 |
+
title="Node GPU Memory (GRAM)",
|
| 370 |
+
description="The physical (hardware) GPU memory usage for each node. The dotted line means the total amount of GPU memory from the cluster.",
|
| 371 |
+
unit="bytes",
|
| 372 |
+
targets=[
|
| 373 |
+
Target(
|
| 374 |
+
expr='ray_node_gram_used{{instance=~"$Instance",{global_filters}}} * 1024 * 1024',
|
| 375 |
+
legend="Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}",
|
| 376 |
+
),
|
| 377 |
+
Target(
|
| 378 |
+
expr='(sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}})) * 1024 * 1024',
|
| 379 |
+
legend="MAX",
|
| 380 |
+
),
|
| 381 |
+
],
|
| 382 |
+
),
|
| 383 |
+
Panel(
|
| 384 |
+
id=20,
|
| 385 |
+
title="Node Network",
|
| 386 |
+
description="Network speed per node",
|
| 387 |
+
unit="Bps",
|
| 388 |
+
targets=[
|
| 389 |
+
Target(
|
| 390 |
+
expr='ray_node_network_receive_speed{{instance=~"$Instance",{global_filters}}}',
|
| 391 |
+
legend="Recv: {{instance}}",
|
| 392 |
+
),
|
| 393 |
+
Target(
|
| 394 |
+
expr='ray_node_network_send_speed{{instance=~"$Instance",{global_filters}}}',
|
| 395 |
+
legend="Send: {{instance}}",
|
| 396 |
+
),
|
| 397 |
+
],
|
| 398 |
+
),
|
| 399 |
+
Panel(
|
| 400 |
+
id=24,
|
| 401 |
+
title="Node Count",
|
| 402 |
+
description='Note: not impacted by "Instance" variable.\n\nA total number of active failed, and pending nodes from the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node will get into the failed state if it cannot be provided (e.g., there\'s no available node from the cloud provider) or failed to setup (e.g., setup_commands have errors). \n\nPending: A node is being started by the Ray cluster launcher. The node is unavailable now because it is being provisioned and initialized.',
|
| 403 |
+
unit="nodes",
|
| 404 |
+
targets=[
|
| 405 |
+
Target(
|
| 406 |
+
expr="sum(autoscaler_active_nodes{{{global_filters}}}) by (NodeType)",
|
| 407 |
+
legend="Active Nodes: {{NodeType}}",
|
| 408 |
+
),
|
| 409 |
+
Target(
|
| 410 |
+
expr="sum(autoscaler_recently_failed_nodes{{{global_filters}}}) by (NodeType)",
|
| 411 |
+
legend="Failed Nodes: {{NodeType}}",
|
| 412 |
+
),
|
| 413 |
+
Target(
|
| 414 |
+
expr="sum(autoscaler_pending_nodes{{{global_filters}}}) by (NodeType)",
|
| 415 |
+
legend="Pending Nodes: {{NodeType}}",
|
| 416 |
+
),
|
| 417 |
+
],
|
| 418 |
+
),
|
| 419 |
+
Panel(
|
| 420 |
+
id=41,
|
| 421 |
+
title="Cluster Utilization",
|
| 422 |
+
description="Aggregated utilization of all physical resources (CPU, GPU, memory, disk, or etc.) across the cluster.",
|
| 423 |
+
unit="%",
|
| 424 |
+
targets=[
|
| 425 |
+
# CPU
|
| 426 |
+
Target(
|
| 427 |
+
expr='avg(ray_node_cpu_utilization{{instance=~"$Instance",{global_filters}}})',
|
| 428 |
+
legend="CPU (physical)",
|
| 429 |
+
),
|
| 430 |
+
# GPU
|
| 431 |
+
Target(
|
| 432 |
+
expr='sum(ray_node_gpus_utilization{{instance=~"$Instance",{global_filters}}}) / on() (sum(autoscaler_cluster_resources{{resource="GPU",instance=~"$Instance",{global_filters}}}) or vector(0))',
|
| 433 |
+
legend="GPU (physical)",
|
| 434 |
+
),
|
| 435 |
+
# Memory
|
| 436 |
+
Target(
|
| 437 |
+
expr='sum(ray_node_mem_used{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})) * 100',
|
| 438 |
+
legend="Memory (RAM)",
|
| 439 |
+
),
|
| 440 |
+
# GRAM
|
| 441 |
+
Target(
|
| 442 |
+
expr='sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}})) * 100',
|
| 443 |
+
legend="GRAM",
|
| 444 |
+
),
|
| 445 |
+
# Object Store
|
| 446 |
+
Target(
|
| 447 |
+
expr='sum(ray_object_store_memory{{instance=~"$Instance",{global_filters}}}) / on() sum(ray_resources{{Name="object_store_memory",instance=~"$Instance",{global_filters}}}) * 100',
|
| 448 |
+
legend="Object Store Memory",
|
| 449 |
+
),
|
| 450 |
+
# Disk
|
| 451 |
+
Target(
|
| 452 |
+
expr='sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_disk_free{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}})) * 100',
|
| 453 |
+
legend="Disk",
|
| 454 |
+
),
|
| 455 |
+
],
|
| 456 |
+
fill=0,
|
| 457 |
+
stack=False,
|
| 458 |
+
),
|
| 459 |
+
]
|
| 460 |
+
|
| 461 |
+
|
| 462 |
+
ids = []
|
| 463 |
+
for panel in DEFAULT_GRAFANA_PANELS:
|
| 464 |
+
ids.append(panel.id)
|
| 465 |
+
assert len(ids) == len(
|
| 466 |
+
set(ids)
|
| 467 |
+
), f"Duplicated id found. Use unique id for each panel. {ids}"
|
| 468 |
+
|
| 469 |
+
default_dashboard_config = DashboardConfig(
|
| 470 |
+
name="DEFAULT",
|
| 471 |
+
default_uid="rayDefaultDashboard",
|
| 472 |
+
panels=DEFAULT_GRAFANA_PANELS,
|
| 473 |
+
standard_global_filters=[
|
| 474 |
+
'SessionName=~"$SessionName"',
|
| 475 |
+
'ray_io_cluster=~"$Cluster"',
|
| 476 |
+
],
|
| 477 |
+
base_json_file_name="default_grafana_dashboard_base.json",
|
| 478 |
+
)
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_dashboard_panels.py
ADDED
|
@@ -0,0 +1,420 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ruff: noqa: E501
"""Panel definitions for the Ray Serve Grafana dashboard.

Each ``Target.expr`` is a PromQL *template*, not a final query: literal
PromQL label braces are doubled (``{{ }}``) because the string is later
interpolated with ``str.format``, which substitutes the single-brace
``{global_filters}`` placeholder with the dashboard's global label filters.
"""

from ray.dashboard.modules.metrics.dashboards.common import (
    DashboardConfig,
    GridPos,
    Panel,
    Target,
)

SERVE_GRAFANA_PANELS = [
    Panel(
        id=5,
        title="Cluster Utilization",
        description="Aggregated utilization of all physical resources (CPU, GPU, memory, disk, or etc.) across the cluster. Ignores application variable.",
        unit="%",
        targets=[
            # CPU
            Target(
                expr="avg(ray_node_cpu_utilization{{{global_filters}}})",
                legend="CPU (physical)",
            ),
            # GPU
            Target(
                expr="sum(ray_node_gpus_utilization{{{global_filters}}}) / on() (sum(autoscaler_cluster_resources{{resource='GPU',{global_filters}}}) or vector(0))",
                legend="GPU (physical)",
            ),
            # Memory
            Target(
                expr="sum(ray_node_mem_used{{{global_filters}}}) / on() (sum(ray_node_mem_total{{{global_filters}}})) * 100",
                legend="Memory (RAM)",
            ),
            # GRAM
            Target(
                expr="sum(ray_node_gram_used{{{global_filters}}}) / on() (sum(ray_node_gram_available{{{global_filters}}}) + sum(ray_node_gram_used{{{global_filters}}})) * 100",
                legend="GRAM",
            ),
            # Object Store
            Target(
                expr='sum(ray_object_store_memory{{{global_filters}}}) / on() sum(ray_resources{{Name="object_store_memory",{global_filters}}}) * 100',
                legend="Object Store Memory",
            ),
            # Disk
            Target(
                expr="sum(ray_node_disk_usage{{{global_filters}}}) / on() (sum(ray_node_disk_free{{{global_filters}}}) + sum(ray_node_disk_usage{{{global_filters}}})) * 100",
                legend="Disk",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(0, 0, 12, 8),
    ),
    Panel(
        id=7,
        title="QPS per application",
        description="QPS for each selected application.",
        unit="qps",
        targets=[
            # route!~"/-/.*" excludes internal health-check style routes.
            Target(
                expr='sum(rate(ray_serve_num_http_requests_total{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route)',
                legend="{{application, route}}",
            ),
            Target(
                expr='sum(rate(ray_serve_num_grpc_requests_total{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method)',
                legend="{{application, method}}",
            ),
        ],
        grid_pos=GridPos(12, 0, 12, 8),
    ),
    Panel(
        id=8,
        title="Error QPS per application",
        description="Error QPS for each selected application.",
        unit="qps",
        targets=[
            Target(
                expr='sum(rate(ray_serve_num_http_error_requests_total{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route)',
                legend="{{application, route}}",
            ),
            Target(
                expr='sum(rate(ray_serve_num_grpc_error_requests_total{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method)',
                legend="{{application, method}}",
            ),
        ],
        grid_pos=GridPos(0, 1, 12, 8),
    ),
    Panel(
        id=17,
        title="Error QPS per application per error code",
        description="Error QPS for each selected application.",
        unit="qps",
        targets=[
            Target(
                expr='sum(rate(ray_serve_num_http_error_requests_total{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route, error_code)',
                legend="{{application, route, error_code}}",
            ),
            Target(
                expr='sum(rate(ray_serve_num_grpc_error_requests_total{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method, error_code)',
                legend="{{application, method, error_code}}",
            ),
        ],
        grid_pos=GridPos(12, 1, 12, 8),
    ),
    Panel(
        id=12,
        title="P50 latency per application",
        description="P50 latency for selected applications.",
        unit="ms",
        targets=[
            Target(
                expr='histogram_quantile(0.5, sum(rate(ray_serve_http_request_latency_ms_bucket{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route, le))',
                legend="{{application, route}}",
            ),
            Target(
                expr='histogram_quantile(0.5, sum(rate(ray_serve_grpc_request_latency_ms_bucket{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method, le))',
                legend="{{application, method}}",
            ),
            # "Total" combines HTTP + gRPC buckets via a __name__ regex match.
            Target(
                expr='histogram_quantile(0.5, sum(rate({{__name__=~ "ray_serve_(http|grpc)_request_latency_ms_bucket",application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))',
                legend="Total",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(0, 2, 8, 8),
    ),
    Panel(
        id=15,
        title="P90 latency per application",
        description="P90 latency for selected applications.",
        unit="ms",
        targets=[
            Target(
                expr='histogram_quantile(0.9, sum(rate(ray_serve_http_request_latency_ms_bucket{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route, le))',
                legend="{{application, route}}",
            ),
            Target(
                expr='histogram_quantile(0.9, sum(rate(ray_serve_grpc_request_latency_ms_bucket{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method, le))',
                legend="{{application, method}}",
            ),
            # NOTE: the redundant "|ray_serve_grpc_request_latency_ms_bucket"
            # alternative was dropped; "(http|grpc)" already matches it
            # (keeps this expr consistent with the P50 panel).
            Target(
                expr='histogram_quantile(0.9, sum(rate({{__name__=~ "ray_serve_(http|grpc)_request_latency_ms_bucket",application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))',
                legend="Total",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(8, 2, 8, 8),
    ),
    Panel(
        id=16,
        title="P99 latency per application",
        description="P99 latency for selected applications.",
        unit="ms",
        targets=[
            Target(
                expr='histogram_quantile(0.99, sum(rate(ray_serve_http_request_latency_ms_bucket{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route, le))',
                legend="{{application, route}}",
            ),
            Target(
                expr='histogram_quantile(0.99, sum(rate(ray_serve_grpc_request_latency_ms_bucket{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method, le))',
                legend="{{application, method}}",
            ),
            # See P90 panel: redundant regex alternative removed.
            Target(
                expr='histogram_quantile(0.99, sum(rate({{__name__=~ "ray_serve_(http|grpc)_request_latency_ms_bucket",application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))',
                legend="Total",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(16, 2, 8, 8),
    ),
    Panel(
        id=2,
        title="Replicas per deployment",
        description='Number of replicas per deployment. Ignores "Application" variable.',
        unit="replicas",
        targets=[
            Target(
                expr="sum(ray_serve_deployment_replica_healthy{{{global_filters}}}) by (application, deployment)",
                legend="{{application, deployment}}",
            ),
        ],
        grid_pos=GridPos(0, 3, 8, 8),
    ),
    Panel(
        id=13,
        title="QPS per deployment",
        description="QPS for each deployment.",
        unit="qps",
        targets=[
            Target(
                expr='sum(rate(ray_serve_deployment_request_counter_total{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (application, deployment)',
                legend="{{application, deployment}}",
            ),
        ],
        grid_pos=GridPos(8, 3, 8, 8),
    ),
    Panel(
        id=14,
        title="Error QPS per deployment",
        # Typo fix: "deplyoment" -> "deployment".
        description="Error QPS for each deployment.",
        unit="qps",
        targets=[
            Target(
                expr='sum(rate(ray_serve_deployment_error_counter_total{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (application, deployment)',
                legend="{{application, deployment}}",
            ),
        ],
        grid_pos=GridPos(16, 3, 8, 8),
    ),
    Panel(
        id=9,
        title="P50 latency per deployment",
        description="P50 latency per deployment.",
        unit="ms",
        targets=[
            Target(
                expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (application, deployment, le))',
                legend="{{application, deployment}}",
            ),
            Target(
                expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))',
                legend="Total",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(0, 4, 8, 8),
    ),
    Panel(
        id=10,
        title="P90 latency per deployment",
        description="P90 latency per deployment.",
        unit="ms",
        targets=[
            Target(
                expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (application, deployment, le))',
                legend="{{application, deployment}}",
            ),
            Target(
                expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))',
                legend="Total",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(8, 4, 8, 8),
    ),
    Panel(
        id=11,
        title="P99 latency per deployment",
        description="P99 latency per deployment.",
        unit="ms",
        targets=[
            Target(
                expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (application, deployment, le))',
                legend="{{application, deployment}}",
            ),
            Target(
                expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))',
                legend="Total",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(16, 4, 8, 8),
    ),
    Panel(
        id=3,
        title="Queue size per deployment",
        description='Number of requests queued per deployment. Ignores "Application" variable.',
        unit="requests",
        targets=[
            Target(
                expr="sum(ray_serve_deployment_queued_queries{{{global_filters}}}) by (application, deployment)",
                legend="{{application, deployment}}",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(0, 5, 8, 8),
    ),
    Panel(
        id=4,
        title="Node count",
        description='Number of nodes in this cluster. Ignores "Application" variable.',
        unit="nodes",
        targets=[
            # TODO(aguo): Update this to use autoscaler metrics instead
            Target(
                expr="sum(autoscaler_active_nodes{{{global_filters}}}) by (NodeType)",
                legend="Active Nodes: {{NodeType}}",
            ),
            Target(
                expr="sum(autoscaler_recently_failed_nodes{{{global_filters}}}) by (NodeType)",
                legend="Failed Nodes: {{NodeType}}",
            ),
            Target(
                expr="sum(autoscaler_pending_nodes{{{global_filters}}}) by (NodeType)",
                legend="Pending Nodes: {{NodeType}}",
            ),
        ],
        grid_pos=GridPos(8, 5, 8, 8),
    ),
    Panel(
        id=6,
        title="Node network",
        description='Network speed per node. Ignores "Application" variable.',
        unit="Bps",
        targets=[
            Target(
                expr="sum(ray_node_network_receive_speed{{{global_filters}}}) by (instance)",
                legend="Recv: {{instance}}",
            ),
            Target(
                expr="sum(ray_node_network_send_speed{{{global_filters}}}) by (instance)",
                legend="Send: {{instance}}",
            ),
        ],
        fill=1,
        linewidth=2,
        stack=False,
        grid_pos=GridPos(16, 5, 8, 8),
    ),
    Panel(
        id=20,
        title="Ongoing HTTP Requests",
        description="The number of ongoing requests in the HTTP Proxy.",
        unit="requests",
        targets=[
            Target(
                expr="ray_serve_num_ongoing_http_requests{{{global_filters}}}",
                legend="Ongoing HTTP Requests",
            ),
        ],
        grid_pos=GridPos(0, 6, 8, 8),
    ),
    Panel(
        id=21,
        title="Ongoing gRPC Requests",
        description="The number of ongoing requests in the gRPC Proxy.",
        unit="requests",
        targets=[
            Target(
                expr="ray_serve_num_ongoing_grpc_requests{{{global_filters}}}",
                legend="Ongoing gRPC Requests",
            ),
        ],
        grid_pos=GridPos(8, 6, 8, 8),
    ),
    Panel(
        id=22,
        title="Scheduling Tasks",
        description="The number of request scheduling tasks in the router.",
        unit="tasks",
        targets=[
            Target(
                expr="ray_serve_num_scheduling_tasks{{{global_filters}}}",
                legend="Scheduling Tasks",
            ),
        ],
        grid_pos=GridPos(16, 6, 8, 8),
    ),
    Panel(
        id=23,
        title="Scheduling Tasks in Backoff",
        description="The number of request scheduling tasks in the router that are undergoing backoff.",
        unit="tasks",
        targets=[
            Target(
                expr="ray_serve_num_scheduling_tasks_in_backoff{{{global_filters}}}",
                legend="Scheduling Tasks in Backoff",
            ),
        ],
        grid_pos=GridPos(0, 7, 8, 8),
    ),
    Panel(
        id=24,
        title="Controller Control Loop Duration",
        description="The duration of the last control loop.",
        unit="seconds",
        targets=[
            Target(
                expr="ray_serve_controller_control_loop_duration_s{{{global_filters}}}",
                legend="Control Loop Duration",
            ),
        ],
        grid_pos=GridPos(8, 7, 8, 8),
    ),
    Panel(
        id=25,
        title="Number of Control Loops",
        description="The number of control loops performed by the controller. Increases monotonically over the controller's lifetime.",
        unit="loops",
        targets=[
            Target(
                expr="ray_serve_controller_num_control_loops{{{global_filters}}}",
                legend="Control Loops",
            ),
        ],
        grid_pos=GridPos(16, 7, 8, 8),
    ),
]

# Panel ids become Grafana panel ids; they must be unique within the dashboard.
ids = [panel.id for panel in SERVE_GRAFANA_PANELS]
assert len(ids) == len(
    set(ids)
), f"Duplicated id found. Use unique id for each panel. {ids}"

serve_dashboard_config = DashboardConfig(
    name="SERVE",
    default_uid="rayServeDashboard",
    panels=SERVE_GRAFANA_PANELS,
    standard_global_filters=[
        'ray_io_cluster=~"$Cluster"',
    ],
    base_json_file_name="serve_grafana_dashboard_base.json",
)
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_deployment_dashboard_panels.py
ADDED
|
@@ -0,0 +1,259 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# ruff: noqa: E501
"""Panel definitions for the Ray Serve Deployment Grafana dashboard.

Each ``Target.expr`` is a PromQL *template*: literal PromQL label braces are
doubled (``{{ }}``) because the string is later interpolated with
``str.format``, which substitutes the single-brace ``{global_filters}``
placeholder with the dashboard's global label filters.
"""

from ray.dashboard.modules.metrics.dashboards.common import (
    DashboardConfig,
    GridPos,
    Panel,
    Target,
)

SERVE_DEPLOYMENT_GRAFANA_PANELS = [
    Panel(
        id=1,
        title="Replicas per deployment",
        description='Number of replicas per deployment. Ignores "Route" variable.',
        unit="replicas",
        targets=[
            Target(
                expr="sum(ray_serve_deployment_replica_healthy{{{global_filters}}}) by (application, deployment)",
                legend="{{application, deployment}}",
            ),
        ],
        grid_pos=GridPos(0, 0, 8, 8),
    ),
    Panel(
        id=2,
        title="QPS per replica",
        description="QPS for each replica.",
        unit="qps",
        targets=[
            # route!~"/-/.*" excludes internal health-check style routes.
            Target(
                expr='sum(rate(ray_serve_deployment_request_counter_total{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica)',
                legend="{{replica}}",
            ),
        ],
        grid_pos=GridPos(8, 0, 8, 8),
    ),
    Panel(
        id=3,
        title="Error QPS per replica",
        description="Error QPS for each replica.",
        unit="qps",
        targets=[
            Target(
                expr='sum(rate(ray_serve_deployment_error_counter_total{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica)',
                legend="{{replica}}",
            ),
        ],
        grid_pos=GridPos(16, 0, 8, 8),
    ),
    Panel(
        id=4,
        title="P50 latency per replica",
        description="P50 latency per replica.",
        unit="ms",
        targets=[
            Target(
                expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica, le))',
                legend="{{replica}}",
            ),
            Target(
                expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (le))',
                legend="Total",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(0, 1, 8, 8),
    ),
    Panel(
        id=5,
        title="P90 latency per replica",
        description="P90 latency per replica.",
        unit="ms",
        targets=[
            Target(
                expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica, le))',
                legend="{{replica}}",
            ),
            Target(
                expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (le))',
                legend="Total",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(8, 1, 8, 8),
    ),
    Panel(
        id=6,
        title="P99 latency per replica",
        description="P99 latency per replica.",
        unit="ms",
        targets=[
            Target(
                expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica, le))',
                legend="{{replica}}",
            ),
            # Consistency fix: the "Total" target previously omitted the
            # route!~"/-/.*" exclusion that the per-replica target and the
            # P50/P90 panels apply, so its aggregate included health-check
            # routes the per-replica series filtered out.
            Target(
                expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (le))',
                legend="Total",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(16, 1, 8, 8),
    ),
    Panel(
        id=7,
        title="Queue size per deployment",
        description='Number of requests queued per deployment. Ignores "Replica" and "Route" variable.',
        unit="requests",
        targets=[
            Target(
                expr="sum(ray_serve_deployment_queued_queries{{{global_filters}}}) by (application, deployment)",
                legend="{{application, deployment}}",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(0, 2, 12, 8),
    ),
    Panel(
        id=8,
        title="Running requests per replica",
        description="Current running requests for each replica.",
        unit="requests",
        targets=[
            Target(
                expr="sum(ray_serve_replica_processing_queries{{{global_filters}}}) by (application, deployment, replica)",
                legend="{{replica}}",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(12, 2, 12, 8),
    ),
    Panel(
        id=9,
        title="Multiplexed models per replica",
        description="The number of multiplexed models for each replica.",
        unit="models",
        targets=[
            Target(
                expr="sum(ray_serve_num_multiplexed_models{{{global_filters}}}) by (application, deployment, replica)",
                legend="{{replica}}",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(0, 3, 8, 8),
    ),
    Panel(
        id=10,
        title="Multiplexed model loads per replica",
        description="The number of times of multiplexed models loaded for each replica.",
        unit="times",
        targets=[
            Target(
                expr="sum(ray_serve_multiplexed_models_load_counter_total{{{global_filters}}}) by (application, deployment, replica)",
                legend="{{replica}}",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(8, 3, 8, 8),
    ),
    Panel(
        id=11,
        title="Multiplexed model unloads per replica",
        description="The number of times of multiplexed models unloaded for each replica.",
        unit="times",
        targets=[
            Target(
                expr="sum(ray_serve_multiplexed_models_unload_counter_total{{{global_filters}}}) by (application, deployment, replica)",
                legend="{{replica}}",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(16, 3, 8, 8),
    ),
    Panel(
        id=12,
        title="P99 latency of multiplexed model loads per replica",
        # Typo fix: "mutliplexed" -> "multiplexed".
        description="P99 latency of multiplexed model load per replica.",
        unit="ms",
        targets=[
            Target(
                expr="histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{{{global_filters}}}[5m])) by (application, deployment, replica, le))",
                legend="{{replica}}",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(0, 4, 8, 8),
    ),
    Panel(
        id=13,
        title="P99 latency of multiplexed model unloads per replica",
        # Typo fix: "mutliplexed" -> "multiplexed".
        description="P99 latency of multiplexed model unload per replica.",
        unit="ms",
        targets=[
            Target(
                expr="histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{{{global_filters}}}[5m])) by (application, deployment, replica, le))",
                legend="{{replica}}",
            ),
        ],
        fill=0,
        stack=False,
        grid_pos=GridPos(8, 4, 8, 8),
    ),
    Panel(
        id=14,
        title="Multiplexed model ids per replica",
        description="The ids of multiplexed models for each replica.",
        unit="model",
        targets=[
            Target(
                expr="ray_serve_registered_multiplexed_model_id{{{global_filters}}}",
                legend="{{replica}}:{{model_id}}",
            ),
        ],
        grid_pos=GridPos(16, 4, 8, 8),
        stack=False,
    ),
    Panel(
        id=15,
        title="Multiplexed model cache hit rate",
        description="The cache hit rate of multiplexed models for the deployment.",
        unit="%",
        targets=[
            Target(
                expr="(1 - sum(rate(ray_serve_multiplexed_models_load_counter_total{{{global_filters}}}[5m]))/sum(rate(ray_serve_multiplexed_get_model_requests_counter_total{{{global_filters}}}[5m])))",
                legend="{{replica}}",
            ),
        ],
        grid_pos=GridPos(0, 5, 8, 8),
    ),
]

# Panel ids become Grafana panel ids; they must be unique within the dashboard.
ids = [panel.id for panel in SERVE_DEPLOYMENT_GRAFANA_PANELS]
assert len(ids) == len(
    set(ids)
), f"Duplicated id found. Use unique id for each panel. {ids}"

serve_deployment_dashboard_config = DashboardConfig(
    name="SERVE_DEPLOYMENT",
    default_uid="rayServeDeploymentDashboard",
    panels=SERVE_DEPLOYMENT_GRAFANA_PANELS,
    standard_global_filters=[
        'application=~"$Application"',
        'deployment=~"$Deployment"',
        'replica=~"$Replica"',
        'ray_io_cluster=~"$Cluster"',
    ],
    base_json_file_name="serve_deployment_grafana_dashboard_base.json",
)
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_deployment_grafana_dashboard_base.json
ADDED
|
@@ -0,0 +1,223 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"annotations": {
|
| 3 |
+
"list": [
|
| 4 |
+
{
|
| 5 |
+
"builtIn": 1,
|
| 6 |
+
"datasource": "-- Grafana --",
|
| 7 |
+
"enable": true,
|
| 8 |
+
"hide": true,
|
| 9 |
+
"iconColor": "rgba(0, 211, 255, 1)",
|
| 10 |
+
"name": "Annotations & Alerts",
|
| 11 |
+
"type": "dashboard"
|
| 12 |
+
}
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
"editable": true,
|
| 16 |
+
"gnetId": null,
|
| 17 |
+
"graphTooltip": 0,
|
| 18 |
+
"iteration": 1667344411089,
|
| 19 |
+
"links": [],
|
| 20 |
+
"panels": [],
|
| 21 |
+
"refresh": false,
|
| 22 |
+
"schemaVersion": 27,
|
| 23 |
+
"style": "dark",
|
| 24 |
+
"tags": [],
|
| 25 |
+
"templating": {
|
| 26 |
+
"list": [
|
| 27 |
+
{
|
| 28 |
+
"current": {
|
| 29 |
+
"selected": false
|
| 30 |
+
},
|
| 31 |
+
"description": "Filter queries to a specific Prometheus type.",
|
| 32 |
+
"hide": 2,
|
| 33 |
+
"includeAll": false,
|
| 34 |
+
"multi": false,
|
| 35 |
+
"name": "datasource",
|
| 36 |
+
"options": [],
|
| 37 |
+
"query": "prometheus",
|
| 38 |
+
"refresh": 1,
|
| 39 |
+
"regex": "",
|
| 40 |
+
"skipUrlSync": false,
|
| 41 |
+
"type": "datasource"
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"allValue": ".*",
|
| 45 |
+
"current": {
|
| 46 |
+
"selected": true,
|
| 47 |
+
"text": [
|
| 48 |
+
"All"
|
| 49 |
+
],
|
| 50 |
+
"value": [
|
| 51 |
+
"$__all"
|
| 52 |
+
]
|
| 53 |
+
},
|
| 54 |
+
"datasource": "${datasource}",
|
| 55 |
+
"definition": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, application)",
|
| 56 |
+
"description": null,
|
| 57 |
+
"error": null,
|
| 58 |
+
"hide": 0,
|
| 59 |
+
"includeAll": true,
|
| 60 |
+
"label": null,
|
| 61 |
+
"multi": true,
|
| 62 |
+
"name": "Application",
|
| 63 |
+
"options": [],
|
| 64 |
+
"query": {
|
| 65 |
+
"query": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, application)",
|
| 66 |
+
"refId": "Prometheus-Instance-Variable-Query"
|
| 67 |
+
},
|
| 68 |
+
"refresh": 2,
|
| 69 |
+
"regex": "",
|
| 70 |
+
"skipUrlSync": false,
|
| 71 |
+
"sort": 0,
|
| 72 |
+
"tagValuesQuery": "",
|
| 73 |
+
"tags": [],
|
| 74 |
+
"tagsQuery": "",
|
| 75 |
+
"type": "query",
|
| 76 |
+
"useTags": false
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"allValue": ".*",
|
| 80 |
+
"current": {
|
| 81 |
+
"selected": true,
|
| 82 |
+
"text": [
|
| 83 |
+
"All"
|
| 84 |
+
],
|
| 85 |
+
"value": [
|
| 86 |
+
"$__all"
|
| 87 |
+
]
|
| 88 |
+
},
|
| 89 |
+
"datasource": "${datasource}",
|
| 90 |
+
"definition": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",{global_filters}}}, deployment)",
|
| 91 |
+
"description": null,
|
| 92 |
+
"error": null,
|
| 93 |
+
"hide": 0,
|
| 94 |
+
"includeAll": true,
|
| 95 |
+
"label": null,
|
| 96 |
+
"multi": true,
|
| 97 |
+
"name": "Deployment",
|
| 98 |
+
"options": [],
|
| 99 |
+
"query": {
|
| 100 |
+
"query": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",{global_filters}}}, deployment)",
|
| 101 |
+
"refId": "Prometheus-Instance-Variable-Query"
|
| 102 |
+
},
|
| 103 |
+
"refresh": 2,
|
| 104 |
+
"regex": "",
|
| 105 |
+
"skipUrlSync": false,
|
| 106 |
+
"sort": 0,
|
| 107 |
+
"tagValuesQuery": "",
|
| 108 |
+
"tags": [],
|
| 109 |
+
"tagsQuery": "",
|
| 110 |
+
"type": "query",
|
| 111 |
+
"useTags": false
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"allValue": ".*",
|
| 115 |
+
"current": {
|
| 116 |
+
"selected": true,
|
| 117 |
+
"text": [
|
| 118 |
+
"All"
|
| 119 |
+
],
|
| 120 |
+
"value": [
|
| 121 |
+
"$__all"
|
| 122 |
+
]
|
| 123 |
+
},
|
| 124 |
+
"datasource": "${datasource}",
|
| 125 |
+
"definition": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",deployment=~\"$Deployment\",{global_filters}}}, replica)",
|
| 126 |
+
"description": null,
|
| 127 |
+
"error": null,
|
| 128 |
+
"hide": 0,
|
| 129 |
+
"includeAll": true,
|
| 130 |
+
"label": null,
|
| 131 |
+
"multi": true,
|
| 132 |
+
"name": "Replica",
|
| 133 |
+
"options": [],
|
| 134 |
+
"query": {
|
| 135 |
+
"query": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",deployment=~\"$Deployment\",{global_filters}}}, replica)",
|
| 136 |
+
"refId": "Prometheus-Instance-Variable-Query"
|
| 137 |
+
},
|
| 138 |
+
"refresh": 2,
|
| 139 |
+
"regex": "",
|
| 140 |
+
"skipUrlSync": false,
|
| 141 |
+
"sort": 0,
|
| 142 |
+
"tagValuesQuery": "",
|
| 143 |
+
"tags": [],
|
| 144 |
+
"tagsQuery": "",
|
| 145 |
+
"type": "query",
|
| 146 |
+
"useTags": false
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"allValue": ".*",
|
| 150 |
+
"current": {
|
| 151 |
+
"selected": true,
|
| 152 |
+
"text": [
|
| 153 |
+
"All"
|
| 154 |
+
],
|
| 155 |
+
"value": [
|
| 156 |
+
"$__all"
|
| 157 |
+
]
|
| 158 |
+
},
|
| 159 |
+
"datasource": "${datasource}",
|
| 160 |
+
"definition": "label_values(ray_serve_deployment_request_counter{{deployment=~\"$Deployment\",{global_filters}}}, route)",
|
| 161 |
+
"description": null,
|
| 162 |
+
"error": null,
|
| 163 |
+
"hide": 0,
|
| 164 |
+
"includeAll": true,
|
| 165 |
+
"label": null,
|
| 166 |
+
"multi": true,
|
| 167 |
+
"name": "Route",
|
| 168 |
+
"options": [],
|
| 169 |
+
"query": {
|
| 170 |
+
"query": "label_values(ray_serve_deployment_request_counter{{deployment=~\"$Deployment\",{global_filters}}}, route)",
|
| 171 |
+
"refId": "Prometheus-Instance-Variable-Query"
|
| 172 |
+
},
|
| 173 |
+
"refresh": 2,
|
| 174 |
+
"regex": "",
|
| 175 |
+
"skipUrlSync": false,
|
| 176 |
+
"sort": 0,
|
| 177 |
+
"tagValuesQuery": "",
|
| 178 |
+
"tags": [],
|
| 179 |
+
"tagsQuery": "",
|
| 180 |
+
"type": "query",
|
| 181 |
+
"useTags": false
|
| 182 |
+
},
|
| 183 |
+
{
|
| 184 |
+
"current": {
|
| 185 |
+
"selected": false
|
| 186 |
+
},
|
| 187 |
+
"datasource": "${datasource}",
|
| 188 |
+
"definition": "label_values(ray_node_network_receive_speed{{{global_filters}}}, ray_io_cluster)",
|
| 189 |
+
"description": "Filter queries to specific Ray clusters for KubeRay. When ingesting metrics across multiple ray clusters, the ray_io_cluster label should be set per cluster. For KubeRay users, this is done automatically with Prometheus PodMonitor.",
|
| 190 |
+
"error": null,
|
| 191 |
+
"hide": 0,
|
| 192 |
+
"includeAll": false,
|
| 193 |
+
"label": null,
|
| 194 |
+
"multi": false,
|
| 195 |
+
"name": "Cluster",
|
| 196 |
+
"options": [],
|
| 197 |
+
"query": {
|
| 198 |
+
"query": "label_values(ray_node_network_receive_speed{{{global_filters}}}, ray_io_cluster)",
|
| 199 |
+
"refId": "StandardVariableQuery"
|
| 200 |
+
},
|
| 201 |
+
"refresh": 2,
|
| 202 |
+
"regex": "",
|
| 203 |
+
"skipUrlSync": false,
|
| 204 |
+
"sort": 2,
|
| 205 |
+
"tagValuesQuery": "",
|
| 206 |
+
"tags": [],
|
| 207 |
+
"tagsQuery": "",
|
| 208 |
+
"type": "query",
|
| 209 |
+
"useTags": false
|
| 210 |
+
}
|
| 211 |
+
]
|
| 212 |
+
},
|
| 213 |
+
"rayMeta": ["excludesSystemRoutes"],
|
| 214 |
+
"time": {
|
| 215 |
+
"from": "now-30m",
|
| 216 |
+
"to": "now"
|
| 217 |
+
},
|
| 218 |
+
"timepicker": {},
|
| 219 |
+
"timezone": "",
|
| 220 |
+
"title": "Serve Deployment Dashboard",
|
| 221 |
+
"uid": "rayServeDeploymentDashboard",
|
| 222 |
+
"version": 1
|
| 223 |
+
}
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_grafana_dashboard_base.json
ADDED
|
@@ -0,0 +1,188 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
{
|
| 2 |
+
"annotations": {
|
| 3 |
+
"list": [
|
| 4 |
+
{
|
| 5 |
+
"builtIn": 1,
|
| 6 |
+
"datasource": "-- Grafana --",
|
| 7 |
+
"enable": true,
|
| 8 |
+
"hide": true,
|
| 9 |
+
"iconColor": "rgba(0, 211, 255, 1)",
|
| 10 |
+
"name": "Annotations & Alerts",
|
| 11 |
+
"type": "dashboard"
|
| 12 |
+
}
|
| 13 |
+
]
|
| 14 |
+
},
|
| 15 |
+
"editable": true,
|
| 16 |
+
"gnetId": null,
|
| 17 |
+
"graphTooltip": 0,
|
| 18 |
+
"iteration": 1667344411089,
|
| 19 |
+
"links": [],
|
| 20 |
+
"panels": [],
|
| 21 |
+
"refresh": false,
|
| 22 |
+
"schemaVersion": 27,
|
| 23 |
+
"style": "dark",
|
| 24 |
+
"tags": [],
|
| 25 |
+
"templating": {
|
| 26 |
+
"list": [
|
| 27 |
+
{
|
| 28 |
+
"current": {
|
| 29 |
+
"selected": false
|
| 30 |
+
},
|
| 31 |
+
"description": "Filter queries of a specific Prometheus type.",
|
| 32 |
+
"hide": 2,
|
| 33 |
+
"includeAll": false,
|
| 34 |
+
"multi": false,
|
| 35 |
+
"name": "datasource",
|
| 36 |
+
"options": [],
|
| 37 |
+
"query": "prometheus",
|
| 38 |
+
"refresh": 1,
|
| 39 |
+
"regex": "",
|
| 40 |
+
"skipUrlSync": false,
|
| 41 |
+
"type": "datasource"
|
| 42 |
+
},
|
| 43 |
+
{
|
| 44 |
+
"allValue": ".*",
|
| 45 |
+
"current": {
|
| 46 |
+
"selected": true,
|
| 47 |
+
"text": [
|
| 48 |
+
"All"
|
| 49 |
+
],
|
| 50 |
+
"value": [
|
| 51 |
+
"$__all"
|
| 52 |
+
]
|
| 53 |
+
},
|
| 54 |
+
"datasource": "${datasource}",
|
| 55 |
+
"definition": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, application)",
|
| 56 |
+
"description": null,
|
| 57 |
+
"error": null,
|
| 58 |
+
"hide": 0,
|
| 59 |
+
"includeAll": true,
|
| 60 |
+
"label": null,
|
| 61 |
+
"multi": true,
|
| 62 |
+
"name": "Application",
|
| 63 |
+
"options": [],
|
| 64 |
+
"query": {
|
| 65 |
+
"query": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, application)",
|
| 66 |
+
"refId": "Prometheus-Instance-Variable-Query"
|
| 67 |
+
},
|
| 68 |
+
"refresh": 2,
|
| 69 |
+
"regex": "",
|
| 70 |
+
"skipUrlSync": false,
|
| 71 |
+
"sort": 0,
|
| 72 |
+
"tagValuesQuery": "",
|
| 73 |
+
"tags": [],
|
| 74 |
+
"tagsQuery": "",
|
| 75 |
+
"type": "query",
|
| 76 |
+
"useTags": false
|
| 77 |
+
},
|
| 78 |
+
{
|
| 79 |
+
"allValue": ".*",
|
| 80 |
+
"current": {
|
| 81 |
+
"selected": true,
|
| 82 |
+
"text": [
|
| 83 |
+
"All"
|
| 84 |
+
],
|
| 85 |
+
"value": [
|
| 86 |
+
"$__all"
|
| 87 |
+
]
|
| 88 |
+
},
|
| 89 |
+
"datasource": "${datasource}",
|
| 90 |
+
"definition": "label_values(ray_serve_num_http_requests_total{{{global_filters}}}, route)",
|
| 91 |
+
"description": null,
|
| 92 |
+
"error": null,
|
| 93 |
+
"hide": 0,
|
| 94 |
+
"includeAll": true,
|
| 95 |
+
"label": "HTTP Route",
|
| 96 |
+
"multi": true,
|
| 97 |
+
"name": "HTTP_Route",
|
| 98 |
+
"options": [],
|
| 99 |
+
"query": {
|
| 100 |
+
"query": "label_values(ray_serve_num_http_requests_total{{{global_filters}}}, route)",
|
| 101 |
+
"refId": "Prometheus-Instance-Variable-Query"
|
| 102 |
+
},
|
| 103 |
+
"refresh": 2,
|
| 104 |
+
"regex": "",
|
| 105 |
+
"skipUrlSync": false,
|
| 106 |
+
"sort": 0,
|
| 107 |
+
"tagValuesQuery": "",
|
| 108 |
+
"tags": [],
|
| 109 |
+
"tagsQuery": "",
|
| 110 |
+
"type": "query",
|
| 111 |
+
"useTags": false
|
| 112 |
+
},
|
| 113 |
+
{
|
| 114 |
+
"allValue": ".*",
|
| 115 |
+
"current": {
|
| 116 |
+
"selected": true,
|
| 117 |
+
"text": [
|
| 118 |
+
"All"
|
| 119 |
+
],
|
| 120 |
+
"value": [
|
| 121 |
+
"$__all"
|
| 122 |
+
]
|
| 123 |
+
},
|
| 124 |
+
"datasource": "${datasource}",
|
| 125 |
+
"definition": "label_values(ray_serve_num_grpc_requests{{{global_filters}}}, method)",
|
| 126 |
+
"description": null,
|
| 127 |
+
"error": null,
|
| 128 |
+
"hide": 0,
|
| 129 |
+
"includeAll": true,
|
| 130 |
+
"label": "gRPC Service Method",
|
| 131 |
+
"multi": true,
|
| 132 |
+
"name": "gRPC_Method",
|
| 133 |
+
"options": [],
|
| 134 |
+
"query": {
|
| 135 |
+
"query": "label_values(ray_serve_num_grpc_requests{{{global_filters}}}, method)",
|
| 136 |
+
"refId": "Prometheus-Instance-Variable-Query"
|
| 137 |
+
},
|
| 138 |
+
"refresh": 2,
|
| 139 |
+
"regex": "",
|
| 140 |
+
"skipUrlSync": false,
|
| 141 |
+
"sort": 0,
|
| 142 |
+
"tagValuesQuery": "",
|
| 143 |
+
"tags": [],
|
| 144 |
+
"tagsQuery": "",
|
| 145 |
+
"type": "query",
|
| 146 |
+
"useTags": false
|
| 147 |
+
},
|
| 148 |
+
{
|
| 149 |
+
"current": {
|
| 150 |
+
"selected": false
|
| 151 |
+
},
|
| 152 |
+
"datasource": "${datasource}",
|
| 153 |
+
"definition": "label_values(ray_node_network_receive_speed{{{global_filters}}}, ray_io_cluster)",
|
| 154 |
+
"description": "Filter queries to specific Ray clusters for KubeRay. When ingesting metrics across multiple ray clusters, the ray_io_cluster label should be set per cluster. For KubeRay users, this is done automatically with Prometheus PodMonitor.",
|
| 155 |
+
"error": null,
|
| 156 |
+
"hide": 0,
|
| 157 |
+
"includeAll": false,
|
| 158 |
+
"label": null,
|
| 159 |
+
"multi": false,
|
| 160 |
+
"name": "Cluster",
|
| 161 |
+
"options": [],
|
| 162 |
+
"query": {
|
| 163 |
+
"query": "label_values(ray_node_network_receive_speed{{{global_filters}}}, ray_io_cluster)",
|
| 164 |
+
"refId": "StandardVariableQuery"
|
| 165 |
+
},
|
| 166 |
+
"refresh": 2,
|
| 167 |
+
"regex": "",
|
| 168 |
+
"skipUrlSync": false,
|
| 169 |
+
"sort": 2,
|
| 170 |
+
"tagValuesQuery": "",
|
| 171 |
+
"tags": [],
|
| 172 |
+
"tagsQuery": "",
|
| 173 |
+
"type": "query",
|
| 174 |
+
"useTags": false
|
| 175 |
+
}
|
| 176 |
+
]
|
| 177 |
+
},
|
| 178 |
+
"rayMeta": ["excludesSystemRoutes"],
|
| 179 |
+
"time": {
|
| 180 |
+
"from": "now-30m",
|
| 181 |
+
"to": "now"
|
| 182 |
+
},
|
| 183 |
+
"timepicker": {},
|
| 184 |
+
"timezone": "",
|
| 185 |
+
"title": "Serve Dashboard",
|
| 186 |
+
"uid": "rayServeDashboard",
|
| 187 |
+
"version": 1
|
| 188 |
+
}
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/export/prometheus/prometheus.yml
ADDED
|
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# my global config
|
| 2 |
+
global:
|
| 3 |
+
scrape_interval: 10s # Set the scrape interval to every 10 seconds. Default is every 1 minute.
|
| 4 |
+
evaluation_interval: 10s # Evaluate rules every 10 seconds. The default is every 1 minute.
|
| 5 |
+
# scrape_timeout is set to the global default (10s).
|
| 6 |
+
|
| 7 |
+
scrape_configs:
|
| 8 |
+
# Scrape from each Ray node as defined in the service_discovery.json provided by Ray.
|
| 9 |
+
- job_name: 'ray'
|
| 10 |
+
file_sd_configs:
|
| 11 |
+
- files:
|
| 12 |
+
- '/tmp/ray/prom_metrics_service_discovery.json'
|
.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/grafana_dashboard_factory.py
ADDED
|
@@ -0,0 +1,301 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import copy
|
| 2 |
+
import json
|
| 3 |
+
import os
|
| 4 |
+
from dataclasses import asdict
|
| 5 |
+
from typing import List, Tuple
|
| 6 |
+
|
| 7 |
+
import ray
|
| 8 |
+
from ray.dashboard.modules.metrics.dashboards.common import DashboardConfig, Panel
|
| 9 |
+
from ray.dashboard.modules.metrics.dashboards.data_dashboard_panels import (
|
| 10 |
+
data_dashboard_config,
|
| 11 |
+
)
|
| 12 |
+
from ray.dashboard.modules.metrics.dashboards.default_dashboard_panels import (
|
| 13 |
+
default_dashboard_config,
|
| 14 |
+
)
|
| 15 |
+
from ray.dashboard.modules.metrics.dashboards.serve_dashboard_panels import (
|
| 16 |
+
serve_dashboard_config,
|
| 17 |
+
)
|
| 18 |
+
from ray.dashboard.modules.metrics.dashboards.serve_deployment_dashboard_panels import (
|
| 19 |
+
serve_deployment_dashboard_config,
|
| 20 |
+
)
|
| 21 |
+
|
| 22 |
+
# Env var name templates (formatted with the dashboard's config `name`) that
# let deployers override the generated dashboard uid and inject extra
# comma-separated global filters, respectively.
GRAFANA_DASHBOARD_UID_OVERRIDE_ENV_VAR_TEMPLATE = "RAY_GRAFANA_{name}_DASHBOARD_UID"
GRAFANA_DASHBOARD_GLOBAL_FILTERS_OVERRIDE_ENV_VAR_TEMPLATE = (
    "RAY_GRAFANA_{name}_DASHBOARD_GLOBAL_FILTERS"
)
|
| 26 |
+
|
| 27 |
+
# Base dict for a single Grafana query target.  Deep-copied per target by
# _generate_targets, which overwrites "expr", "legendFormat", and "refId".
TARGET_TEMPLATE = {
    "exemplar": True,
    "expr": "0",
    "interval": "",
    "legendFormat": "",
    "queryType": "randomWalk",
    "refId": "A",
}
|
| 35 |
+
|
| 36 |
+
|
| 37 |
+
# Base dict for a Grafana graph panel.  Deep-copied per panel by
# _generate_grafana_panels, which overwrites title/description/id/targets,
# gridPos, the first y-axis format, fill, stack, and linewidth.
PANEL_TEMPLATE = {
    "aliasColors": {},
    "bars": False,
    "dashLength": 10,
    "dashes": False,
    # Resolved against the dashboard's "datasource" template variable.
    "datasource": r"${datasource}",
    "description": "<Description>",
    "fieldConfig": {"defaults": {}, "overrides": []},
    "fill": 10,
    "fillGradient": 0,
    # Placeholder layout; replaced per panel (explicit grid_pos or a
    # two-panels-per-row default).
    "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0},
    "hiddenSeries": False,
    "id": 26,
    "legend": {
        "alignAsTable": True,
        "avg": False,
        "current": True,
        "hideEmpty": False,
        "hideZero": True,
        "max": False,
        "min": False,
        "rightSide": False,
        "show": True,
        "sort": "current",
        "sortDesc": True,
        "total": False,
        "values": True,
    },
    "lines": True,
    "linewidth": 1,
    "nullPointMode": "null",
    "options": {"alertThreshold": True},
    "percentage": False,
    "pluginVersion": "7.5.17",
    "pointradius": 2,
    "points": False,
    "renderer": "flot",
    # Styling overrides for specially-named series (e.g. dashed MAX lines,
    # hidden terminal-state series).
    "seriesOverrides": [
        {
            "$$hashKey": "object:2987",
            "alias": "MAX",
            "dashes": True,
            "color": "#1F60C4",
            "fill": 0,
            "stack": False,
        },
        {
            "$$hashKey": "object:78",
            "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/",
            "hiddenSeries": True,
        },
        {
            "$$hashKey": "object:2987",
            "alias": "MAX + PENDING",
            "dashes": True,
            "color": "#777777",
            "fill": 0,
            "stack": False,
        },
    ],
    "spaceLength": 10,
    "stack": True,
    "steppedLine": False,
    "targets": [],
    "thresholds": [],
    "timeFrom": None,
    "timeRegions": [],
    "timeShift": None,
    "title": "<Title>",
    "tooltip": {"shared": True, "sort": 0, "value_type": "individual"},
    "type": "graph",
    "xaxis": {
        "buckets": None,
        "mode": "time",
        "name": None,
        "show": True,
        "values": [],
    },
    "yaxes": [
        {
            "$$hashKey": "object:628",
            # "format" here is replaced with the panel's unit.
            "format": "units",
            "label": "",
            "logBase": 1,
            "max": None,
            "min": "0",
            "show": True,
        },
        {
            "$$hashKey": "object:629",
            "format": "short",
            "label": None,
            "logBase": 1,
            "max": None,
            "min": None,
            "show": True,
        },
    ],
    "yaxis": {"align": False, "alignLevel": None},
}
|
| 137 |
+
|
| 138 |
+
|
| 139 |
+
def _read_configs_for_dashboard(
    dashboard_config: DashboardConfig,
) -> Tuple[str, List[str]]:
    """Read the environment-variable overrides for one dashboard.

    Two per-dashboard env vars are consulted: one overriding the dashboard
    uid and one supplying extra comma-separated global filters.

    Returns:
        Tuple with format uid, global_filters
    """
    name = dashboard_config.name
    uid_override = os.environ.get(
        GRAFANA_DASHBOARD_UID_OVERRIDE_ENV_VAR_TEMPLATE.format(name=name)
    )
    uid = uid_override or dashboard_config.default_uid

    filters_override = os.environ.get(
        GRAFANA_DASHBOARD_GLOBAL_FILTERS_OVERRIDE_ENV_VAR_TEMPLATE.format(name=name)
    )
    # NOTE: when the override is unset or empty, "".split(",") yields [""]
    # (not []); downstream joins tolerate the empty entry.
    global_filters = (filters_override or "").split(",")

    return uid, global_filters
|
| 168 |
+
|
| 169 |
+
|
| 170 |
+
def generate_default_grafana_dashboard() -> Tuple[str, str]:
    """Render the default (core metrics) Grafana dashboard.

    Returns:
        Tuple with format content, uid
    """
    config = default_dashboard_config
    return _generate_grafana_dashboard(config)
|
| 179 |
+
|
| 180 |
+
|
| 181 |
+
def generate_serve_grafana_dashboard() -> Tuple[str, str]:
    """Render the Serve Grafana dashboard.

    Returns:
        Tuple with format content, uid
    """
    config = serve_dashboard_config
    return _generate_grafana_dashboard(config)
|
| 190 |
+
|
| 191 |
+
|
| 192 |
+
def generate_serve_deployment_grafana_dashboard() -> Tuple[str, str]:
    """
    Generates the dashboard output for the serve deployment dashboard and
    returns both the content and the uid.

    (The previous docstring said "serve dashboard" — a copy-paste error; this
    function renders the per-deployment Serve dashboard.)

    Returns:
        Tuple with format content, uid
    """
    return _generate_grafana_dashboard(serve_deployment_dashboard_config)
|
| 201 |
+
|
| 202 |
+
|
| 203 |
+
def generate_data_grafana_dashboard() -> Tuple[str, str]:
    """Render the Ray Data Grafana dashboard.

    Returns:
        Tuple with format content, uid
    """
    config = data_dashboard_config
    return _generate_grafana_dashboard(config)
|
| 212 |
+
|
| 213 |
+
|
| 214 |
+
def _generate_grafana_dashboard(
    dashboard_config: DashboardConfig,
) -> Tuple[str, str]:
    """Render one dashboard config into a Grafana dashboard JSON string.

    Loads the dashboard's base JSON template, fills in the generated panels,
    substitutes global filters into the templating variables, and tags the
    result with the generating Ray version.

    Returns:
        Tuple with format dashboard_content, uid

    Fixes over the previous revision:
      * The return annotation said ``-> str`` although the function returns a
        2-tuple (as the docstring already stated).
      * The base JSON file was opened without a context manager, leaking the
        file handle; it is now closed deterministically.
    """
    uid, global_filters = _read_configs_for_dashboard(dashboard_config)
    panels = _generate_grafana_panels(dashboard_config, global_filters)
    base_file_name = dashboard_config.base_json_file_name

    base_path = os.path.join(os.path.dirname(__file__), "dashboards", base_file_name)
    with open(base_path) as f:
        base_json = json.load(f)
    base_json["panels"] = panels

    # Update templating variables to use global_filters.
    global_filters_str = ",".join(global_filters)
    variables = base_json.get("templating", {}).get("list", [])
    for variable in variables:
        if "definition" not in variable:
            continue
        variable["definition"] = variable["definition"].format(
            global_filters=global_filters_str
        )
        variable["query"]["query"] = variable["query"]["query"].format(
            global_filters=global_filters_str
        )

    # Tag the dashboard with the Ray version that generated it.
    tags = base_json.get("tags", []) or []
    tags.append(f"rayVersion:{ray.__version__}")
    base_json["tags"] = tags
    base_json["uid"] = uid
    # Ray metadata can be used to put arbitrary metadata
    ray_meta = base_json.get("rayMeta", []) or []
    ray_meta.append("supportsGlobalFilterOverride")
    base_json["rayMeta"] = ray_meta
    return json.dumps(base_json, indent=4), uid
|
| 249 |
+
|
| 250 |
+
|
| 251 |
+
def _generate_grafana_panels(
    config: DashboardConfig, global_filters: List[str]
) -> List[dict]:
    """Build the Grafana panel dicts for every panel in *config*.

    Each panel is a deep copy of PANEL_TEMPLATE with its title, description,
    id, targets, layout, and display options filled in.
    """
    combined_filters = [*config.standard_global_filters, *global_filters]
    rendered_panels: List[dict] = []
    for index, panel in enumerate(config.panels):
        rendered = copy.deepcopy(PANEL_TEMPLATE)
        rendered["title"] = panel.title
        rendered["description"] = panel.description
        rendered["id"] = panel.id
        rendered["targets"] = _generate_targets(panel, combined_filters)
        if panel.grid_pos:
            rendered["gridPos"] = asdict(panel.grid_pos)
        else:
            # Default layout: two panels per row, 12 grid units wide each.
            rendered["gridPos"]["y"] = index // 2
            rendered["gridPos"]["x"] = 12 * (index % 2)
        rendered["yaxes"][0]["format"] = panel.unit
        rendered["fill"] = panel.fill
        rendered["stack"] = panel.stack
        rendered["linewidth"] = panel.linewidth
        rendered_panels.append(rendered)
    return rendered_panels
|
| 277 |
+
|
| 278 |
+
|
| 279 |
+
def gen_incrementing_alphabets(length):
    """Return the first *length* uppercase letters: ["A", "B", ...].

    Used to assign Grafana target ``refId`` values, which are single capital
    letters, so at most 26 targets are supported.
    """
    # The previous bound (65 + length < 96) wrongly admitted lengths up to
    # 30, which would emit "[", "\\", "]", "^" past "Z" — contradicting the
    # assertion message.  Bound the length to the actual alphabet size.
    assert length <= 26, "we only support up to 26 targets at a time."
    # 65: ascii code of 'A'.
    return [chr(65 + i) for i in range(length)]
|
| 283 |
+
|
| 284 |
+
|
| 285 |
+
def _generate_targets(panel: Panel, panel_global_filters: List[str]) -> List[dict]:
    """Build the Grafana query-target dicts for each target of *panel*.

    Each target is a deep copy of TARGET_TEMPLATE with the filter-substituted
    expression, legend, and an incrementing refId ("A", "B", ...) applied.
    """
    filters_str = ",".join(panel_global_filters)
    ref_ids = gen_incrementing_alphabets(len(panel.targets))
    rendered_targets = []
    for target, ref_id in zip(panel.targets, ref_ids):
        entry = copy.deepcopy(TARGET_TEMPLATE)
        entry["expr"] = target.expr.format(global_filters=filters_str)
        entry["legendFormat"] = target.legend
        entry["refId"] = ref_id
        rendered_targets.append(entry)
    return rendered_targets
|