diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/consts.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/consts.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..08553fdcf2345a843cd2ef1faef00dc7f462f27f Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/consts.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/dashboard.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/dashboard.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88a9f421046c4148273e3dbd1819d53864c350a1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/dashboard.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/dashboard_metrics.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/dashboard_metrics.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..990dbf72a7431a3f761ed129188bbe246ba80ca2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/dashboard_metrics.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/datacenter.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/datacenter.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..726c5fb14537897033f25598a561040c80f8206a Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/datacenter.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/http_server_agent.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/http_server_agent.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b4ce8cae3e3f9bd3771d37e0a892b7eaf308ea0d Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/http_server_agent.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/http_server_head.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/http_server_head.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4ac27114cc9cdf73277d6ec6f224a482e2ffa51e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/http_server_head.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/k8s_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/k8s_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0bcdc570f8bc3fb674f1988744a49aee187d85da Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/k8s_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/memory_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/memory_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..b24c61fb7481e3082b7647193eb386dccc763ded Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/memory_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/optional_utils.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/optional_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..59ccb300f46b6f290edfcab184b2fad3ff9bba28 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/optional_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/routes.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/routes.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d91da78ad8870f80a1191ede43aed0e5d560e2fb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/routes.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/state_api_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/state_api_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..8140898cb8d3b3ab63773fdc252691cbdc7d7e47 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/state_api_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/timezone_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/timezone_utils.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..31261812e54638a7b6bbf21f54b682850e7746fb Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/__pycache__/timezone_utils.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-100.c2aa4ab115bf9c6057cb.woff2 b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-100.c2aa4ab115bf9c6057cb.woff2 new file mode 100644 index 0000000000000000000000000000000000000000..a5cc28390adadf9ee63a9ebc965f96a385ccda91 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-100.c2aa4ab115bf9c6057cb.woff2 differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-100italic.7f839a8652da29745ce4.woff2 b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-100italic.7f839a8652da29745ce4.woff2 new file mode 100644 index 0000000000000000000000000000000000000000..327bebc54e638764cfe22c1efaaabc4de3775211 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-100italic.7f839a8652da29745ce4.woff2 differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300.37a7069dc30fc663c878.woff2 b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300.37a7069dc30fc663c878.woff2 new file mode 100644 index 0000000000000000000000000000000000000000..ef8c8836bd8ce991ece849c894b4e8a214d43ea7 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300.37a7069dc30fc663c878.woff2 differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300italic.bd5b7a13f2c52b531a2a.woff b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300italic.bd5b7a13f2c52b531a2a.woff new file mode 100644 index 
0000000000000000000000000000000000000000..57c12ee03678558a2cc696e4131aafec43dfea04 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300italic.bd5b7a13f2c52b531a2a.woff differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300italic.c64e7e354c88e613c77c.woff2 b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300italic.c64e7e354c88e613c77c.woff2 new file mode 100644 index 0000000000000000000000000000000000000000..b6653fb978bd439bbecbd81c8eb814f9c53c92de Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-300italic.c64e7e354c88e613c77c.woff2 differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-500.f5b74d7ffcdf85b9dd60.woff2 b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-500.f5b74d7ffcdf85b9dd60.woff2 new file mode 100644 index 0000000000000000000000000000000000000000..6362d7f64a4d45d901e9f1399e020a476ffa8065 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-500.f5b74d7ffcdf85b9dd60.woff2 differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-500italic.0d8bb5b3ee5f5dac9e44.woff2 b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-500italic.0d8bb5b3ee5f5dac9e44.woff2 new file mode 100644 index 0000000000000000000000000000000000000000..0ff2f813d3cc88c106f1dee4eb41d800f175dd5b Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-500italic.0d8bb5b3ee5f5dac9e44.woff2 differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-700.c18ee39fb002ad58b6dc.woff2 b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-700.c18ee39fb002ad58b6dc.woff2 new file mode 100644 index 0000000000000000000000000000000000000000..32b25eee7c5c3309ea53facaaf016256887ec0b2 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-700.c18ee39fb002ad58b6dc.woff2 differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-700italic.7d8125ff7f707231fd89.woff2 b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-700italic.7d8125ff7f707231fd89.woff2 new file mode 100644 index 0000000000000000000000000000000000000000..fe58be2f7c28aed835f1615e42b5b1a160668f8c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/client/build/static/media/roboto-latin-700italic.7d8125ff7f707231fd89.woff2 differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__init__.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..48daf9ed968a25d1c05b82de239b54d01ef747e8 Binary files /dev/null and 
b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/actor_consts.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/actor_consts.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d35b0b333ed1d22fc5b2d0cc1c2eb45fad9d5402
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/actor_consts.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/actor_head.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/actor_head.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5d171ac4048cd30172b58b5f7ed38887edb61dcd
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/__pycache__/actor_head.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/actor_consts.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/actor_consts.py
new file mode 100644
index 0000000000000000000000000000000000000000..ffb3b98d575eb13b2ed062f8d9a1a9b26f3cdd6d
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/actor_consts.py
@@ -0,0 +1,5 @@
+import ray
+
+ACTOR_CHANNEL = "ACTOR"
+NIL_NODE_ID = ray.NodeID.nil().hex()
+RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS = 1
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/actor_head.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/actor_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..4f4cdafd3701be7087d38e38ba311c3b16b1518f
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/actor/actor_head.py
@@ -0,0 +1,290 @@
+import asyncio
+import logging
+from collections import defaultdict, deque
+from concurrent.futures import ThreadPoolExecutor
+from typing import Any, Dict
+
+import aiohttp.web
+
+import ray
+import ray.dashboard.optional_utils as dashboard_optional_utils
+import ray.dashboard.utils as dashboard_utils
+from ray._private.gcs_pubsub import GcsAioActorSubscriber
+from ray._private.utils import get_or_create_event_loop
+from ray.dashboard.consts import GCS_RPC_TIMEOUT_SECONDS
+from ray.dashboard.datacenter import DataOrganizer, DataSource
+from ray.dashboard.modules.actor import actor_consts
+
+logger = logging.getLogger(__name__)
+routes = dashboard_optional_utils.DashboardHeadRouteTable
+
+MAX_DESTROYED_ACTORS_TO_CACHE = max(
+    0, ray._config.maximum_gcs_destroyed_actor_cached_count()
+)
+
+ACTOR_CLEANUP_FREQUENCY = 1  # seconds
+
+
+ACTOR_TABLE_STATE_COLUMNS = (
+    "state",
+    "address",
+    "numRestarts",
+    "timestamp",
+    "pid",
+    "exitDetail",
+    "startTime",
+    "endTime",
+    "reprName",
+)
+
+
+def actor_table_data_to_dict(message):
+    orig_message = dashboard_utils.message_to_dict(
+        message,
+        {
+            "actorId",
+            "parentId",
+            "jobId",
+            "workerId",
+            "rayletId",
+            "callerId",
+            "taskId",
+            "parentTaskId",
+            "sourceActorId",
+            "placementGroupId",
+        },
+        always_print_fields_with_no_presence=True,
+    )
+    # The complete schema for the actor table is here:
+    # src/ray/protobuf/gcs.proto
+    # It is very large, and the dashboard does not need most of it, so
+    # preserve only the necessary fields here to reduce memory usage.
+    fields = {
+        "actorId",
+        "jobId",
+        "pid",
+        "address",
+        "state",
+        "name",
+        "numRestarts",
+        "timestamp",
+        "className",
+        "startTime",
+        "endTime",
+        "reprName",
+        "placementGroupId",
+        "callSite",
+    }
+    light_message = {k: v for (k, v) in orig_message.items() if k in fields}
+    light_message["actorClass"] = orig_message["className"]
+    exit_detail = "-"
+    if "deathCause" in orig_message:
+        context = orig_message["deathCause"]
+        if "actorDiedErrorContext" in context:
+            exit_detail = context["actorDiedErrorContext"]["errorMessage"]  # noqa
+        elif "runtimeEnvFailedContext" in context:
+            exit_detail = context["runtimeEnvFailedContext"]["errorMessage"]  # noqa
+        elif "actorUnschedulableContext" in context:
+            exit_detail = context["actorUnschedulableContext"]["errorMessage"]  # noqa
+        elif "creationTaskFailureContext" in context:
+            exit_detail = context["creationTaskFailureContext"][
+                "formattedExceptionString"
+            ]  # noqa
+    light_message["exitDetail"] = exit_detail
+    light_message["startTime"] = int(light_message["startTime"])
+    light_message["endTime"] = int(light_message["endTime"])
+    light_message["requiredResources"] = dict(message.required_resources)
+
+    return light_message
+
+
+class ActorHead(dashboard_utils.DashboardHeadModule):
+    def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig):
+        super().__init__(config)
+
+        self._gcs_actor_channel_subscriber = None
+        # A queue of dead actors in the order in which they died.
+        self.destroyed_actors_queue = deque()
+
+        # -- Internal state --
+        self._loop = get_or_create_event_loop()
+        # NOTE: This executor is intentionally constrained to just 1 thread to
+        # limit its concurrency, therefore reducing potential for GIL contention
+        self._executor = ThreadPoolExecutor(
+            max_workers=1, thread_name_prefix="actor_head_executor"
+        )
+
+    async def _update_actors(self):
+        """
+        Processes actor info. First subscribes to actor updates, then gets all
+        actors from GCS. For each actor update, updates DataSource.node_actors
+        and DataSource.actors.
+        """
+
+        # To prevent a time-of-check to time-of-use issue [1], the
+        # get-all-actor-info call happens after the subscription. That way, an
+        # update arriving between the subscription and the fetch is not missed.
+        #
+        # [1] https://en.wikipedia.org/wiki/Time-of-check_to_time-of-use
+        gcs_addr = self.gcs_address
+        actor_channel_subscriber = GcsAioActorSubscriber(address=gcs_addr)
+        await actor_channel_subscriber.subscribe()
+
+        # Get all actor info.
+        while True:
+            try:
+                logger.info("Getting all actor info from GCS.")
+
+                actor_dicts = await self._get_all_actors()
+                # Update actors.
+                DataSource.actors.reset(actor_dicts)
+
+                # Update node actors and job actors.
+                node_actors = defaultdict(dict)
+                for actor_id_bytes, updated_actor_table in actor_dicts.items():
+                    node_id = updated_actor_table["address"]["rayletId"]
+                    # Update only when node_id is not Nil.
+                    if node_id != actor_consts.NIL_NODE_ID:
+                        node_actors[node_id][actor_id_bytes] = updated_actor_table
+
+                # Update each node's actor info.
+                DataSource.node_actors.reset(node_actors)
+
+                logger.info("Received %d actor info from GCS.", len(actor_dicts))
+
+                # Break once all initial actors are successfully fetched.
+                break
+            except Exception as e:
+                logger.exception("Error getting all actor info from GCS.", exc_info=e)
+                await asyncio.sleep(
+                    actor_consts.RETRY_GET_ALL_ACTOR_INFO_INTERVAL_SECONDS
+                )
+
+        # Pull incremental updates from the GCS channel.
+        while True:
+            try:
+                updated_actor_table_entries = await self._poll_updated_actor_table_data(
+                    actor_channel_subscriber
+                )
+
+                for (
+                    actor_id,
+                    updated_actor_table,
+                ) in updated_actor_table_entries.items():
+                    self._process_updated_actor_table(actor_id, updated_actor_table)
+
+                # TODO: emit metrics.
+                logger.debug(
+                    f"Total events processed: {len(updated_actor_table_entries)}, "
+                    f"queue size: {actor_channel_subscriber.queue_size}"
+                )
+
+            except Exception as e:
+                logger.exception("Error processing actor info from GCS.", exc_info=e)
+
+    async def _poll_updated_actor_table_data(
+        self, actor_channel_subscriber: GcsAioActorSubscriber
+    ) -> Dict[str, Dict[str, Any]]:
+        # TODO: make the batch size configurable.
+        batch = await actor_channel_subscriber.poll(batch_size=200)
+
+        # NOTE: We're offloading the conversion to a TPE to make sure we're not
+        # blocking the event loop for a prolonged period of time, irrespective
+        # of the batch size.
+        def _convert_to_dict():
+            return {
+                actor_id_bytes.hex(): actor_table_data_to_dict(actor_table_data_message)
+                for actor_id_bytes, actor_table_data_message in batch
+                if actor_id_bytes is not None
+            }
+
+        return await self._loop.run_in_executor(self._executor, _convert_to_dict)
+
+    def _process_updated_actor_table(
+        self, actor_id: str, actor_table_data: Dict[str, Any]
+    ):
+        """NOTE: This method has to be executed on the event loop, given that it
+        accesses DataSource data structures (to follow their thread-safety model)."""
+
+        # If the actor is not newly registered but updated, only update
+        # the state-related fields.
+        actor = DataSource.actors.get(actor_id)
+
+        if actor and actor_table_data["state"] != "DEPENDENCIES_UNREADY":
+            for k in ACTOR_TABLE_STATE_COLUMNS:
+                if k in actor_table_data:
+                    actor[k] = actor_table_data[k]
+            actor_table_data = actor
+
+        actor_id = actor_table_data["actorId"]
+        node_id = actor_table_data["address"]["rayletId"]
+
+        if actor_table_data["state"] == "DEAD":
+            self.destroyed_actors_queue.append(actor_id)
+
+        # Update actors.
+        DataSource.actors[actor_id] = actor_table_data
+        # Update node actors (only when node_id is not Nil).
+        if node_id != actor_consts.NIL_NODE_ID:
+            node_actors = DataSource.node_actors.get(node_id, {})
+            node_actors[actor_id] = actor_table_data
+            DataSource.node_actors[node_id] = node_actors
+
+    async def _get_all_actors(self) -> Dict[str, dict]:
+        actors = await self.gcs_aio_client.get_all_actor_info(
+            timeout=GCS_RPC_TIMEOUT_SECONDS
+        )
+
+        # NOTE: We're offloading the conversion to a TPE to make sure we're not
+        # blocking the event loop for a prolonged period of time on large clusters.
+        def _convert_to_dict():
+            return {
+                actor_id.hex(): actor_table_data_to_dict(actor_table_data)
+                for actor_id, actor_table_data in actors.items()
+            }
+
+        return await self._loop.run_in_executor(self._executor, _convert_to_dict)
+
+    async def _cleanup_actors(self):
+        while True:
+            try:
+                while len(self.destroyed_actors_queue) > MAX_DESTROYED_ACTORS_TO_CACHE:
+                    actor_id = self.destroyed_actors_queue.popleft()
+                    if actor_id in DataSource.actors:
+                        actor = DataSource.actors.pop(actor_id)
+                        node_id = actor["address"].get("rayletId")
+                        if node_id and node_id != actor_consts.NIL_NODE_ID:
+                            del DataSource.node_actors[node_id][actor_id]
+                await asyncio.sleep(ACTOR_CLEANUP_FREQUENCY)
+            except Exception:
+                logger.exception("Error cleaning up actor info from GCS.")
+
+    @routes.get("/logical/actors")
+    @dashboard_optional_utils.aiohttp_cache
+    async def get_all_actors(self, req) -> aiohttp.web.Response:
+        actors = await DataOrganizer.get_actor_infos()
+        return dashboard_optional_utils.rest_response(
+            success=True,
+            message="All actors fetched.",
+            actors=actors,
+            # False to avoid converting Ray resource names to Google style.
+            # It's not necessary here because the fields were already
+            # Google-formatted when the protobuf was converted into a dict.
+            convert_google_style=False,
+        )
+
+    @routes.get("/logical/actors/{actor_id}")
+    @dashboard_optional_utils.aiohttp_cache
+    async def get_actor(self, req) -> aiohttp.web.Response:
+        actor_id = req.match_info.get("actor_id")
+        actors = await DataOrganizer.get_actor_infos(actor_ids=[actor_id])
+        return dashboard_optional_utils.rest_response(
+            success=True, message="Actor details fetched.", detail=actors[actor_id]
+        )
+
+    async def run(self, server):
+        await asyncio.gather(self._update_actors(), self._cleanup_actors())
+
+    @staticmethod
+    def is_minimal_module():
+        return False
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__init__.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..feba8197389c80402a2334ce1a2f2583a5b311b5
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_agent.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_agent.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..59ad081529eeb8057475c3f709b01680039f01c1
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_agent.cpython-311.pyc differ
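The `ActorHead` module above registers two REST routes, `/logical/actors` and `/logical/actors/{actor_id}`. A minimal sketch of exercising them from a client, assuming a head node with the dashboard on the default `127.0.0.1:8265`; the `"data"`/`"actors"` envelope keys are assumptions about `rest_response()`'s output and may vary across Ray versions:

```python
# Hedged sketch: list actors via the ActorHead routes defined above.
# Assumes the default dashboard address 127.0.0.1:8265; the response
# envelope keys are an assumption, not guaranteed by this diff.
import asyncio

import aiohttp


async def dump_actor_states(base_url: str = "http://127.0.0.1:8265") -> None:
    async with aiohttp.ClientSession() as session:
        async with session.get(f"{base_url}/logical/actors") as resp:
            payload = await resp.json()
    actors = payload.get("data", {}).get("actors", {})
    for actor_id, info in actors.items():
        print(actor_id, info.get("state"), info.get("exitDetail"))


if __name__ == "__main__":
    asyncio.run(dump_actor_states())
```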
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_consts.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_consts.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6931ff280cee25cf4eaf2792bac2e99eb0814ab3
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_consts.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_head.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_head.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..37bd4311e06ffc350685047a084f450b24d83e7d
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_head.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_utils.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_utils.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9f7989fe3602a4f435ed02a257c994b406a45cf4
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/__pycache__/event_utils.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_agent.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ea08c32a7c0cfca76d8b6cf98e1fa29b5961e22
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_agent.py
@@ -0,0 +1,133 @@
+import asyncio
+import logging
+import os
+import time
+from concurrent.futures import ThreadPoolExecutor
+from typing import Union
+
+import ray._private.ray_constants as ray_constants
+import ray._private.utils as utils
+import ray.dashboard.consts as dashboard_consts
+import ray.dashboard.utils as dashboard_utils
+from ray.core.generated import event_pb2, event_pb2_grpc
+from ray.dashboard.modules.event import event_consts
+from ray.dashboard.modules.event.event_utils import monitor_events
+from ray.dashboard.utils import async_loop_forever, create_task
+
+logger = logging.getLogger(__name__)
+
+
+# NOTE: The executor in this agent is intentionally constrained to just 1
+# thread by default to limit its concurrency, therefore reducing the
+# potential for GIL contention.
+RAY_DASHBOARD_EVENT_AGENT_TPE_MAX_WORKERS = ray_constants.env_integer(
+    "RAY_DASHBOARD_EVENT_AGENT_TPE_MAX_WORKERS", 1
+)
+
+
+class EventAgent(dashboard_utils.DashboardAgentModule):
+    def __init__(self, dashboard_agent):
+        super().__init__(dashboard_agent)
+        self._event_dir = os.path.join(self._dashboard_agent.log_dir, "events")
+        os.makedirs(self._event_dir, exist_ok=True)
+        self._monitor: Union[asyncio.Task, None] = None
+        self._stub: Union[event_pb2_grpc.ReportEventServiceStub, None] = None
+        self._cached_events = asyncio.Queue(event_consts.EVENT_AGENT_CACHE_SIZE)
+        self._gcs_aio_client = dashboard_agent.gcs_aio_client
+        # Total number of events created from this agent.
+        self.total_event_reported = 0
+        # Total number of event report requests sent.
+        self.total_request_sent = 0
+        self.module_started = time.monotonic()
+
+        self._executor = ThreadPoolExecutor(
+            max_workers=RAY_DASHBOARD_EVENT_AGENT_TPE_MAX_WORKERS,
+            thread_name_prefix="event_agent_executor",
+        )
+
+        logger.info("Event agent cache buffer size: %s", self._cached_events.maxsize)
+
+    async def _connect_to_dashboard(self):
+        """Connect to the dashboard. If the dashboard is not started, this
+        method will never return.
+
+        Returns:
+            The ReportEventServiceStub object.
+        """
+        while True:
+            try:
+                dashboard_rpc_address = await self._gcs_aio_client.internal_kv_get(
+                    dashboard_consts.DASHBOARD_RPC_ADDRESS.encode(),
+                    namespace=ray_constants.KV_NAMESPACE_DASHBOARD,
+                    timeout=1,
+                )
+                dashboard_rpc_address = dashboard_rpc_address.decode()
+                if dashboard_rpc_address:
+                    logger.info("Report events to %s", dashboard_rpc_address)
+                    options = ray_constants.GLOBAL_GRPC_OPTIONS
+                    channel = utils.init_grpc_channel(
+                        dashboard_rpc_address, options=options, asynchronous=True
+                    )
+                    return event_pb2_grpc.ReportEventServiceStub(channel)
+            except Exception:
+                logger.exception("Connect to dashboard failed.")
+            await asyncio.sleep(
+                event_consts.RETRY_CONNECT_TO_DASHBOARD_INTERVAL_SECONDS
+            )
+
+    @async_loop_forever(event_consts.EVENT_AGENT_REPORT_INTERVAL_SECONDS)
+    async def report_events(self):
+        """Report events from the cached events queue. Reconnect to the
+        dashboard if a report fails; log an error after EVENT_AGENT_RETRY_TIMES retries.
+
+        This method never returns.
+        """
+        data = await self._cached_events.get()
+        self.total_event_reported += len(data)
+        for _ in range(event_consts.EVENT_AGENT_RETRY_TIMES):
+            try:
+                logger.debug("Report %s events.", len(data))
+                request = event_pb2.ReportEventsRequest(event_strings=data)
+                await self._stub.ReportEvents(request)
+                self.total_request_sent += 1
+                break
+            except Exception:
+                logger.exception("Report event failed; reconnecting to the dashboard.")
+                self._stub = await self._connect_to_dashboard()
+        else:
+            data_str = str(data)
+            limit = event_consts.LOG_ERROR_EVENT_STRING_LENGTH_LIMIT
+            logger.error(
+                "Report event failed: %s",
+                data_str[:limit] + (data_str[limit:] and "..."),
+            )
+
+    async def get_internal_states(self):
+        if self.total_event_reported <= 0 or self.total_request_sent <= 0:
+            return
+
+        elapsed = time.monotonic() - self.module_started
+        return {
+            "total_events_reported": self.total_event_reported,
+            "total_requests_sent": self.total_request_sent,
+            "queue_size": self._cached_events.qsize(),
+            "total_uptime": elapsed,
+        }
+
+    async def run(self, server):
+        # Connect to the dashboard.
+        self._stub = await self._connect_to_dashboard()
+        # Start the monitor task.
+        self._monitor = monitor_events(
+            self._event_dir,
+            lambda data: create_task(self._cached_events.put(data)),
+            self._executor,
+        )
+
+        await asyncio.gather(
+            self.report_events(),
+        )
+
+    @staticmethod
+    def is_minimal_module():
+        return False
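The `for ... else` in `report_events` above is the part that is easiest to misread: the `else` clause runs only when the loop finishes without hitting `break`, i.e. only when every retry failed. A standalone sketch of the same idiom; the `send` callable and default retry count here are illustrative, not part of this diff:

```python
# Illustrative sketch of the for/else retry pattern used by report_events:
# the else-branch executes only if no iteration reached `break`.
def send_with_retries(send, payload, retries: int = 3) -> bool:
    for _ in range(retries):
        try:
            send(payload)
            break  # success; the else-branch below is skipped
        except Exception:
            continue  # event_agent also reconnects its gRPC stub here
    else:
        # Reached only when every attempt raised, mirroring the truncated
        # error log in report_events.
        print("report failed after retries:", str(payload)[:100])
        return False
    return True
```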
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_consts.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_consts.py
new file mode 100644
index 0000000000000000000000000000000000000000..090a3ce6006ad75623f1c6fc7ee5157e6575f262
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_consts.py
@@ -0,0 +1,21 @@
+from ray._private.ray_constants import env_float, env_integer
+from ray.core.generated import event_pb2
+
+LOG_ERROR_EVENT_STRING_LENGTH_LIMIT = 1000
+RETRY_CONNECT_TO_DASHBOARD_INTERVAL_SECONDS = 2
+# Monitor events
+SCAN_EVENT_DIR_INTERVAL_SECONDS = env_integer("SCAN_EVENT_DIR_INTERVAL_SECONDS", 2)
+SCAN_EVENT_START_OFFSET_SECONDS = -30 * 60
+CONCURRENT_READ_LIMIT = 50
+EVENT_READ_LINE_COUNT_LIMIT = 200
+EVENT_READ_LINE_LENGTH_LIMIT = env_integer(
+    "EVENT_READ_LINE_LENGTH_LIMIT", 2 * 1024 * 1024
+)  # 2 MB
+# Report events
+EVENT_AGENT_REPORT_INTERVAL_SECONDS = env_float(
+    "EVENT_AGENT_REPORT_INTERVAL_SECONDS", 0.1
+)
+EVENT_AGENT_RETRY_TIMES = 10
+EVENT_AGENT_CACHE_SIZE = 10240
+# Event sources
+EVENT_SOURCE_ALL = event_pb2.Event.SourceType.keys()
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_head.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2ecdaf6f0932ae7bd6f7ee9803159af41ab4ebf
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/event/event_head.py
@@ -0,0 +1,212 @@
+import asyncio
+import logging
+import os
+import time
+from collections import OrderedDict, defaultdict
+from concurrent.futures import ThreadPoolExecutor
+from datetime import datetime
+from itertools import islice
+from typing import Dict, Union
+
+import aiohttp.web
+
+import ray.dashboard.optional_utils as dashboard_optional_utils
+import ray.dashboard.utils as dashboard_utils
+from ray._private.ray_constants import env_integer
+from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag
+from ray._private.utils import get_or_create_event_loop
+from ray.core.generated import event_pb2, event_pb2_grpc
+from ray.dashboard.consts import (
+    RAY_STATE_SERVER_MAX_HTTP_REQUEST,
+    RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED,
+    RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME,
+)
+from ray.dashboard.modules.event.event_utils import monitor_events, parse_event_strings
+from ray.dashboard.state_api_utils import do_filter, handle_list_api
+from ray.util.state.common import ClusterEventState, ListApiOptions, ListApiResponse
+
+logger = logging.getLogger(__name__)
+routes = dashboard_optional_utils.DashboardHeadRouteTable
+
+JobEvents = OrderedDict
+dashboard_utils._json_compatible_types.add(JobEvents)
+
+MAX_EVENTS_TO_CACHE = int(os.environ.get("RAY_DASHBOARD_MAX_EVENTS_TO_CACHE", 10000))
+
+# NOTE: The executor in this head is intentionally constrained to just 1
+# thread by default to limit its concurrency, therefore reducing the
+# potential for GIL contention.
+RAY_DASHBOARD_EVENT_HEAD_TPE_MAX_WORKERS = env_integer(
+    "RAY_DASHBOARD_EVENT_HEAD_TPE_MAX_WORKERS", 1
+)
+
+
+async def _list_cluster_events_impl(
+    *, all_events, executor: ThreadPoolExecutor, option: ListApiOptions
+) -> ListApiResponse:
+    """
+    List all cluster events. This is a free function so that it can be unit tested.
+
+    Returns:
+        A list of cluster events in the cluster.
+        The schema of the returned "dict" is equivalent to the
+        `ClusterEventState` protobuf message.
+    """
+
+    def transform(all_events) -> ListApiResponse:
+        result = []
+        for _, events in all_events.items():
+            for _, event in events.items():
+                event["time"] = str(datetime.fromtimestamp(int(event["timestamp"])))
+                result.append(event)
+
+        num_after_truncation = len(result)
+        result.sort(key=lambda entry: entry["timestamp"])
+        total = len(result)
+        result = do_filter(result, option.filters, ClusterEventState, option.detail)
+        num_filtered = len(result)
+        # Truncate the already-sorted output to the requested limit.
+        result = list(islice(result, option.limit))
+        return ListApiResponse(
+            result=result,
+            total=total,
+            num_after_truncation=num_after_truncation,
+            num_filtered=num_filtered,
+        )
+
+    return await get_or_create_event_loop().run_in_executor(
+        executor, transform, all_events
+    )
+
+
+class EventHead(
+    dashboard_utils.DashboardHeadModule,
+    dashboard_utils.RateLimitedModule,
+    event_pb2_grpc.ReportEventServiceServicer,
+):
+    def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig):
+        dashboard_utils.DashboardHeadModule.__init__(self, config)
+        dashboard_utils.RateLimitedModule.__init__(
+            self,
+            min(
+                RAY_STATE_SERVER_MAX_HTTP_REQUEST,
+                RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED,
+            ),
+        )
+        self._event_dir = os.path.join(self.log_dir, "events")
+        os.makedirs(self._event_dir, exist_ok=True)
+        self._monitor: Union[asyncio.Task, None] = None
+        self.total_report_events_count = 0
+        self.total_events_received = 0
+        self.module_started = time.monotonic()
+        # {job_id hex(str): {event_id (str): event (dict)}}
+        self.events: Dict[str, JobEvents] = defaultdict(JobEvents)
+
+        self._executor = ThreadPoolExecutor(
+            max_workers=RAY_DASHBOARD_EVENT_HEAD_TPE_MAX_WORKERS,
+            thread_name_prefix="event_head_executor",
+        )
+
+    async def limit_handler_(self):
+        return dashboard_optional_utils.rest_response(
+            success=False,
+            error_message=(
+                "Max number of in-progress requests="
+                f"{self.max_num_call_} reached. "
+                "To set a higher limit, set environment variable: "
+                f"export {RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME}='xxx'. "
+                f"Max allowed = {RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED}"
+            ),
+            result=None,
+        )
+
+    def _update_events(self, event_list):
+        # {job_id: {event_id: event}}
+        all_job_events = defaultdict(JobEvents)
+        for event in event_list:
+            event_id = event["event_id"]
+            custom_fields = event.get("custom_fields")
+            system_event = False
+            if custom_fields:
+                job_id = custom_fields.get("job_id", "global") or "global"
+            else:
+                job_id = "global"
+            if system_event is False:
+                all_job_events[job_id][event_id] = event
+
+        for job_id, new_job_events in all_job_events.items():
+            job_events = self.events[job_id]
+            job_events.update(new_job_events)
+
+            # Limit the number of events cached if it exceeds the threshold.
+            if len(job_events) > MAX_EVENTS_TO_CACHE * 1.1:
+                while len(job_events) > MAX_EVENTS_TO_CACHE:
+                    job_events.popitem(last=False)
+
+    async def ReportEvents(self, request, context):
+        received_events = []
+        if request.event_strings:
+            received_events.extend(parse_event_strings(request.event_strings))
+        logger.debug("Received %d events", len(received_events))
+        self._update_events(received_events)
+        self.total_report_events_count += 1
+        self.total_events_received += len(received_events)
+        return event_pb2.ReportEventsReply(send_success=True)
+
+    async def _periodic_state_print(self):
+        if self.total_events_received <= 0 or self.total_report_events_count <= 0:
+            return
+
+        elapsed = time.monotonic() - self.module_started
+        return {
+            "total_events_received": self.total_events_received,
+            "total_requests_received": self.total_report_events_count,
+            "total_uptime": elapsed,
+        }
+
+    @routes.get("/events")
+    @dashboard_optional_utils.aiohttp_cache
+    async def get_event(self, req) -> aiohttp.web.Response:
+        job_id = req.query.get("job_id")
+        if job_id is None:
+            all_events = {
+                job_id: list(job_events.values())
+                for job_id, job_events in self.events.items()
+            }
+            return dashboard_optional_utils.rest_response(
+                success=True, message="All events fetched.", events=all_events
+            )
+
+        job_events = self.events[job_id]
+        return dashboard_optional_utils.rest_response(
+            success=True,
+            message="Job events fetched.",
+            job_id=job_id,
+            events=list(job_events.values()),
+        )
+
+    @routes.get("/api/v0/cluster_events")
+    @dashboard_utils.RateLimitedModule.enforce_max_concurrent_calls
+    async def list_cluster_events(
+        self, req: aiohttp.web.Request
+    ) -> aiohttp.web.Response:
+        record_extra_usage_tag(TagKey.CORE_STATE_API_LIST_CLUSTER_EVENTS, "1")
+
+        async def list_api_fn(option: ListApiOptions):
+            return await _list_cluster_events_impl(
+                all_events=self.events, executor=self._executor, option=option
+            )
+
+        return await handle_list_api(list_api_fn, req)
+
+    async def run(self, server):
+        event_pb2_grpc.add_ReportEventServiceServicer_to_server(self, server)
+        self._monitor = monitor_events(
+            self._event_dir,
+            lambda data: self._update_events(parse_event_strings(data)),
+            self._executor,
+        )
+
+    @staticmethod
+    def is_minimal_module():
+        return False
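Because `_list_cluster_events_impl` above is a free function, it can be unit tested without constructing an `EventHead`. A sketch under assumptions: the exact `ListApiOptions` constructor arguments vary across Ray versions, so treat the option fields below as illustrative:

```python
# Hedged unit-test sketch for _list_cluster_events_impl defined above.
import asyncio
from concurrent.futures import ThreadPoolExecutor

from ray.dashboard.modules.event.event_head import _list_cluster_events_impl
from ray.util.state.common import ListApiOptions

# Shape mirrors EventHead.events: {job_id: {event_id: event_dict}}.
ALL_EVENTS = {
    "global": {
        "e1": {"event_id": "e1", "timestamp": 1700000000, "severity": "INFO"},
        "e2": {"event_id": "e2", "timestamp": 1700000060, "severity": "ERROR"},
    }
}


async def main() -> None:
    resp = await _list_cluster_events_impl(
        all_events=ALL_EVENTS,
        executor=ThreadPoolExecutor(max_workers=1),
        option=ListApiOptions(limit=10, timeout=30, filters=[]),  # illustrative args
    )
    print(resp.total, [e["event_id"] for e in resp.result])


if __name__ == "__main__":
    asyncio.run(main())
```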
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__init__.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c47a1dbba637f837ea784ff41e62353caf6a0214
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/__init__.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/grafana_dashboard_factory.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/grafana_dashboard_factory.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8db113d3b38ef2afbd74628acd756fcf687ae5f3
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/grafana_dashboard_factory.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/metrics_head.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/metrics_head.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b1967cc97a89c4b8c26d53f16e9b15476c87bcfd
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/metrics_head.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/templates.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/templates.cpython-311.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5bccdd46d13ce694caa0ba5e56afacd1fcf4b967
Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/__pycache__/templates.cpython-311.pyc differ
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/common.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/common.py
new file mode 100644
index 0000000000000000000000000000000000000000..c8005d871078aa2f98028fa842edc7c338cc18a6
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/common.py
@@ -0,0 +1,70 @@
+from dataclasses import dataclass
+from typing import List, Optional
+
+
+@dataclass
+class GridPos:
+    x: int
+    y: int
+    w: int
+    h: int
+
+
+@dataclass
+class Target:
+    """Defines a Grafana target (time-series query) within a panel.
+
+    A panel will have one or more targets. By default, all targets are rendered as
+    stacked area charts, with the exception of legend="MAX", which is rendered as
+    a blue dotted line. Any legend="FINISHED|FAILED|DEAD|REMOVED" series will also be
+    rendered hidden by default.
+
+    Attributes:
+        expr: The Prometheus query to evaluate.
+        legend: The legend string to format for each time-series.
+    """
+
+    expr: str
+    legend: str
+
+
+@dataclass
+class Panel:
+    """Defines a Grafana panel (graph) for the Ray dashboard page.
+
+    A panel contains one or more targets (time-series queries).
+
+    Attributes:
+        title: Short name of the graph. Note: please keep this in sync with the title
+            definitions in Metrics.tsx.
+        description: Long-form description of the graph.
+        id: Integer id used to reference the graph from Metrics.tsx.
+        unit: The unit to display on the y-axis of the graph.
+        targets: List of query targets.
+        fill: Whether or not the graph will be filled by a color.
+        stack: Whether or not the lines in the graph will be stacked.
+    """
+
+    title: str
+    description: str
+    id: int
+    unit: str
+    targets: List[Target]
+    fill: int = 10
+    stack: bool = True
+    linewidth: int = 1
+    grid_pos: Optional[GridPos] = None
+
+
+@dataclass
+class DashboardConfig:
+    # This dashboard name is an internal key used to determine which env vars
+    # to check for customization.
+    name: str
+    # The uid of the dashboard json if not overridden by a user.
+    default_uid: str
+    panels: List[Panel]
+    # The global filters applied to all graphs in this dashboard. Users can
+    # add additional global_filters on top of this.
+    standard_global_filters: List[str]
+    base_json_file_name: str
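The dashboard panel files that follow are built from the `Panel`/`Target` dataclasses defined in common.py above. A tiny standalone construction example; the id, metric name, and values here are illustrative, not part of the shipped dashboards:

```python
# Illustrative construction of the dataclasses defined in common.py above.
from ray.dashboard.modules.metrics.dashboards.common import GridPos, Panel, Target

example_panel = Panel(
    id=999,  # hypothetical id; real panels must each use a unique id
    title="Example Metric",
    description="Sketch only; not one of the shipped dashboard panels.",
    unit="bytes",
    targets=[
        Target(
            # {global_filters} is substituted later, hence the tripled braces.
            expr="sum(ray_example_metric{{{global_filters}}}) by (instance)",
            legend="Example: {{instance}}",
        )
    ],
    fill=0,
    stack=False,
    grid_pos=GridPos(x=0, y=0, w=12, h=8),
)
```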
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/data_dashboard_panels.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/data_dashboard_panels.py
new file mode 100644
index 0000000000000000000000000000000000000000..f85604c659e1102245ab79a6ed95c9b53f124f9c
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/data_dashboard_panels.py
@@ -0,0 +1,551 @@
+# ruff: noqa: E501
+
+from ray.dashboard.modules.metrics.dashboards.common import (
+    DashboardConfig,
+    Panel,
+    Target,
+)
+
+# When adding a new panel for an OpRuntimeMetric, follow this format:
+# Panel(
+#     title=title,
+#     description=metric.metadata.get("description"),
+#     id=panel_id,
+#     unit=unit,
+#     targets=[
+#         Target(
+#             expr=f"sum(ray_data_{metric.name}"
+#             + "{{{global_filters}}}) by (dataset, operator)",
+#             legend=legend,
+#         )
+#     ],
+#     fill=fill,
+#     stack=stack,
+# )
+
+
+DATA_GRAFANA_PANELS = [
+    # Ray Data Metrics (Overview)
+    Panel(
+        id=1,
+        title="Bytes Spilled",
+        description="Amount spilled by dataset operators. DataContext.enable_get_object_locations_for_metrics must be set to True to report this metric.",
+        unit="bytes",
+        targets=[
+            Target(
+                expr="sum(ray_data_spilled_bytes{{{global_filters}}}) by (dataset, operator)",
+                legend="Bytes Spilled: {{dataset}}, {{operator}}",
+            )
+        ],
+        fill=0,
+        stack=False,
+    ),
+    Panel(
+        id=2,
+        title="Bytes Allocated",
+        description="Amount allocated by dataset operators.",
+        unit="bytes",
+        targets=[
+            Target(
+                expr="sum(ray_data_allocated_bytes{{{global_filters}}}) by (dataset, operator)",
+                legend="Bytes Allocated: {{dataset}}, {{operator}}",
+            )
+        ],
+        fill=0,
+        stack=False,
+    ),
+    Panel(
+        id=3,
+        title="Bytes Freed",
+        description="Amount freed by dataset operators.",
+        unit="bytes",
+        targets=[
+            Target(
+                expr="sum(ray_data_freed_bytes{{{global_filters}}}) by (dataset, operator)",
+                legend="Bytes Freed: {{dataset}}, {{operator}}",
+            )
+        ],
+        fill=0,
+        stack=False,
+    ),
+    Panel(
+        id=4,
+        title="Object Store Memory",
+        description="Amount of object store memory used by dataset operators.",
+        unit="bytes",
+        targets=[
+            Target(
+                expr="sum(ray_data_current_bytes{{{global_filters}}}) by (dataset, operator)",
+                legend="Current Usage: {{dataset}}, {{operator}}",
+            )
+        ],
+        fill=0,
+        stack=False,
+    ),
+    Panel(
+        id=5,
+        title="CPUs (logical slots)",
+        description="Logical CPUs allocated to dataset operators.",
+        unit="cores",
+        targets=[
+            Target(
+                expr="sum(ray_data_cpu_usage_cores{{{global_filters}}}) by (dataset, operator)",
+                legend="CPU Usage: {{dataset}}, {{operator}}",
+            )
+        ],
+        fill=0,
+        stack=False,
+    ),
+    Panel(
+        id=6,
+        title="GPUs (logical slots)",
+        description="Logical GPUs allocated to dataset operators.",
+        unit="cores",
+        targets=[
+            Target(
+                expr="sum(ray_data_gpu_usage_cores{{{global_filters}}}) by (dataset, operator)",
+                legend="GPU Usage: {{dataset}}, {{operator}}",
+            )
+        ],
+        fill=0,
+        stack=False,
+    ),
+    Panel(
+        id=7,
+        title="Bytes Output / Second",
+        description="Bytes output per second by dataset operators.",
+        unit="Bps",
+        targets=[
+            Target(
+                expr="sum(rate(ray_data_output_bytes{{{global_filters}}}[1m])) by (dataset, operator)",
+                legend="Bytes Output / Second: {{dataset}}, {{operator}}",
+            )
+        ],
+        fill=0,
+        stack=False,
+    ),
+    Panel(
+        id=11,
+        title="Rows Output / Second",
+        description="Total rows output per second by dataset operators.",
+        unit="rows/sec",
+        targets=[
Target( + expr="sum(rate(ray_data_output_rows{{{global_filters}}}[1m])) by (dataset, operator)", + legend="Rows Output / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + # Ray Data Metrics (Inputs) + Panel( + id=17, + title="Input Blocks Received by Operator / Second", + description="Number of input blocks received by operator per second.", + unit="blocks/sec", + targets=[ + Target( + expr="sum(rate(ray_data_num_inputs_received{{{global_filters}}}[1m])) by (dataset, operator)", + legend="Blocks Received / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=18, + title="Input Bytes Received by Operator / Second", + description="Byte size of input blocks received by operator per second.", + unit="Bps", + targets=[ + Target( + expr="sum(rate(ray_data_bytes_inputs_received{{{global_filters}}}[1m])) by (dataset, operator)", + legend="Bytes Received / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=19, + title="Input Blocks Processed by Tasks / Second", + description=( + "Number of input blocks that operator's tasks have finished processing per second." + ), + unit="blocks/sec", + targets=[ + Target( + expr="sum(rate(ray_data_num_task_inputs_processed{{{global_filters}}}[1m])) by (dataset, operator)", + legend="Blocks Processed / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=20, + title="Input Bytes Processed by Tasks / Second", + description=( + "Byte size of input blocks that operator's tasks have finished processing per second." + ), + unit="Bps", + targets=[ + Target( + expr="sum(rate(ray_data_bytes_task_inputs_processed{{{global_filters}}}[1m])) by (dataset, operator)", + legend="Bytes Processed / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=21, + title="Input Bytes Submitted to Tasks / Second", + description="Byte size of input blocks passed to submitted tasks per second.", + unit="Bps", + targets=[ + Target( + expr="sum(rate(ray_data_bytes_inputs_of_submitted_tasks{{{global_filters}}}[1m])) by (dataset, operator)", + legend="Bytes Submitted / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=22, + title="Blocks Generated by Tasks / Second", + description="Number of output blocks generated by tasks per second.", + unit="blocks/sec", + targets=[ + Target( + expr="sum(rate(ray_data_num_task_outputs_generated{{{global_filters}}}[1m])) by (dataset, operator)", + legend="Blocks Generated / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=23, + title="Bytes Generated by Tasks / Second", + description="Byte size of output blocks generated by tasks per second.", + unit="Bps", + targets=[ + Target( + expr="sum(rate(ray_data_bytes_task_outputs_generated{{{global_filters}}}[1m])) by (dataset, operator)", + legend="Bytes Generated / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=24, + title="Rows Generated by Tasks / Second", + description="Number of rows in generated output blocks from finished tasks per second.", + unit="rows/sec", + targets=[ + Target( + expr="sum(rate(ray_data_rows_task_outputs_generated{{{global_filters}}}[1m])) by (dataset, operator)", + legend="Rows Generated / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=25, + title="Output Blocks Taken by Downstream Operators / Second", + description="Number of output blocks taken by 
downstream operators per second.", + unit="blocks/sec", + targets=[ + Target( + expr="sum(rate(ray_data_num_outputs_taken{{{global_filters}}}[1m])) by (dataset, operator)", + legend="Blocks Taken / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=26, + title="Output Bytes Taken by Downstream Operators / Second", + description=( + "Byte size of output blocks taken by downstream operators per second." + ), + unit="Bps", + targets=[ + Target( + expr="sum(rate(ray_data_bytes_outputs_taken{{{global_filters}}}[1m])) by (dataset, operator)", + legend="Bytes Taken / Second: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + # Ray Data Metrics (Tasks) + Panel( + id=29, + title="Submitted Tasks", + description="Number of submitted tasks.", + unit="tasks", + targets=[ + Target( + expr="sum(ray_data_num_tasks_submitted{{{global_filters}}}) by (dataset, operator)", + legend="Submitted Tasks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=30, + title="Running Tasks", + description="Number of running tasks.", + unit="tasks", + targets=[ + Target( + expr="sum(ray_data_num_tasks_running{{{global_filters}}}) by (dataset, operator)", + legend="Running Tasks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=31, + title="Tasks with output blocks", + description="Number of tasks that already have output.", + unit="tasks", + targets=[ + Target( + expr="sum(ray_data_num_tasks_have_outputs{{{global_filters}}}) by (dataset, operator)", + legend="Tasks with output blocks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=32, + title="Finished Tasks", + description="Number of finished tasks.", + unit="tasks", + targets=[ + Target( + expr="sum(ray_data_num_tasks_finished{{{global_filters}}}) by (dataset, operator)", + legend="Finished Tasks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=33, + title="Failed Tasks", + description="Number of failed tasks.", + unit="tasks", + targets=[ + Target( + expr="sum(ray_data_num_tasks_failed{{{global_filters}}}) by (dataset, operator)", + legend="Failed Tasks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=8, + title="Block Generation Time", + description="Time spent generating blocks in tasks.", + unit="seconds", + targets=[ + Target( + expr="sum(ray_data_block_generation_time{{{global_filters}}}) by (dataset, operator)", + legend="Block Generation Time: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=37, + title="Task Submission Backpressure Time", + description="Time spent in task submission backpressure.", + unit="seconds", + targets=[ + Target( + expr="sum(ray_data_task_submission_backpressure_time{{{global_filters}}}) by (dataset, operator)", + legend="Backpressure Time: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=True, + ), + # Ray Data Metrics (Object Store Memory) + Panel( + id=13, + title="Operator Internal Inqueue Size (Blocks)", + description="Number of blocks in operator's internal input queue", + unit="blocks", + targets=[ + Target( + expr="sum(ray_data_obj_store_mem_internal_inqueue_blocks{{{global_filters}}}) by (dataset, operator)", + legend="Number of Blocks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=14, + title="Operator Internal Inqueue Size (Bytes)", + description="Byte size of input blocks in the operator's internal input queue.", + unit="bytes", + targets=[ 
+ Target( + expr="sum(ray_data_obj_store_mem_internal_inqueue{{{global_filters}}}) by (dataset, operator)", + legend="Bytes Size: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=True, + ), + Panel( + id=15, + title="Operator Internal Outqueue Size (Blocks)", + description="Number of blocks in operator's internal output queue", + unit="blocks", + targets=[ + Target( + expr="sum(ray_data_obj_store_mem_internal_outqueue_blocks{{{global_filters}}}) by (dataset, operator)", + legend="Number of Blocks: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=16, + title="Operator Internal Outqueue Size (Bytes)", + description=( + "Byte size of output blocks in the operator's internal output queue." + ), + unit="bytes", + targets=[ + Target( + expr="sum(ray_data_obj_store_mem_internal_outqueue{{{global_filters}}}) by (dataset, operator)", + legend="Bytes Size: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=True, + ), + Panel( + id=34, + title="Size of Blocks used in Pending Tasks (Bytes)", + description="Byte size of input blocks used by pending tasks.", + unit="bytes", + targets=[ + Target( + expr="sum(ray_data_obj_store_mem_pending_task_inputs{{{global_filters}}}) by (dataset, operator)", + legend="Bytes Size: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=True, + ), + Panel( + id=35, + title="Freed Memory in Object Store (Bytes)", + description="Byte size of freed memory in object store.", + unit="bytes", + targets=[ + Target( + expr="sum(ray_data_obj_store_mem_freed{{{global_filters}}}) by (dataset, operator)", + legend="Bytes Size: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=True, + ), + Panel( + id=36, + title="Spilled Memory in Object Store (Bytes)", + description="Byte size of spilled memory in object store.", + unit="bytes", + targets=[ + Target( + expr="sum(ray_data_obj_store_mem_spilled{{{global_filters}}}) by (dataset, operator)", + legend="Bytes Size: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=True, + ), + # Ray Data Metrics (Iteration) + Panel( + id=12, + title="Iteration Initialization Time", + description="Seconds spent in iterator initialization code", + unit="seconds", + targets=[ + Target( + expr="sum(ray_data_iter_initialize_seconds{{{global_filters}}}) by (dataset)", + legend="Seconds: {{dataset}}, {{operator}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=9, + title="Iteration Blocked Time", + description="Seconds user thread is blocked by iter_batches()", + unit="seconds", + targets=[ + Target( + expr="sum(ray_data_iter_total_blocked_seconds{{{global_filters}}}) by (dataset)", + legend="Seconds: {{dataset}}", + ) + ], + fill=0, + stack=False, + ), + Panel( + id=10, + title="Iteration User Time", + description="Seconds spent in user code", + unit="seconds", + targets=[ + Target( + expr="sum(ray_data_iter_user_seconds{{{global_filters}}}) by (dataset)", + legend="Seconds: {{dataset}}", + ) + ], + fill=0, + stack=False, + ), + # Ray Data Metrics (Miscellaneous) +] + +ids = [] +for panel in DATA_GRAFANA_PANELS: + ids.append(panel.id) +assert len(ids) == len( + set(ids) +), f"Duplicated id found. Use unique id for each panel. 
{ids}" + +data_dashboard_config = DashboardConfig( + name="DATA", + default_uid="rayDataDashboard", + panels=DATA_GRAFANA_PANELS, + standard_global_filters=[ + 'dataset=~"$DatasetID"', + 'SessionName=~"$SessionName"', + 'ray_io_cluster=~"$Cluster"', + ], + base_json_file_name="data_grafana_dashboard_base.json", +) diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py new file mode 100644 index 0000000000000000000000000000000000000000..c39c60ec47196ba71a54ce658c896892f5543aa8 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/default_dashboard_panels.py @@ -0,0 +1,478 @@ +# ruff: noqa: E501 + +from ray.dashboard.modules.metrics.dashboards.common import ( + DashboardConfig, + Panel, + Target, +) + +""" +Queries for autoscaler resources. +""" +# Note: MAX & USED resources are reported from raylet to provide the most up to date information. +# But MAX + PENDING data is coming from the autoscaler. That said, MAX + PENDING can be +# more outdated. it is harmless because the actual MAX will catch up with MAX + PENDING +# eventually. +MAX_CPUS = 'sum(autoscaler_cluster_resources{{resource="CPU",{global_filters}}})' +PENDING_CPUS = 'sum(autoscaler_pending_resources{{resource="CPU",{global_filters}}})' +MAX_GPUS = 'sum(autoscaler_cluster_resources{{resource="GPU",{global_filters}}})' +PENDING_GPUS = 'sum(autoscaler_pending_resources{{resource="GPU",{global_filters}}})' + + +def max_plus_pending(max_resource, pending_resource): + return f"({max_resource} or vector(0)) + ({pending_resource} or vector(0))" + + +MAX_PLUS_PENDING_CPUS = max_plus_pending(MAX_CPUS, PENDING_CPUS) +MAX_PLUS_PENDING_GPUS = max_plus_pending(MAX_GPUS, PENDING_GPUS) + + +# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# IMPORTANT: Please keep this in sync with Metrics.tsx and ray-metrics.rst +# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +DEFAULT_GRAFANA_PANELS = [ + Panel( + id=26, + title="Scheduler Task State", + description="Current number of tasks in a particular state.\n\nState: the task state, as described by rpc::TaskState proto in common.proto. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", + unit="tasks", + targets=[ + Target( + expr='sum(max_over_time(ray_tasks{{IsRetry="0",State=~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (State), 0)', + legend="{{State}}", + ), + Target( + expr='sum(max_over_time(ray_tasks{{IsRetry!="0",State=~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry!="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (State), 0)', + legend="{{State}} (retry)", + ), + ], + fill=0, + stack=False, + ), + Panel( + id=35, + title="Requested Live Tasks by Name", + description="Current number of (live) tasks with a particular name. 
+ + +# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +# IMPORTANT: Please keep this in sync with Metrics.tsx and ray-metrics.rst +# !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!! +DEFAULT_GRAFANA_PANELS = [ + Panel( + id=26, + title="Scheduler Task State", + description="Current number of tasks in a particular state.\n\nState: the task state, as described by rpc::TaskState proto in common.proto. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", + unit="tasks", + targets=[ + Target( + expr='sum(max_over_time(ray_tasks{{IsRetry="0",State=~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (State), 0)', + legend="{{State}}", + ), + Target( + expr='sum(max_over_time(ray_tasks{{IsRetry!="0",State=~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}[14d])) by (State) or clamp_min(sum(ray_tasks{{IsRetry!="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (State), 0)', + legend="{{State}} (retry)", + ), + ], + fill=0, + stack=False, + ), + Panel( + id=35, + title="Requested Live Tasks by Name", + description="Current number of (live) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", + unit="tasks", + targets=[ + Target( + expr='clamp_min(sum(ray_tasks{{IsRetry="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (Name), 0)', + legend="{{Name}}", + ), + Target( + expr='clamp_min(sum(ray_tasks{{IsRetry!="0",State!~"FINISHED|FAILED",instance=~"$Instance",{global_filters}}}) by (Name), 0)', + legend="{{Name}} (retry)", + ), + ], + fill=0, + stack=False, + ), + Panel( + id=38, + title="Running Tasks by Name", + description="Current number of (running) tasks with a particular name. Task resubmissions due to failures or object reconstruction are shown with (retry) in the label.", + unit="tasks", + targets=[ + Target( + expr='clamp_min(sum(ray_tasks{{IsRetry="0",State=~"RUNNING.*",instance=~"$Instance",{global_filters}}}) by (Name), 0)', + legend="{{Name}}", + ), + Target( + expr='clamp_min(sum(ray_tasks{{IsRetry!="0",State=~"RUNNING.*",instance=~"$Instance",{global_filters}}}) by (Name), 0)', + legend="{{Name}} (retry)", + ), + ], + fill=0, + stack=False, + ), + Panel( + id=33, + title="Scheduler Actor State", + description='Note: not impacted by "Instance" variable.\n\nCurrent number of actors in a particular state.\n\nState: the actor state, as described by rpc::ActorTableData proto in gcs.proto.', + unit="actors", + targets=[ + Target( + expr='sum(ray_actors{{Source="gcs",{global_filters}}}) by (State)', + legend="{{State}}", + ) + ], + ), + Panel( + id=42, + title="Live Actor State", + description="Current number of alive actors in a particular state.\n\nState: IDLE, RUNNING_TASK, RUNNING_IN_RAY_GET, RUNNING_IN_RAY_WAIT", + unit="actors", + targets=[ + Target( + expr='sum(ray_actors{{Source="executor",NodeAddress=~"$Instance",{global_filters}}}) by (State)', + legend="{{State}}", + ) + ], + ), + Panel( + id=36, + title="Live Actors by Name", + description="Current number of alive actors with a particular name.", + unit="actors", + targets=[ + Target( + expr='sum(ray_actors{{State!="DEAD",Source="executor",NodeAddress=~"$Instance",{global_filters}}}) by (Name)', + legend="{{Name}}", + ) + ], + ), + Panel( + id=27, + title="Scheduler CPUs (logical slots)", + description="Logical CPU usage of Ray. The dotted line indicates the total number of CPUs. The logical CPU is allocated by `num_cpus` arguments from tasks and actors. PENDING means the number of CPUs that will be available when new nodes are up after the autoscaler scales up.\n\nNOTE: Ray's logical CPU allocation is different from the node's physical CPU usage.", + unit="cores", + targets=[ + Target( + expr='sum(ray_resources{{Name="CPU",State="USED",instance=~"$Instance",{global_filters}}}) by (instance)', + legend="CPU Usage: {{instance}}", + ), + Target( + expr='sum(ray_resources{{Name="CPU",instance=~"$Instance",{global_filters}}})', + legend="MAX", + ), + # If max + pending > max, we display this value. + # In PromQL, (A and predicate) returns A when the predicate is satisfied. + Target( + expr=f"({MAX_PLUS_PENDING_CPUS} and {MAX_PLUS_PENDING_CPUS} > ({MAX_CPUS} or vector(0)))", + legend="MAX + PENDING", + ), + ], + ), + Panel( + id=29, + title="Object Store Memory", + description="Object store memory usage by location. 
The dotted line indicates the object store memory capacity.\n\nLocation: where the memory was allocated, which is MMAP_SHM or MMAP_DISK to indicate a memory-mapped page, SPILLED to indicate spillage to disk, and WORKER_HEAP for objects small enough to be inlined in worker memory. Refer to metric_defs.cc for more information.", + unit="bytes", + targets=[ + Target( + expr='sum(ray_object_store_memory{{instance=~"$Instance",{global_filters}}}) by (Location)', + legend="{{Location}}", + ), + Target( + expr='sum(ray_resources{{Name="object_store_memory",instance=~"$Instance",{global_filters}}})', + legend="MAX", + ), + ], + ), + Panel( + id=28, + title="Scheduler GPUs (logical slots)", + description="Logical GPU usage of Ray. The dotted line indicates the total number of GPUs. The logical GPU is allocated by `num_gpus` arguments from tasks and actors. PENDING means the number of GPUs that will be available when new nodes are up after the autoscaler scales up.", + unit="GPUs", + targets=[ + Target( + expr='ray_resources{{Name="GPU",State="USED",instance=~"$Instance",{global_filters}}}', + legend="GPU Usage: {{instance}}", + ), + Target( + expr='sum(ray_resources{{Name="GPU",instance=~"$Instance",{global_filters}}})', + legend="MAX", + ), + # If max + pending > max, we display this value. + # In PromQL, (A and predicate) returns A when the predicate is satisfied. + Target( + expr=f"({MAX_PLUS_PENDING_GPUS} and {MAX_PLUS_PENDING_GPUS} > ({MAX_GPUS} or vector(0)))", + legend="MAX + PENDING", + ), + ], + ), + Panel( + id=40, + title="Scheduler Placement Groups", + description='Note: not impacted by "Instance" variable.\n\nCurrent number of placement groups in a particular state.\n\nState: the placement group state, as described by the rpc::PlacementGroupTable proto in gcs.proto.', + unit="placement groups", + targets=[ + Target( + expr="sum(ray_placement_groups{{{global_filters}}}) by (State)", + legend="{{State}}", + ) + ], + ), + Panel( + id=2, + title="Node CPU (hardware utilization)", + description="", + unit="cores", + targets=[ + Target( + expr='ray_node_cpu_utilization{{instance=~"$Instance", IsHeadNode="false", {global_filters}}} * ray_node_cpu_count{{instance=~"$Instance",{global_filters}}} / 100', + legend="CPU Usage: {{instance}}", + ), + Target( + expr='ray_node_cpu_utilization{{instance=~"$Instance", IsHeadNode="true", {global_filters}}} * ray_node_cpu_count{{instance=~"$Instance",{global_filters}}} / 100', + legend="CPU Usage: {{instance}} (head)", + ), + Target( + expr='sum(ray_node_cpu_count{{instance=~"$Instance",{global_filters}}})', + legend="MAX", + ), + ], + ), + Panel( + id=8, + title="Node GPU (hardware utilization)", + description="Node's physical (hardware) GPU usage. The dotted line indicates the total number of hardware GPUs in the cluster.", + unit="GPUs", + targets=[ + Target( + expr='ray_node_gpus_utilization{{instance=~"$Instance", IsHeadNode="false", {global_filters}}} / 100', + legend="GPU Usage: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", + ), + Target( + expr='ray_node_gpus_utilization{{instance=~"$Instance", IsHeadNode="true", {global_filters}}} / 100', + legend="GPU Usage: {{instance}} (head), gpu.{{GpuIndex}}, {{GpuDeviceName}}", + ), + Target( + expr='sum(ray_node_gpus_available{{instance=~"$Instance",{global_filters}}})', + legend="MAX", + ), + ], + ), + Panel( + id=6, + title="Node Disk", + description="Node's physical (hardware) disk usage. 
The dotted line indicates the total amount of disk space in the cluster.\n\nNOTE: When Ray is deployed within a container, this shows the disk usage from the host machine.", + unit="bytes", + targets=[ + Target( + expr='ray_node_disk_usage{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}', + legend="Disk Used: {{instance}}", + ), + Target( + expr='ray_node_disk_usage{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}', + legend="Disk Used: {{instance}} (head)", + ), + Target( + expr='sum(ray_node_disk_free{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}})', + legend="MAX", + ), + ], + ), + Panel( + id=32, + title="Node Disk IO Speed", + description="Disk IO per node.", + unit="Bps", + targets=[ + Target( + expr='ray_node_disk_io_write_speed{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}', + legend="Write: {{instance}}", + ), + Target( + expr='ray_node_disk_io_write_speed{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}', + legend="Write: {{instance}} (head)", + ), + Target( + expr='ray_node_disk_io_read_speed{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}', + legend="Read: {{instance}}", + ), + Target( + expr='ray_node_disk_io_read_speed{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}', + legend="Read: {{instance}} (head)", + ), + ], + ), + Panel( + id=4, + title="Node Memory (heap + object store)", + description="The physical (hardware) memory usage for each node. The dotted line indicates the total amount of memory in the cluster. Node memory is a sum of object store memory (shared memory) and heap memory.\n\nNote: If Ray is deployed within a container, the total memory could be lower than that of the host machine because Ray may reserve some additional memory space outside the container.", + unit="bytes", + targets=[ + Target( + expr='ray_node_mem_used{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}', + legend="Memory Used: {{instance}}", + ), + Target( + expr='ray_node_mem_used{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}', + legend="Memory Used: {{instance}} (head)", + ), + Target( + expr='sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})', + legend="MAX", + ), + ], + ), + Panel( + id=48, + title="Node Memory Percentage (heap + object store)", + description="The percentage of physical (hardware) memory usage for each node.", + unit="%", + targets=[ + Target( + expr='ray_node_mem_used{{instance=~"$Instance", IsHeadNode="false", {global_filters}}}/ray_node_mem_total{{instance=~"$Instance", IsHeadNode="false", {global_filters}}} * 100', + legend="Memory Used: {{instance}}", + ), + Target( + expr='ray_node_mem_used{{instance=~"$Instance", IsHeadNode="true", {global_filters}}}/ray_node_mem_total{{instance=~"$Instance", IsHeadNode="true", {global_filters}}} * 100', + legend="Memory Used: {{instance}} (head)", + ), + ], + fill=0, + stack=False, + ), + Panel( + id=44, + title="Node Out of Memory Failures by Name", + description="The number of tasks and actors killed by the Ray Out of Memory killer due to high memory pressure. Metrics are broken down by IP and name. 
https://docs.ray.io/en/master/ray-core/scheduling/ray-oom-prevention.html.", + unit="failures", + targets=[ + Target( + expr='ray_memory_manager_worker_eviction_total{{instance=~"$Instance",{global_filters}}}', + legend="OOM Killed: {{Name}}, {{instance}}", + ), + ], + ), + Panel( + id=34, + title="Node Memory by Component", + description="The physical (hardware) memory usage across the cluster, broken down by component. This reports the summed RSS-SHM per Ray component, which corresponds to an approximate memory usage per process. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process names (which contain method names) of running tasks/actors.", + unit="bytes", + targets=[ + Target( + expr='(sum(ray_component_rss_mb{{instance=~"$Instance",{global_filters}}} * 1e6) by (Component)) - (sum(ray_component_mem_shared_bytes{{instance=~"$Instance",{global_filters}}}) by (Component))', + legend="{{Component}}", + ), + Target( + expr='sum(ray_node_mem_shared_bytes{{instance=~"$Instance",{global_filters}}})', + legend="shared_memory", + ), + Target( + expr='sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})', + legend="MAX", + ), + ], + ), + Panel( + id=37, + title="Node CPU by Component", + description="The physical (hardware) CPU usage across the cluster, broken down by component. This reports the summed CPU usage per Ray component. Ray components consist of system components (e.g., raylet, gcs, dashboard, or agent) and the process names (which contain method names) of running tasks/actors.", + unit="cores", + targets=[ + Target( + # ray_component_cpu_percentage returns a percentage that can be > 100, meaning the component uses more than one CPU. + expr='sum(ray_component_cpu_percentage{{instance=~"$Instance",{global_filters}}}) by (Component) / 100', + legend="{{Component}}", + ), + Target( + expr='sum(ray_node_cpu_count{{instance=~"$Instance",{global_filters}}})', + legend="MAX", + ), + ], + ), + Panel( + id=18, + title="Node GPU Memory (GRAM)", + description="The physical (hardware) GPU memory usage for each node. The dotted line indicates the total amount of GPU memory in the cluster.", + unit="bytes", + targets=[ + Target( + expr='ray_node_gram_used{{instance=~"$Instance",{global_filters}}} * 1024 * 1024', + legend="Used GRAM: {{instance}}, gpu.{{GpuIndex}}, {{GpuDeviceName}}", + ), + Target( + expr='(sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}})) * 1024 * 1024', + legend="MAX", + ), + ], + ), + Panel( + id=20, + title="Node Network", + description="Network speed per node", + unit="Bps", + targets=[ + Target( + expr='ray_node_network_receive_speed{{instance=~"$Instance",{global_filters}}}', + legend="Recv: {{instance}}", + ), + Target( + expr='ray_node_network_send_speed{{instance=~"$Instance",{global_filters}}}', + legend="Send: {{instance}}", + ), + ], + ), + Panel( + id=24, + title="Node Count", + description='Note: not impacted by "Instance" variable.\n\nThe total number of active, failed, and pending nodes in the cluster. \n\nACTIVE: A node is alive and available.\n\nFAILED: A node is dead and not available. The node is considered dead when the raylet process on the node is terminated. The node enters the failed state if it cannot be provisioned (e.g., there is no available node from the cloud provider) or fails to set up (e.g., setup_commands have errors). \n\nPENDING: A node is being started by the Ray cluster launcher. 
The node is currently unavailable because it is being provisioned and initialized.', + unit="nodes", + targets=[ + Target( + expr="sum(autoscaler_active_nodes{{{global_filters}}}) by (NodeType)", + legend="Active Nodes: {{NodeType}}", + ), + Target( + expr="sum(autoscaler_recently_failed_nodes{{{global_filters}}}) by (NodeType)", + legend="Failed Nodes: {{NodeType}}", + ), + Target( + expr="sum(autoscaler_pending_nodes{{{global_filters}}}) by (NodeType)", + legend="Pending Nodes: {{NodeType}}", + ), + ], + ), + Panel( + id=41, + title="Cluster Utilization", + description="Aggregated utilization of all physical resources (CPU, GPU, memory, disk, etc.) across the cluster.", + unit="%", + targets=[ + # CPU + Target( + expr='avg(ray_node_cpu_utilization{{instance=~"$Instance",{global_filters}}})', + legend="CPU (physical)", + ), + # GPU + Target( + expr='sum(ray_node_gpus_utilization{{instance=~"$Instance",{global_filters}}}) / on() (sum(autoscaler_cluster_resources{{resource="GPU",instance=~"$Instance",{global_filters}}}) or vector(0))', + legend="GPU (physical)", + ), + # Memory + Target( + expr='sum(ray_node_mem_used{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_mem_total{{instance=~"$Instance",{global_filters}}})) * 100', + legend="Memory (RAM)", + ), + # GRAM + Target( + expr='sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_gram_available{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_gram_used{{instance=~"$Instance",{global_filters}}})) * 100', + legend="GRAM", + ), + # Object Store + Target( + expr='sum(ray_object_store_memory{{instance=~"$Instance",{global_filters}}}) / on() sum(ray_resources{{Name="object_store_memory",instance=~"$Instance",{global_filters}}}) * 100', + legend="Object Store Memory", + ), + # Disk + Target( + expr='sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}}) / on() (sum(ray_node_disk_free{{instance=~"$Instance",{global_filters}}}) + sum(ray_node_disk_usage{{instance=~"$Instance",{global_filters}}})) * 100', + legend="Disk", + ), + ], + fill=0, + stack=False, + ), +] + + +ids = [] +for panel in DEFAULT_GRAFANA_PANELS: + ids.append(panel.id) +assert len(ids) == len( + set(ids) +), f"Duplicated id found. Use unique id for each panel. {ids}" + +default_dashboard_config = DashboardConfig( + name="DEFAULT", + default_uid="rayDefaultDashboard", + panels=DEFAULT_GRAFANA_PANELS, + standard_global_filters=[ + 'SessionName=~"$SessionName"', + 'ray_io_cluster=~"$Cluster"', + ], + base_json_file_name="default_grafana_dashboard_base.json", +) diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_dashboard_panels.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_dashboard_panels.py new file mode 100644 index 0000000000000000000000000000000000000000..9bc72fbe8b7c5bb32e8eb7a467c4c21b64c84a27 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_dashboard_panels.py @@ -0,0 +1,420 @@ +# ruff: noqa: E501 + +from ray.dashboard.modules.metrics.dashboards.common import ( + DashboardConfig, + GridPos, + Panel, + Target, +) + +SERVE_GRAFANA_PANELS = [ + Panel( + id=5, + title="Cluster Utilization", + description="Aggregated utilization of all physical resources (CPU, GPU, memory, disk, etc.) across the cluster. 
Ignores application variable.", + unit="%", + targets=[ + # CPU + Target( + expr="avg(ray_node_cpu_utilization{{{global_filters}}})", + legend="CPU (physical)", + ), + # GPU + Target( + expr="sum(ray_node_gpus_utilization{{{global_filters}}}) / on() (sum(autoscaler_cluster_resources{{resource='GPU',{global_filters}}}) or vector(0))", + legend="GPU (physical)", + ), + # Memory + Target( + expr="sum(ray_node_mem_used{{{global_filters}}}) / on() (sum(ray_node_mem_total{{{global_filters}}})) * 100", + legend="Memory (RAM)", + ), + # GRAM + Target( + expr="sum(ray_node_gram_used{{{global_filters}}}) / on() (sum(ray_node_gram_available{{{global_filters}}}) + sum(ray_node_gram_used{{{global_filters}}})) * 100", + legend="GRAM", + ), + # Object Store + Target( + expr='sum(ray_object_store_memory{{{global_filters}}}) / on() sum(ray_resources{{Name="object_store_memory",{global_filters}}}) * 100', + legend="Object Store Memory", + ), + # Disk + Target( + expr="sum(ray_node_disk_usage{{{global_filters}}}) / on() (sum(ray_node_disk_free{{{global_filters}}}) + sum(ray_node_disk_usage{{{global_filters}}})) * 100", + legend="Disk", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(0, 0, 12, 8), + ), + Panel( + id=7, + title="QPS per application", + description="QPS for each selected application.", + unit="qps", + targets=[ + Target( + expr='sum(rate(ray_serve_num_http_requests_total{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route)', + legend="{{application, route}}", + ), + Target( + expr='sum(rate(ray_serve_num_grpc_requests_total{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method)', + legend="{{application, method}}", + ), + ], + grid_pos=GridPos(12, 0, 12, 8), + ), + Panel( + id=8, + title="Error QPS per application", + description="Error QPS for each selected application.", + unit="qps", + targets=[ + Target( + expr='sum(rate(ray_serve_num_http_error_requests_total{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route)', + legend="{{application, route}}", + ), + Target( + expr='sum(rate(ray_serve_num_grpc_error_requests_total{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method)', + legend="{{application, method}}", + ), + ], + grid_pos=GridPos(0, 1, 12, 8), + ), + Panel( + id=17, + title="Error QPS per application per error code", + description="Error QPS for each selected application.", + unit="qps", + targets=[ + Target( + expr='sum(rate(ray_serve_num_http_error_requests_total{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route, error_code)', + legend="{{application, route, error_code}}", + ), + Target( + expr='sum(rate(ray_serve_num_grpc_error_requests_total{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method, error_code)', + legend="{{application, method, error_code}}", + ), + ], + grid_pos=GridPos(12, 1, 12, 8), + ), + Panel( + id=12, + title="P50 latency per application", + description="P50 latency for selected applications.", + unit="ms", + targets=[ + Target( + expr='histogram_quantile(0.5, sum(rate(ray_serve_http_request_latency_ms_bucket{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route, 
le))', + legend="{{application, route}}", + ), + Target( + expr='histogram_quantile(0.5, sum(rate(ray_serve_grpc_request_latency_ms_bucket{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method, le))', + legend="{{application, method}}", + ), + Target( + expr='histogram_quantile(0.5, sum(rate({{__name__=~ "ray_serve_(http|grpc)_request_latency_ms_bucket",application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))', + legend="Total", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(0, 2, 8, 8), + ), + Panel( + id=15, + title="P90 latency per application", + description="P90 latency for selected applications.", + unit="ms", + targets=[ + Target( + expr='histogram_quantile(0.9, sum(rate(ray_serve_http_request_latency_ms_bucket{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route, le))', + legend="{{application, route}}", + ), + Target( + expr='histogram_quantile(0.9, sum(rate(ray_serve_grpc_request_latency_ms_bucket{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method, le))', + legend="{{application, method}}", + ), + Target( + expr='histogram_quantile(0.9, sum(rate({{__name__=~ "ray_serve_(http|grpc)_request_latency_ms_bucket",application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))', + legend="Total", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(8, 2, 8, 8), + ), + Panel( + id=16, + title="P99 latency per application", + description="P99 latency for selected applications.", + unit="ms", + targets=[ + Target( + expr='histogram_quantile(0.99, sum(rate(ray_serve_http_request_latency_ms_bucket{{application=~"$Application",application!~"",route=~"$HTTP_Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, route, le))', + legend="{{application, route}}", + ), + Target( + expr='histogram_quantile(0.99, sum(rate(ray_serve_grpc_request_latency_ms_bucket{{application=~"$Application",application!~"",method=~"$gRPC_Method",{global_filters}}}[5m])) by (application, method, le))', + legend="{{application, method}}", + ), + Target( + expr='histogram_quantile(0.99, sum(rate({{__name__=~ "ray_serve_(http|grpc)_request_latency_ms_bucket",application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))', + legend="Total", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(16, 2, 8, 8), + ), + Panel( + id=2, + title="Replicas per deployment", + description='Number of replicas per deployment. 
Ignores "Application" variable.', + unit="replicas", + targets=[ + Target( + expr="sum(ray_serve_deployment_replica_healthy{{{global_filters}}}) by (application, deployment)", + legend="{{application, deployment}}", + ), + ], + grid_pos=GridPos(0, 3, 8, 8), + ), + Panel( + id=13, + title="QPS per deployment", + description="QPS for each deployment.", + unit="qps", + targets=[ + Target( + expr='sum(rate(ray_serve_deployment_request_counter_total{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (application, deployment)', + legend="{{application, deployment}}", + ), + ], + grid_pos=GridPos(8, 3, 8, 8), + ), + Panel( + id=14, + title="Error QPS per deployment", + description="Error QPS for each deplyoment.", + unit="qps", + targets=[ + Target( + expr='sum(rate(ray_serve_deployment_error_counter_total{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (application, deployment)', + legend="{{application, deployment}}", + ), + ], + grid_pos=GridPos(16, 3, 8, 8), + ), + Panel( + id=9, + title="P50 latency per deployment", + description="P50 latency per deployment.", + unit="ms", + targets=[ + Target( + expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (application, deployment, le))', + legend="{{application, deployment}}", + ), + Target( + expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))', + legend="Total", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(0, 4, 8, 8), + ), + Panel( + id=10, + title="P90 latency per deployment", + description="P90 latency per deployment.", + unit="ms", + targets=[ + Target( + expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (application, deployment, le))', + legend="{{application, deployment}}", + ), + Target( + expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))', + legend="Total", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(8, 4, 8, 8), + ), + Panel( + id=11, + title="P99 latency per deployment", + description="P99 latency per deployment.", + unit="ms", + targets=[ + Target( + expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (application, deployment, le))', + legend="{{application, deployment}}", + ), + Target( + expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{application=~"$Application",application!~"",{global_filters}}}[5m])) by (le))', + legend="Total", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(16, 4, 8, 8), + ), + Panel( + id=3, + title="Queue size per deployment", + description='Number of requests queued per deployment. Ignores "Application" variable.', + unit="requests", + targets=[ + Target( + expr="sum(ray_serve_deployment_queued_queries{{{global_filters}}}) by (application, deployment)", + legend="{{application, deployment}}", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(0, 5, 8, 8), + ), + Panel( + id=4, + title="Node count", + description='Number of nodes in this cluster. 
Ignores "Application" variable.', + unit="nodes", + targets=[ + # TODO(aguo): Update this to use autoscaler metrics instead + Target( + expr="sum(autoscaler_active_nodes{{{global_filters}}}) by (NodeType)", + legend="Active Nodes: {{NodeType}}", + ), + Target( + expr="sum(autoscaler_recently_failed_nodes{{{global_filters}}}) by (NodeType)", + legend="Failed Nodes: {{NodeType}}", + ), + Target( + expr="sum(autoscaler_pending_nodes{{{global_filters}}}) by (NodeType)", + legend="Pending Nodes: {{NodeType}}", + ), + ], + grid_pos=GridPos(8, 5, 8, 8), + ), + Panel( + id=6, + title="Node network", + description='Network speed per node. Ignores "Application" variable.', + unit="Bps", + targets=[ + Target( + expr="sum(ray_node_network_receive_speed{{{global_filters}}}) by (instance)", + legend="Recv: {{instance}}", + ), + Target( + expr="sum(ray_node_network_send_speed{{{global_filters}}}) by (instance)", + legend="Send: {{instance}}", + ), + ], + fill=1, + linewidth=2, + stack=False, + grid_pos=GridPos(16, 5, 8, 8), + ), + Panel( + id=20, + title="Ongoing HTTP Requests", + description="The number of ongoing requests in the HTTP Proxy.", + unit="requests", + targets=[ + Target( + expr="ray_serve_num_ongoing_http_requests{{{global_filters}}}", + legend="Ongoing HTTP Requests", + ), + ], + grid_pos=GridPos(0, 6, 8, 8), + ), + Panel( + id=21, + title="Ongoing gRPC Requests", + description="The number of ongoing requests in the gRPC Proxy.", + unit="requests", + targets=[ + Target( + expr="ray_serve_num_ongoing_grpc_requests{{{global_filters}}}", + legend="Ongoing gRPC Requests", + ), + ], + grid_pos=GridPos(8, 6, 8, 8), + ), + Panel( + id=22, + title="Scheduling Tasks", + description="The number of request scheduling tasks in the router.", + unit="tasks", + targets=[ + Target( + expr="ray_serve_num_scheduling_tasks{{{global_filters}}}", + legend="Scheduling Tasks", + ), + ], + grid_pos=GridPos(16, 6, 8, 8), + ), + Panel( + id=23, + title="Scheduling Tasks in Backoff", + description="The number of request scheduling tasks in the router that are undergoing backoff.", + unit="tasks", + targets=[ + Target( + expr="ray_serve_num_scheduling_tasks_in_backoff{{{global_filters}}}", + legend="Scheduling Tasks in Backoff", + ), + ], + grid_pos=GridPos(0, 7, 8, 8), + ), + Panel( + id=24, + title="Controller Control Loop Duration", + description="The duration of the last control loop.", + unit="seconds", + targets=[ + Target( + expr="ray_serve_controller_control_loop_duration_s{{{global_filters}}}", + legend="Control Loop Duration", + ), + ], + grid_pos=GridPos(8, 7, 8, 8), + ), + Panel( + id=25, + title="Number of Control Loops", + description="The number of control loops performed by the controller. Increases monotonically over the controller's lifetime.", + unit="loops", + targets=[ + Target( + expr="ray_serve_controller_num_control_loops{{{global_filters}}}", + legend="Control Loops", + ), + ], + grid_pos=GridPos(16, 7, 8, 8), + ), +] + +ids = [] +for panel in SERVE_GRAFANA_PANELS: + ids.append(panel.id) +assert len(ids) == len( + set(ids) +), f"Duplicated id found. Use unique id for each panel. 
{ids}" + +serve_dashboard_config = DashboardConfig( + name="SERVE", + default_uid="rayServeDashboard", + panels=SERVE_GRAFANA_PANELS, + standard_global_filters=[ + 'ray_io_cluster=~"$Cluster"', + ], + base_json_file_name="serve_grafana_dashboard_base.json", +) diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_deployment_dashboard_panels.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_deployment_dashboard_panels.py new file mode 100644 index 0000000000000000000000000000000000000000..9d0dfe726a7372cb7647c6b5f94351c3c794d4b0 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_deployment_dashboard_panels.py @@ -0,0 +1,259 @@ +# ruff: noqa: E501 + +from ray.dashboard.modules.metrics.dashboards.common import ( + DashboardConfig, + GridPos, + Panel, + Target, +) + +SERVE_DEPLOYMENT_GRAFANA_PANELS = [ + Panel( + id=1, + title="Replicas per deployment", + description='Number of replicas per deployment. Ignores "Route" variable.', + unit="replicas", + targets=[ + Target( + expr="sum(ray_serve_deployment_replica_healthy{{{global_filters}}}) by (application, deployment)", + legend="{{application, deployment}}", + ), + ], + grid_pos=GridPos(0, 0, 8, 8), + ), + Panel( + id=2, + title="QPS per replica", + description="QPS for each replica.", + unit="qps", + targets=[ + Target( + expr='sum(rate(ray_serve_deployment_request_counter_total{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica)', + legend="{{replica}}", + ), + ], + grid_pos=GridPos(8, 0, 8, 8), + ), + Panel( + id=3, + title="Error QPS per replica", + description="Error QPS for each replica.", + unit="qps", + targets=[ + Target( + expr='sum(rate(ray_serve_deployment_error_counter_total{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica)', + legend="{{replica}}", + ), + ], + grid_pos=GridPos(16, 0, 8, 8), + ), + Panel( + id=4, + title="P50 latency per replica", + description="P50 latency per replica.", + unit="ms", + targets=[ + Target( + expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica, le))', + legend="{{replica}}", + ), + Target( + expr='histogram_quantile(0.5, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (le))', + legend="Total", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(0, 1, 8, 8), + ), + Panel( + id=5, + title="P90 latency per replica", + description="P90 latency per replica.", + unit="ms", + targets=[ + Target( + expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica, le))', + legend="{{replica}}", + ), + Target( + expr='histogram_quantile(0.9, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (le))', + legend="Total", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(8, 1, 8, 8), + ), + Panel( + id=6, + title="P99 latency per replica", + description="P99 latency per replica.", + unit="ms", + targets=[ + Target( + expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",route!~"/-/.*",{global_filters}}}[5m])) by (application, deployment, replica, le))', + 
legend="{{replica}}", + ), + Target( + expr='histogram_quantile(0.99, sum(rate(ray_serve_deployment_processing_latency_ms_bucket{{route=~"$Route",{global_filters}}}[5m])) by (le))', + legend="Total", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(16, 1, 8, 8), + ), + Panel( + id=7, + title="Queue size per deployment", + description='Number of requests queued per deployment. Ignores "Replica" and "Route" variable.', + unit="requests", + targets=[ + Target( + expr="sum(ray_serve_deployment_queued_queries{{{global_filters}}}) by (application, deployment)", + legend="{{application, deployment}}", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(0, 2, 12, 8), + ), + Panel( + id=8, + title="Running requests per replica", + description="Current running requests for each replica.", + unit="requests", + targets=[ + Target( + expr="sum(ray_serve_replica_processing_queries{{{global_filters}}}) by (application, deployment, replica)", + legend="{{replica}}", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(12, 2, 12, 8), + ), + Panel( + id=9, + title="Multiplexed models per replica", + description="The number of multiplexed models for each replica.", + unit="models", + targets=[ + Target( + expr="sum(ray_serve_num_multiplexed_models{{{global_filters}}}) by (application, deployment, replica)", + legend="{{replica}}", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(0, 3, 8, 8), + ), + Panel( + id=10, + title="Multiplexed model loads per replica", + description="The number of times of multiplexed models loaded for each replica.", + unit="times", + targets=[ + Target( + expr="sum(ray_serve_multiplexed_models_load_counter_total{{{global_filters}}}) by (application, deployment, replica)", + legend="{{replica}}", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(8, 3, 8, 8), + ), + Panel( + id=11, + title="Multiplexed model unloads per replica", + description="The number of times of multiplexed models unloaded for each replica.", + unit="times", + targets=[ + Target( + expr="sum(ray_serve_multiplexed_models_unload_counter_total{{{global_filters}}}) by (application, deployment, replica)", + legend="{{replica}}", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(16, 3, 8, 8), + ), + Panel( + id=12, + title="P99 latency of multiplexed model loads per replica", + description="P99 latency of mutliplexed model load per replica.", + unit="ms", + targets=[ + Target( + expr="histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_load_latency_ms_bucket{{{global_filters}}}[5m])) by (application, deployment, replica, le))", + legend="{{replica}}", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(0, 4, 8, 8), + ), + Panel( + id=13, + title="P99 latency of multiplexed model unloads per replica", + description="P99 latency of mutliplexed model unload per replica.", + unit="ms", + targets=[ + Target( + expr="histogram_quantile(0.99, sum(rate(ray_serve_multiplexed_model_unload_latency_ms_bucket{{{global_filters}}}[5m])) by (application, deployment, replica, le))", + legend="{{replica}}", + ), + ], + fill=0, + stack=False, + grid_pos=GridPos(8, 4, 8, 8), + ), + Panel( + id=14, + title="Multiplexed model ids per replica", + description="The ids of multiplexed models for each replica.", + unit="model", + targets=[ + Target( + expr="ray_serve_registered_multiplexed_model_id{{{global_filters}}}", + legend="{{replica}}:{{model_id}}", + ), + ], + grid_pos=GridPos(16, 4, 8, 8), + stack=False, + ), + Panel( + id=15, + title="Multiplexed model cache hit rate", + description="The cache hit rate 
of multiplexed models for the deployment.", + unit="%", + targets=[ + Target( + expr="(1 - sum(rate(ray_serve_multiplexed_models_load_counter_total{{{global_filters}}}[5m]))/sum(rate(ray_serve_multiplexed_get_model_requests_counter_total{{{global_filters}}}[5m])))", + legend="{{replica}}", + ), + ], + grid_pos=GridPos(0, 5, 8, 8), + ), +] + +ids = [] +for panel in SERVE_DEPLOYMENT_GRAFANA_PANELS: + ids.append(panel.id) +assert len(ids) == len( + set(ids) +), f"Duplicated id found. Use unique id for each panel. {ids}" + +serve_deployment_dashboard_config = DashboardConfig( + name="SERVE_DEPLOYMENT", + default_uid="rayServeDeploymentDashboard", + panels=SERVE_DEPLOYMENT_GRAFANA_PANELS, + standard_global_filters=[ + 'application=~"$Application"', + 'deployment=~"$Deployment"', + 'replica=~"$Replica"', + 'ray_io_cluster=~"$Cluster"', + ], + base_json_file_name="serve_deployment_grafana_dashboard_base.json", +) diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_deployment_grafana_dashboard_base.json b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_deployment_grafana_dashboard_base.json new file mode 100644 index 0000000000000000000000000000000000000000..e60e6e7fdb03e08242e56b4ab481d4393783707a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_deployment_grafana_dashboard_base.json @@ -0,0 +1,223 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "iteration": 1667344411089, + "links": [], + "panels": [], + "refresh": false, + "schemaVersion": 27, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false + }, + "description": "Filter queries to a specific Prometheus data source.", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, application)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Application", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, application)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",{global_filters}}}, deployment)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Deployment", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",{global_filters}}}, deployment)", + "refId": 
"Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",deployment=~\"$Deployment\",{global_filters}}}, replica)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Replica", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_replica_healthy{{application=~\"$Application\",deployment=~\"$Deployment\",{global_filters}}}, replica)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_request_counter{{deployment=~\"$Deployment\",{global_filters}}}, route)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Route", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_request_counter{{deployment=~\"$Deployment\",{global_filters}}}, route)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false + }, + "datasource": "${datasource}", + "definition": "label_values(ray_node_network_receive_speed{{{global_filters}}}, ray_io_cluster)", + "description": "Filter queries to specific Ray clusters for KubeRay. When ingesting metrics across multiple ray clusters, the ray_io_cluster label should be set per cluster. 
For KubeRay users, this is done automatically with Prometheus PodMonitor.", + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "Cluster", + "options": [], + "query": { + "query": "label_values(ray_node_network_receive_speed{{{global_filters}}}, ray_io_cluster)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 2, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "rayMeta": ["excludesSystemRoutes"], + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Serve Deployment Dashboard", + "uid": "rayServeDeploymentDashboard", + "version": 1 +} diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_grafana_dashboard_base.json b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_grafana_dashboard_base.json new file mode 100644 index 0000000000000000000000000000000000000000..68c7b14bf0f706da92e8f13aba2ad910268c853c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/dashboards/serve_grafana_dashboard_base.json @@ -0,0 +1,188 @@ +{ + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": "-- Grafana --", + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "name": "Annotations & Alerts", + "type": "dashboard" + } + ] + }, + "editable": true, + "gnetId": null, + "graphTooltip": 0, + "iteration": 1667344411089, + "links": [], + "panels": [], + "refresh": false, + "schemaVersion": 27, + "style": "dark", + "tags": [], + "templating": { + "list": [ + { + "current": { + "selected": false + }, + "description": "Filter queries to a specific Prometheus data source.", + "hide": 2, + "includeAll": false, + "multi": false, + "name": "datasource", + "options": [], + "query": "prometheus", + "refresh": 1, + "regex": "", + "skipUrlSync": false, + "type": "datasource" + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, application)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": null, + "multi": true, + "name": "Application", + "options": [], + "query": { + "query": "label_values(ray_serve_deployment_replica_healthy{{{global_filters}}}, application)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" + ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_num_http_requests_total{{{global_filters}}}, route)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": "HTTP Route", + "multi": true, + "name": "HTTP_Route", + "options": [], + "query": { + "query": "label_values(ray_serve_num_http_requests_total{{{global_filters}}}, route)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "allValue": ".*", + "current": { + "selected": true, + "text": [ + "All" 
+ ], + "value": [ + "$__all" + ] + }, + "datasource": "${datasource}", + "definition": "label_values(ray_serve_num_grpc_requests_total{{{global_filters}}}, method)", + "description": null, + "error": null, + "hide": 0, + "includeAll": true, + "label": "gRPC Service Method", + "multi": true, + "name": "gRPC_Method", + "options": [], + "query": { + "query": "label_values(ray_serve_num_grpc_requests_total{{{global_filters}}}, method)", + "refId": "Prometheus-Instance-Variable-Query" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + }, + { + "current": { + "selected": false + }, + "datasource": "${datasource}", + "definition": "label_values(ray_node_network_receive_speed{{{global_filters}}}, ray_io_cluster)", + "description": "Filter queries to specific Ray clusters for KubeRay. When ingesting metrics across multiple ray clusters, the ray_io_cluster label should be set per cluster. For KubeRay users, this is done automatically with Prometheus PodMonitor.", + "error": null, + "hide": 0, + "includeAll": false, + "label": null, + "multi": false, + "name": "Cluster", + "options": [], + "query": { + "query": "label_values(ray_node_network_receive_speed{{{global_filters}}}, ray_io_cluster)", + "refId": "StandardVariableQuery" + }, + "refresh": 2, + "regex": "", + "skipUrlSync": false, + "sort": 2, + "tagValuesQuery": "", + "tags": [], + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "rayMeta": ["excludesSystemRoutes"], + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": {}, + "timezone": "", + "title": "Serve Dashboard", + "uid": "rayServeDashboard", + "version": 1 +} diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/export/prometheus/prometheus.yml b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/export/prometheus/prometheus.yml new file mode 100644 index 0000000000000000000000000000000000000000..0f03553f6454ba86a42e9221dd2f80cbe50bd4ab --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/export/prometheus/prometheus.yml @@ -0,0 +1,12 @@ +# my global config +global: + scrape_interval: 10s # Set the scrape interval to every 10 seconds. Default is every 1 minute. + evaluation_interval: 10s # Evaluate rules every 10 seconds. The default is every 1 minute. + # scrape_timeout is set to the global default (10s). + +scrape_configs: +# Scrape from each Ray node as defined in the service_discovery.json provided by Ray. 
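+# Editorial example (shape assumed; not part of the original file): the service +# discovery file written by Ray is a Prometheus file_sd target list, e.g. +# [{"labels": {...}, "targets": ["<node_ip>:<metrics_export_port>", ...]}]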
+- job_name: 'ray' + file_sd_configs: + - files: + - '/tmp/ray/prom_metrics_service_discovery.json' diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/grafana_dashboard_factory.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/grafana_dashboard_factory.py new file mode 100644 index 0000000000000000000000000000000000000000..8d328c7084012834f66261638e7552fcb823cfa3 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/grafana_dashboard_factory.py @@ -0,0 +1,301 @@ +import copy +import json +import os +from dataclasses import asdict +from typing import List, Tuple + +import ray +from ray.dashboard.modules.metrics.dashboards.common import DashboardConfig, Panel +from ray.dashboard.modules.metrics.dashboards.data_dashboard_panels import ( + data_dashboard_config, +) +from ray.dashboard.modules.metrics.dashboards.default_dashboard_panels import ( + default_dashboard_config, +) +from ray.dashboard.modules.metrics.dashboards.serve_dashboard_panels import ( + serve_dashboard_config, +) +from ray.dashboard.modules.metrics.dashboards.serve_deployment_dashboard_panels import ( + serve_deployment_dashboard_config, +) + +GRAFANA_DASHBOARD_UID_OVERRIDE_ENV_VAR_TEMPLATE = "RAY_GRAFANA_{name}_DASHBOARD_UID" +GRAFANA_DASHBOARD_GLOBAL_FILTERS_OVERRIDE_ENV_VAR_TEMPLATE = ( + "RAY_GRAFANA_{name}_DASHBOARD_GLOBAL_FILTERS" +) + +TARGET_TEMPLATE = { + "exemplar": True, + "expr": "0", + "interval": "", + "legendFormat": "", + "queryType": "randomWalk", + "refId": "A", +} + + +PANEL_TEMPLATE = { + "aliasColors": {}, + "bars": False, + "dashLength": 10, + "dashes": False, + "datasource": r"${datasource}", + "description": "", + "fieldConfig": {"defaults": {}, "overrides": []}, + "fill": 10, + "fillGradient": 0, + "gridPos": {"h": 8, "w": 12, "x": 0, "y": 0}, + "hiddenSeries": False, + "id": 26, + "legend": { + "alignAsTable": True, + "avg": False, + "current": True, + "hideEmpty": False, + "hideZero": True, + "max": False, + "min": False, + "rightSide": False, + "show": True, + "sort": "current", + "sortDesc": True, + "total": False, + "values": True, + }, + "lines": True, + "linewidth": 1, + "nullPointMode": "null", + "options": {"alertThreshold": True}, + "percentage": False, + "pluginVersion": "7.5.17", + "pointradius": 2, + "points": False, + "renderer": "flot", + "seriesOverrides": [ + { + "$$hashKey": "object:2987", + "alias": "MAX", + "dashes": True, + "color": "#1F60C4", + "fill": 0, + "stack": False, + }, + { + "$$hashKey": "object:78", + "alias": "/FINISHED|FAILED|DEAD|REMOVED|Failed Nodes:/", + "hiddenSeries": True, + }, + { + "$$hashKey": "object:2987", + "alias": "MAX + PENDING", + "dashes": True, + "color": "#777777", + "fill": 0, + "stack": False, + }, + ], + "spaceLength": 10, + "stack": True, + "steppedLine": False, + "targets": [], + "thresholds": [], + "timeFrom": None, + "timeRegions": [], + "timeShift": None, + "title": "", + "tooltip": {"shared": True, "sort": 0, "value_type": "individual"}, + "type": "graph", + "xaxis": { + "buckets": None, + "mode": "time", + "name": None, + "show": True, + "values": [], + }, + "yaxes": [ + { + "$$hashKey": "object:628", + "format": "units", + "label": "", + "logBase": 1, + "max": None, + "min": "0", + "show": True, + }, + { + "$$hashKey": "object:629", + "format": "short", + "label": None, + "logBase": 1, + "max": None, + "min": None, + "show": True, + }, + ], + "yaxis": {"align": False, "alignLevel": None}, +} + + +def _read_configs_for_dashboard( + dashboard_config: 
DashboardConfig, +) -> Tuple[str, List[str]]: + """ + Reads environment variable configs for overriding uid or global_filters for a given + dashboard. + + Returns: + Tuple with format uid, global_filters + """ + uid = ( + os.environ.get( + GRAFANA_DASHBOARD_UID_OVERRIDE_ENV_VAR_TEMPLATE.format( + name=dashboard_config.name + ) + ) + or dashboard_config.default_uid + ) + global_filters_str = ( + os.environ.get( + GRAFANA_DASHBOARD_GLOBAL_FILTERS_OVERRIDE_ENV_VAR_TEMPLATE.format( + name=dashboard_config.name + ) + ) + or "" + ) + global_filters = global_filters_str.split(",") + + return uid, global_filters + + +def generate_default_grafana_dashboard() -> Tuple[str, str]: + """ + Generates the dashboard output for the default dashboard and returns + both the content and the uid. + + Returns: + Tuple with format content, uid + """ + return _generate_grafana_dashboard(default_dashboard_config) + + +def generate_serve_grafana_dashboard() -> Tuple[str, str]: + """ + Generates the dashboard output for the serve dashboard and returns + both the content and the uid. + + Returns: + Tuple with format content, uid + """ + return _generate_grafana_dashboard(serve_dashboard_config) + + +def generate_serve_deployment_grafana_dashboard() -> Tuple[str, str]: + """ + Generates the dashboard output for the serve deployment dashboard and returns + both the content and the uid. + + Returns: + Tuple with format content, uid + """ + return _generate_grafana_dashboard(serve_deployment_dashboard_config) + + +def generate_data_grafana_dashboard() -> Tuple[str, str]: + """ + Generates the dashboard output for the data dashboard and returns + both the content and the uid. + + Returns: + Tuple with format content, uid + """ + return _generate_grafana_dashboard(data_dashboard_config) + + +def _generate_grafana_dashboard(dashboard_config: DashboardConfig) -> Tuple[str, str]: + """ + Returns: + Tuple with format dashboard_content, uid + """ + uid, global_filters = _read_configs_for_dashboard(dashboard_config) + panels = _generate_grafana_panels(dashboard_config, global_filters) + base_file_name = dashboard_config.base_json_file_name + + base_json_path = os.path.join(os.path.dirname(__file__), "dashboards", base_file_name) + with open(base_json_path) as f: + base_json = json.load(f) + base_json["panels"] = panels + # Update variables to use global_filters + global_filters_str = ",".join(global_filters) + variables = base_json.get("templating", {}).get("list", []) + for variable in variables: + if "definition" not in variable: + continue + variable["definition"] = variable["definition"].format( + global_filters=global_filters_str + ) + variable["query"]["query"] = variable["query"]["query"].format( + global_filters=global_filters_str + ) + + tags = base_json.get("tags", []) or [] + tags.append(f"rayVersion:{ray.__version__}") + base_json["tags"] = tags + base_json["uid"] = uid + # Ray metadata can be used to put arbitrary metadata + ray_meta = base_json.get("rayMeta", []) or [] + ray_meta.append("supportsGlobalFilterOverride") + base_json["rayMeta"] = ray_meta + return json.dumps(base_json, indent=4), uid
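+# Editorial usage sketch (assumed; not in the original source): +# os.environ["RAY_GRAFANA_DEFAULT_DASHBOARD_UID"] = "myUid" +# os.environ["RAY_GRAFANA_DEFAULT_DASHBOARD_GLOBAL_FILTERS"] = 'env=~"prod"' +# content, uid = generate_default_grafana_dashboard() +# # uid == "myUid", and every panel expr now includes the extra env=~"prod" filter.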
+ + +def _generate_grafana_panels( + config: DashboardConfig, global_filters: List[str] +) -> List[dict]: + out = [] + panel_global_filters = [*config.standard_global_filters, *global_filters] + for i, panel in enumerate(config.panels): + template = copy.deepcopy(PANEL_TEMPLATE) + template.update( + { + "title": panel.title, + "description": panel.description, + "id": panel.id, + "targets": _generate_targets(panel, panel_global_filters), + } + ) + if panel.grid_pos: + template["gridPos"] = asdict(panel.grid_pos) + else: + template["gridPos"]["y"] = i // 2 + template["gridPos"]["x"] = 12 * (i % 2) + template["yaxes"][0]["format"] = panel.unit + template["fill"] = panel.fill + template["stack"] = panel.stack + template["linewidth"] = panel.linewidth + out.append(template) + return out + + +def gen_incrementing_alphabets(length): + assert length <= 26, "we only support up to 26 targets at a time." + # 65: ascii code of 'A'. + return list(map(chr, range(65, 65 + length))) + + +def _generate_targets(panel: Panel, panel_global_filters: List[str]) -> List[dict]: + targets = [] + for target, ref_id in zip( + panel.targets, gen_incrementing_alphabets(len(panel.targets)) + ): + template = copy.deepcopy(TARGET_TEMPLATE) + template.update( + { + "expr": target.expr.format( + global_filters=",".join(panel_global_filters) + ), + "legendFormat": target.legend, + "refId": ref_id, + } + ) + targets.append(template) + return targets diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/install_and_start_prometheus.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/install_and_start_prometheus.py new file mode 100644 index 0000000000000000000000000000000000000000..ea0ff2459f6570cdf46773041f7f820af45cb8d6 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/install_and_start_prometheus.py @@ -0,0 +1,204 @@ +import logging +import os +import platform +import subprocess +import sys +import tarfile +from pathlib import Path + +import requests + +from ray.dashboard.consts import PROMETHEUS_CONFIG_INPUT_PATH + +# Configure basic logging +logging.basicConfig( + level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" +) + +FALLBACK_PROMETHEUS_VERSION = "2.48.1" +DOWNLOAD_BLOCK_SIZE = 8192 # 8 KB +TEST_MODE_ENV_VAR = "RAY_PROMETHEUS_DOWNLOAD_TEST_MODE" + + +def get_system_info(): + os_type = platform.system().lower() + architecture = platform.machine() + if architecture == "x86_64": + # In the Prometheus filename, it's called amd64 + architecture = "amd64" + elif architecture == "aarch64": + # In the Prometheus filename, it's called arm64 + architecture = "arm64" + return os_type, architecture + + +def download_file(url, filename): + logging.info(f"Downloading {url} to {Path(filename).absolute()}...") + try: + test_mode = os.environ.get(TEST_MODE_ENV_VAR, False) + request_method = requests.head if test_mode else requests.get + response = request_method(url, stream=True) + response.raise_for_status() + + total_size_in_bytes = int(response.headers.get("content-length", 0)) + total_size_in_mb = total_size_in_bytes / (1024 * 1024) + + downloaded_size_in_mb = 0 + block_size = DOWNLOAD_BLOCK_SIZE + + with open(filename, "wb") as file: + for chunk in response.iter_content(chunk_size=block_size): + file.write(chunk) + downloaded_size_in_mb += len(chunk) / (1024 * 1024) + print( + f"Downloaded: {downloaded_size_in_mb:.2f} MB / " + f"{total_size_in_mb:.2f} MB", + end="\r", + ) + + print("\nDownload completed.") + return True + + except requests.RequestException as e: + logging.error(f"Error downloading file: {e}") + return False + + +def install_prometheus(file_path): + try: + with tarfile.open(file_path) as tar: + tar.extractall() + logging.info("Prometheus installed successfully.") + return True + except Exception as e: + logging.error(f"Error installing Prometheus: {e}") + return False + + +def start_prometheus(prometheus_dir): + + # The function assumes the Ray cluster to be monitored by Prometheus uses the + # 
default configuration with "/tmp/ray" as the default root temporary directory. + # + # This is to support the `ray metrics launch-prometheus` command, when a Ray cluster + # hasn't started yet and the user doesn't have a way to get a `--temp-dir` + # anywhere. So we choose to use a hardcoded default value. + + config_file = Path(PROMETHEUS_CONFIG_INPUT_PATH) + + if not config_file.exists(): + raise FileNotFoundError(f"Prometheus config file not found: {config_file}") + + prometheus_cmd = [ + f"{prometheus_dir}/prometheus", + "--config.file", + str(config_file), + "--web.enable-lifecycle", + ] + try: + process = subprocess.Popen(prometheus_cmd) + logging.info("Prometheus has started.") + return process + except Exception as e: + logging.error(f"Failed to start Prometheus: {e}") + return None + + +def print_shutdown_message(process_id): + message = ( + f"Prometheus is running with PID {process_id}.\n" + "To stop Prometheus, use the command: " + "`ray metrics shutdown-prometheus`, " + f"'kill {process_id}', or if you need to force stop, " + f"use 'kill -9 {process_id}'." + ) + print(message) + + debug_message = ( + "To list all processes running Prometheus, use the command: " + "'ps aux | grep prometheus'." + ) + print(debug_message) + + +def get_latest_prometheus_version(): + url = "https://api.github.com/repos/prometheus/prometheus/releases/latest" + try: + response = requests.get(url) + response.raise_for_status() + data = response.json() + # Remove the leading 'v' from the version number + return data["tag_name"].lstrip("v") + except requests.RequestException as e: + logging.error(f"Error fetching latest Prometheus version: {e}") + return None + + +def get_prometheus_filename(os_type=None, architecture=None, prometheus_version=None): + if os_type is None or architecture is None: + os_type, architecture = get_system_info() + + if prometheus_version is None: + prometheus_version = get_latest_prometheus_version() + if prometheus_version is None: + logging.warning( + "Failed to retrieve the latest Prometheus version. Falling " + f"back to {FALLBACK_PROMETHEUS_VERSION}." 
+            )
+            # Fall back to a hardcoded version
+            prometheus_version = FALLBACK_PROMETHEUS_VERSION
+
+    return (
+        f"prometheus-{prometheus_version}.{os_type}-{architecture}.tar.gz",
+        prometheus_version,
+    )
+
+
+def get_prometheus_download_url(
+    os_type=None, architecture=None, prometheus_version=None
+):
+    file_name, prometheus_version = get_prometheus_filename(
+        os_type, architecture, prometheus_version
+    )
+    return (
+        "https://github.com/prometheus/prometheus/releases/"
+        f"download/v{prometheus_version}/{file_name}"
+    )
+
+
+def download_prometheus(os_type=None, architecture=None, prometheus_version=None):
+    file_name, _ = get_prometheus_filename(os_type, architecture, prometheus_version)
+    download_url = get_prometheus_download_url(
+        os_type, architecture, prometheus_version
+    )
+
+    return download_file(download_url, file_name), file_name
+
+
+def main():
+    logging.warning("This script is not intended for production use.")
+
+    downloaded, file_name = download_prometheus()
+    if not downloaded:
+        logging.error("Failed to download Prometheus.")
+        sys.exit(1)
+
+    # TODO: Verify the checksum of the downloaded file
+
+    # TODO: Add a check to see if Prometheus is already running
+
+    if not install_prometheus(file_name):
+        logging.error("Installation failed.")
+        sys.exit(1)
+
+    assert file_name.endswith(".tar.gz")
+    process = start_prometheus(
+        # Strip the .tar.gz suffix. (str.rstrip treats its argument as a
+        # character set and could eat trailing characters of the directory
+        # name, so slice the suffix off instead.)
+        prometheus_dir=file_name[: -len(".tar.gz")]
+    )
+    if process:
+        print_shutdown_message(process.pid)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/metrics_head.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/metrics_head.py
new file mode 100644
index 0000000000000000000000000000000000000000..58a9b31a9cc40c963b00e582f5d309b5debe3f57
--- /dev/null
+++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/metrics_head.py
@@ -0,0 +1,451 @@
+import asyncio
+import json
+import logging
+import os
+import shutil
+from typing import Optional
+from urllib.parse import quote
+
+import aiohttp
+
+import ray
+import ray.dashboard.optional_utils as dashboard_optional_utils
+import ray.dashboard.utils as dashboard_utils
+from ray._private.async_utils import enable_monitor_loop_lag
+from ray._private.ray_constants import (
+    PROMETHEUS_SERVICE_DISCOVERY_FILE,
+    SESSION_LATEST,
+    env_integer,
+)
+from ray._private.utils import get_or_create_event_loop
+from ray.dashboard.consts import AVAILABLE_COMPONENT_NAMES_FOR_METRICS
+from ray.dashboard.modules.metrics.grafana_dashboard_factory import (
+    generate_data_grafana_dashboard,
+    generate_default_grafana_dashboard,
+    generate_serve_deployment_grafana_dashboard,
+    generate_serve_grafana_dashboard,
+)
+from ray.dashboard.modules.metrics.templates import (
+    DASHBOARD_PROVISIONING_TEMPLATE,
+    GRAFANA_DATASOURCE_TEMPLATE,
+    GRAFANA_INI_TEMPLATE,
+    PROMETHEUS_YML_TEMPLATE,
+)
+
+import psutil
+
+logger = logging.getLogger(__name__)
+logger.setLevel(logging.INFO)
+
+routes = dashboard_optional_utils.DashboardHeadRouteTable
+
+METRICS_OUTPUT_ROOT_ENV_VAR = "RAY_METRICS_OUTPUT_ROOT"
+METRICS_RECORD_INTERVAL_S = env_integer("METRICS_RECORD_INTERVAL_S", 5)
+
+DEFAULT_PROMETHEUS_HOST = "http://localhost:9090"
+PROMETHEUS_HOST_ENV_VAR = "RAY_PROMETHEUS_HOST"
+DEFAULT_PROMETHEUS_HEADERS = "{}"
+PROMETHEUS_HEADERS_ENV_VAR = "RAY_PROMETHEUS_HEADERS"
+DEFAULT_PROMETHEUS_NAME = "Prometheus"
+PROMETHEUS_NAME_ENV_VAR = "RAY_PROMETHEUS_NAME"
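+
+# As an illustration only (values are hypothetical): a deployment pointing the
+# dashboard at a secured, external Prometheus might start Ray with
+#   RAY_PROMETHEUS_HOST=http://prometheus.internal:9090
+#   RAY_PROMETHEUS_HEADERS='{"Authorization": "Bearer <token>"}'
+#   RAY_PROMETHEUS_NAME=Prometheus
+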
+PROMETHEUS_HEALTHCHECK_PATH = "-/healthy" + +DEFAULT_GRAFANA_HOST = "http://localhost:3000" +GRAFANA_HOST_ENV_VAR = "RAY_GRAFANA_HOST" +GRAFANA_HOST_DISABLED_VALUE = "DISABLED" +GRAFANA_IFRAME_HOST_ENV_VAR = "RAY_GRAFANA_IFRAME_HOST" +GRAFANA_DASHBOARD_OUTPUT_DIR_ENV_VAR = "RAY_METRICS_GRAFANA_DASHBOARD_OUTPUT_DIR" +GRAFANA_HEALTHCHECK_PATH = "api/health" + + +# parse_prom_headers will make sure the input is in one of the following formats: +# 1. {"H1": "V1", "H2": "V2"} +# 2. [["H1", "V1"], ["H2", "V2"], ["H2", "V3"]] +def parse_prom_headers(prometheus_headers): + parsed = json.loads(prometheus_headers) + if isinstance(parsed, dict): + if all(isinstance(k, str) and isinstance(v, str) for k, v in parsed.items()): + return parsed + if isinstance(parsed, list): + if all(len(e) == 2 and all(isinstance(v, str) for v in e) for e in parsed): + return parsed + raise ValueError( + f"{PROMETHEUS_HEADERS_ENV_VAR} should be a JSON string in one of the formats:\n" + + "1) An object with string keys and string values.\n" + + "2) an array of string arrays with 2 string elements each.\n" + + 'For example, {"H1": "V1", "H2": "V2"} and\n' + + '[["H1", "V1"], ["H2", "V2"], ["H2", "V3"]] are valid.' + ) + + +class PrometheusQueryError(Exception): + def __init__(self, status, message): + self.message = ( + "Error fetching data from prometheus. " + f"status: {status}, message: {message}" + ) + super().__init__(self.message) + + +class MetricsHead(dashboard_utils.DashboardHeadModule): + def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig): + super().__init__(config) + self.grafana_host = os.environ.get(GRAFANA_HOST_ENV_VAR, DEFAULT_GRAFANA_HOST) + self.prometheus_host = os.environ.get( + PROMETHEUS_HOST_ENV_VAR, DEFAULT_PROMETHEUS_HOST + ) + default_metrics_root = os.path.join(self.session_dir, "metrics") + self.prometheus_headers = parse_prom_headers( + os.environ.get( + PROMETHEUS_HEADERS_ENV_VAR, + DEFAULT_PROMETHEUS_HEADERS, + ) + ) + session_latest_metrics_root = os.path.join( + self.temp_dir, SESSION_LATEST, "metrics" + ) + self._metrics_root = os.environ.get( + METRICS_OUTPUT_ROOT_ENV_VAR, default_metrics_root + ) + self._metrics_root_session_latest = os.environ.get( + METRICS_OUTPUT_ROOT_ENV_VAR, session_latest_metrics_root + ) + self._grafana_config_output_path = os.path.join(self._metrics_root, "grafana") + self._grafana_session_latest_config_output_path = os.path.join( + self._metrics_root_session_latest, "grafana" + ) + self._grafana_dashboard_output_dir = os.environ.get( + GRAFANA_DASHBOARD_OUTPUT_DIR_ENV_VAR, + os.path.join(self._grafana_config_output_path, "dashboards"), + ) + + self._prometheus_name = os.environ.get( + PROMETHEUS_NAME_ENV_VAR, DEFAULT_PROMETHEUS_NAME + ) + + # To be set later when dashboards gets generated + self._dashboard_uids = {} + + self._pid = os.getpid() + self._component = "dashboard" + assert self._component in AVAILABLE_COMPONENT_NAMES_FOR_METRICS + self._dashboard_proc = psutil.Process() + + self._event_loop_lag_s_max: Optional[float] = None + + @routes.get("/api/grafana_health") + async def grafana_health(self, req) -> aiohttp.web.Response: + """ + Endpoint that checks if Grafana is running + """ + # If disabled, we don't want to show the metrics tab at all. 
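+        # (e.g. starting Ray with RAY_GRAFANA_HOST=DISABLED hides the tab;
+        # any other value is treated as the Grafana base URL.)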
+        if self.grafana_host == GRAFANA_HOST_DISABLED_VALUE:
+            return dashboard_optional_utils.rest_response(
+                success=True,
+                message="Grafana disabled",
+                grafana_host=GRAFANA_HOST_DISABLED_VALUE,
+            )
+
+        grafana_iframe_host = os.environ.get(
+            GRAFANA_IFRAME_HOST_ENV_VAR, self.grafana_host
+        )
+        path = f"{self.grafana_host}/{GRAFANA_HEALTHCHECK_PATH}"
+        try:
+            async with self.http_session.get(path) as resp:
+                if resp.status != 200:
+                    return dashboard_optional_utils.rest_response(
+                        success=False,
+                        message="Grafana healthcheck failed",
+                        status=resp.status,
+                    )
+                json = await resp.json()
+                # Check if the required Grafana services are running.
+                if json["database"] != "ok":
+                    return dashboard_optional_utils.rest_response(
+                        success=False,
+                        message="Grafana healthcheck failed. Database not ok.",
+                        status=resp.status,
+                        json=json,
+                    )
+
+                return dashboard_optional_utils.rest_response(
+                    success=True,
+                    message="Grafana running",
+                    grafana_host=grafana_iframe_host,
+                    session_name=self.session_name,
+                    dashboard_uids=self._dashboard_uids,
+                    dashboard_datasource=self._prometheus_name,
+                )
+
+        except Exception as e:
+            logger.debug(
+                "Error fetching grafana endpoint. Is grafana running?", exc_info=e
+            )
+
+            return dashboard_optional_utils.rest_response(
+                success=False, message="Grafana healthcheck failed", exception=str(e)
+            )
+
+    @routes.get("/api/prometheus_health")
+    async def prometheus_health(self, req):
+        try:
+            path = f"{self.prometheus_host}/{PROMETHEUS_HEALTHCHECK_PATH}"
+
+            async with self.http_session.get(
+                path, headers=self.prometheus_headers
+            ) as resp:
+                if resp.status != 200:
+                    return dashboard_optional_utils.rest_response(
+                        success=False,
+                        message="prometheus healthcheck failed.",
+                        status=resp.status,
+                    )
+
+                return dashboard_optional_utils.rest_response(
+                    success=True,
+                    message="prometheus running",
+                )
+        except Exception as e:
+            logger.debug(
+                "Error fetching prometheus endpoint. Is prometheus running?", exc_info=e
+            )
+            return dashboard_optional_utils.rest_response(
+                success=False, message="prometheus healthcheck failed.", reason=str(e)
+            )
+
+    @staticmethod
+    def is_minimal_module():
+        return False
+
+    def _create_default_grafana_configs(self):
+        """
+        Creates the Grafana configurations that are by default provided by Ray.
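+
+        Concretely, this writes grafana.ini, a dashboard provisioning file, a
+        Prometheus datasource definition, and the generated dashboard JSON
+        files (default, serve, serve deployment, data) under the session's
+        metrics directory.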
+ """ + # Create Grafana configuration folder + if os.path.exists(self._grafana_config_output_path): + shutil.rmtree(self._grafana_config_output_path) + os.makedirs(self._grafana_config_output_path, exist_ok=True) + + # Overwrite Grafana's configuration file + grafana_provisioning_folder = os.path.join( + self._grafana_config_output_path, "provisioning" + ) + grafana_prov_folder_with_latest_session = os.path.join( + self._grafana_session_latest_config_output_path, "provisioning" + ) + with open( + os.path.join( + self._grafana_config_output_path, + "grafana.ini", + ), + "w", + ) as f: + f.write( + GRAFANA_INI_TEMPLATE.format( + grafana_provisioning_folder=grafana_prov_folder_with_latest_session + ) + ) + + # Overwrite Grafana's dashboard provisioning directory based on env var + dashboard_provisioning_path = os.path.join( + grafana_provisioning_folder, "dashboards" + ) + os.makedirs( + dashboard_provisioning_path, + exist_ok=True, + ) + with open( + os.path.join( + dashboard_provisioning_path, + "default.yml", + ), + "w", + ) as f: + f.write( + DASHBOARD_PROVISIONING_TEMPLATE.format( + dashboard_output_folder=self._grafana_dashboard_output_dir + ) + ) + + # Overwrite Grafana's Prometheus datasource based on env var + prometheus_host = os.environ.get( + PROMETHEUS_HOST_ENV_VAR, DEFAULT_PROMETHEUS_HOST + ) + prometheus_headers = parse_prom_headers( + os.environ.get(PROMETHEUS_HEADERS_ENV_VAR, DEFAULT_PROMETHEUS_HEADERS) + ) + # parse_prom_headers will make sure the prometheus_headers is either format of: + # 1. {"H1": "V1", "H2": "V2"} or + # 2. [["H1", "V1"], ["H2", "V2"], ["H2", "V3"]] + prometheus_header_pairs = [] + if isinstance(prometheus_headers, list): + prometheus_header_pairs = prometheus_headers + elif isinstance(prometheus_headers, dict): + prometheus_header_pairs = [(k, v) for k, v in prometheus_headers.items()] + + data_sources_path = os.path.join(grafana_provisioning_folder, "datasources") + os.makedirs( + data_sources_path, + exist_ok=True, + ) + os.makedirs( + self._grafana_dashboard_output_dir, + exist_ok=True, + ) + with open( + os.path.join( + data_sources_path, + "default.yml", + ), + "w", + ) as f: + f.write( + GRAFANA_DATASOURCE_TEMPLATE( + prometheus_host=prometheus_host, + prometheus_name=self._prometheus_name, + jsonData={ + f"httpHeaderName{i+1}": header + for i, (header, _) in enumerate(prometheus_header_pairs) + }, + secureJsonData={ + f"httpHeaderValue{i+1}": value + for i, (_, value) in enumerate(prometheus_header_pairs) + }, + ) + ) + with open( + os.path.join( + self._grafana_dashboard_output_dir, + "default_grafana_dashboard.json", + ), + "w", + ) as f: + ( + content, + self._dashboard_uids["default"], + ) = generate_default_grafana_dashboard() + f.write(content) + with open( + os.path.join( + self._grafana_dashboard_output_dir, + "serve_grafana_dashboard.json", + ), + "w", + ) as f: + content, self._dashboard_uids["serve"] = generate_serve_grafana_dashboard() + f.write(content) + with open( + os.path.join( + self._grafana_dashboard_output_dir, + "serve_deployment_grafana_dashboard.json", + ), + "w", + ) as f: + ( + content, + self._dashboard_uids["serve_deployment"], + ) = generate_serve_deployment_grafana_dashboard() + f.write(content) + with open( + os.path.join( + self._grafana_dashboard_output_dir, + "data_grafana_dashboard.json", + ), + "w", + ) as f: + ( + content, + self._dashboard_uids["data"], + ) = generate_data_grafana_dashboard() + f.write(content) + + def _create_default_prometheus_configs(self): + """ + Creates the Prometheus configurations 
that are by default provided by Ray. + """ + prometheus_config_output_path = os.path.join( + self._metrics_root, "prometheus", "prometheus.yml" + ) + + # Generate the default Prometheus configurations + if os.path.exists(prometheus_config_output_path): + os.remove(prometheus_config_output_path) + os.makedirs(os.path.dirname(prometheus_config_output_path), exist_ok=True) + + # This code generates the Prometheus config based on the custom temporary root + # path set by the user at Ray cluster start up (via --temp-dir). In contrast, + # start_prometheus in install_and_start_prometheus.py uses a hardcoded + # Prometheus config at PROMETHEUS_CONFIG_INPUT_PATH that always uses "/tmp/ray". + # Other than the root path, the config file generated here is identical to that + # hardcoded config file. + prom_discovery_file_path = os.path.join( + self.temp_dir, PROMETHEUS_SERVICE_DISCOVERY_FILE + ) + with open(prometheus_config_output_path, "w") as f: + f.write( + PROMETHEUS_YML_TEMPLATE.format( + prom_metrics_service_discovery_file_path=prom_discovery_file_path + ) + ) + + @dashboard_utils.async_loop_forever(METRICS_RECORD_INTERVAL_S) + async def record_dashboard_metrics(self): + labels = { + "ip": self.ip, + "pid": self._pid, + "Version": ray.__version__, + "Component": self._component, + "SessionName": self.session_name, + } + self.metrics.metrics_dashboard_cpu.labels(**labels).set( + float(self._dashboard_proc.cpu_percent()) + ) + self.metrics.metrics_dashboard_mem_uss.labels(**labels).set( + float(self._dashboard_proc.memory_full_info().uss) / 1.0e6 + ) + self.metrics.metrics_dashboard_mem_rss.labels(**labels).set( + float(self._dashboard_proc.memory_full_info().rss) / 1.0e6 + ) + + loop = get_or_create_event_loop() + + self.metrics.metrics_event_loop_tasks.labels(**labels).set( + len(asyncio.all_tasks(loop)) + ) + + # Report the max lag since the last export, if any. + if self._event_loop_lag_s_max is not None: + self.metrics.metrics_event_loop_lag.labels(**labels).set( + float(self._event_loop_lag_s_max) + ) + self._event_loop_lag_s_max = None + + async def run(self, server): + self._create_default_grafana_configs() + self._create_default_prometheus_configs() + + def on_new_lag(lag_s): + # Record the lag. It's exported in `record_dashboard_metrics` + self._event_loop_lag_s_max = max(self._event_loop_lag_s_max or 0, lag_s) + + enable_monitor_loop_lag(on_new_lag) + + logger.info( + f"Generated prometheus and grafana configurations in: {self._metrics_root}" + ) + + await asyncio.gather(self.record_dashboard_metrics()) + + async def _query_prometheus(self, query): + async with self.http_session.get( + f"{self.prometheus_host}/api/v1/query?query={quote(query)}", + headers=self.prometheus_headers, + ) as resp: + if resp.status == 200: + prom_data = await resp.json() + return prom_data + + message = await resp.text() + raise PrometheusQueryError(resp.status, message) diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/templates.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/templates.py new file mode 100644 index 0000000000000000000000000000000000000000..c569c2513a0f8fb5c423bfb7a8b9b12d3bf379d1 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/metrics/templates.py @@ -0,0 +1,63 @@ +import yaml + +GRAFANA_INI_TEMPLATE = """ +[security] +allow_embedding = true + +[auth.anonymous] +enabled = true +org_name = Main Org. 
+org_role = Viewer + +[paths] +provisioning = {grafana_provisioning_folder} +""" + +DASHBOARD_PROVISIONING_TEMPLATE = """ +apiVersion: 1 + +providers: + - name: Ray # Default dashboards provided by OSS Ray + folder: Ray + type: file + options: + path: {dashboard_output_folder} +""" + + +def GRAFANA_DATASOURCE_TEMPLATE( + prometheus_name, prometheus_host, jsonData, secureJsonData +): + return yaml.safe_dump( + { + "apiVersion": 1, + "datasources": [ + { + "name": prometheus_name, + "url": prometheus_host, + "type": "prometheus", + "isDefault": True, + "access": "proxy", + "jsonData": jsonData, + "secureJsonData": secureJsonData, + } + ], + } + ) + + +PROMETHEUS_YML_TEMPLATE = """# my global config +global: + scrape_interval: 10s # Set the scrape interval to every 10 seconds. Default is every \ +1 minute. + evaluation_interval: 10s # Evaluate rules every 10 seconds. The default is every 1 \ +minute. + # scrape_timeout is set to the global default (10s). + +scrape_configs: +# Scrape from each Ray node as defined in the service_discovery.json provided by Ray. +- job_name: 'ray' + file_sd_configs: + - files: + - '{prom_metrics_service_discovery_file_path}' +""" diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__init__.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ce0817690d8d259deafb4d4475db559fd0923a26 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__pycache__/profile_manager.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__pycache__/profile_manager.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0d2961b5195254115f12ccc20643f2723b0727c1 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__pycache__/profile_manager.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__pycache__/reporter_agent.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__pycache__/reporter_agent.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2a4fdd331c62371bc625eddf9f2684e9e3a9b358 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__pycache__/reporter_agent.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__pycache__/reporter_consts.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__pycache__/reporter_consts.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1e3e257dda35d0dcfb5f5214eb7eb1c2f56cee01 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__pycache__/reporter_consts.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__pycache__/reporter_head.cpython-311.pyc 
b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__pycache__/reporter_head.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2158f9e96b82ff586a4c471e9224fb0d1a331f57 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/__pycache__/reporter_head.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/profile_manager.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/profile_manager.py new file mode 100644 index 0000000000000000000000000000000000000000..5bf8adcaef08a90819b5862034725dacfac48383 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/profile_manager.py @@ -0,0 +1,373 @@ +import asyncio +import logging +import os +import shutil +import subprocess +import sys +from datetime import datetime +from pathlib import Path +from typing import Union + +logger = logging.getLogger(__name__) + +DARWIN_SET_CHOWN_CMD = "sudo chown root: `which {profiler}`" +LINUX_SET_CHOWN_CMD = "sudo chown root:root `which {profiler}`" + +PROFILER_PERMISSIONS_ERROR_MESSAGE = """ +Note that this command requires `{profiler}` to be installed with root permissions. You +can install `{profiler}` and give it root permissions as follows: + $ pip install {profiler} + $ {set_chown_command} + $ sudo chmod u+s `which {profiler}` + +Alternatively, you can start Ray with passwordless sudo / root permissions. + +""" + + +def decode(string: Union[str, bytes]): + if isinstance(string, bytes): + return string.decode("utf-8") + return string + + +def _format_failed_profiler_command(cmd, profiler, stdout, stderr) -> str: + stderr_str = decode(stderr) + extra_message = "" + + # If some sort of permission error returned, show a message about how + # to set up permissions correctly. + if "permission" in stderr_str.lower(): + set_chown_command = ( + DARWIN_SET_CHOWN_CMD.format(profiler=profiler) + if sys.platform == "darwin" + else LINUX_SET_CHOWN_CMD.format(profiler=profiler) + ) + extra_message = PROFILER_PERMISSIONS_ERROR_MESSAGE.format( + profiler=profiler, set_chown_command=set_chown_command + ) + + return f"""Failed to execute `{cmd}`. +{extra_message} +=== stderr === +{decode(stderr)} + +=== stdout === +{decode(stdout)} +""" + + +# If we can sudo, always try that. Otherwise, py-spy will only work if the user has +# root privileges or has configured setuid on the py-spy script. +async def _can_passwordless_sudo() -> bool: + try: + process = await asyncio.create_subprocess_exec( + "sudo", + "-n", + "true", + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + except FileNotFoundError: + return False + else: + _, _ = await process.communicate() + return process.returncode == 0 + + +class CpuProfilingManager: + def __init__(self, profile_dir_path: str): + self.profile_dir_path = Path(profile_dir_path) + self.profile_dir_path.mkdir(exist_ok=True) + self.profiler_name = "py-spy" + + async def trace_dump(self, pid: int, native: bool = False) -> (bool, str): + """ + Capture and dump a trace for a specified process. + + Args: + pid: The process ID (PID) of the target process for trace capture. + native (bool, optional): If True, includes native (C/C++) stack frames. + Default is False. + + Returns: + Tuple[bool, str]: A tuple containing a boolean indicating the success + of the trace capture operation and a string with the + trace data or an error message. 
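+
+        Example (illustrative only; assumes py-spy is installed and a worker
+        with PID 12345 exists):
+
+            manager = CpuProfilingManager("/tmp/ray/session_latest/logs")
+            success, output = await manager.trace_dump(12345)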
+ """ + pyspy = shutil.which(self.profiler_name) + if pyspy is None: + return False, "Failed to execute: py-spy is not installed" + + cmd = [pyspy, "dump", "-p", str(pid)] + # We + if sys.platform == "linux" and native: + cmd.append("--native") + if await _can_passwordless_sudo(): + cmd = ["sudo", "-n"] + cmd + process = await asyncio.create_subprocess_exec( + *cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = await process.communicate() + if process.returncode != 0: + return False, _format_failed_profiler_command( + cmd, self.profiler_name, stdout, stderr + ) + else: + return True, decode(stdout) + + async def cpu_profile( + self, pid: int, format="flamegraph", duration: float = 5, native: bool = False + ) -> (bool, str): + """ + Perform CPU profiling on a specified process. + + Args: + pid: The process ID (PID) of the target process to be profiled. + format (str, optional): The format of the CPU profile output. + Default is "flamegraph". + duration (float, optional): The duration of the profiling + session in seconds. Default is 5 seconds. + native (bool, optional): If True, includes native (C/C++) stack frames. + Default is False. + + Returns: + Tuple[bool, str]: A tuple containing a boolean indicating the success + of the profiling operation and a string with the + profile data or an error message. + """ + pyspy = shutil.which(self.profiler_name) + if pyspy is None: + return False, "Failed to execute: py-spy is not installed" + + if format not in ("flamegraph", "raw", "speedscope"): + return ( + False, + f"Failed to execute: Invalid format {format}, " + + "must be [flamegraph, raw, speedscope]", + ) + + if format == "flamegraph": + extension = "svg" + else: + extension = "txt" + profile_file_path = ( + self.profile_dir_path / f"{format}_{pid}_cpu_profiling.{extension}" + ) + cmd = [ + pyspy, + "record", + "-o", + profile_file_path, + "-p", + str(pid), + "-d", + str(duration), + "-f", + format, + ] + if sys.platform == "linux" and native: + cmd.append("--native") + if await _can_passwordless_sudo(): + cmd = ["sudo", "-n"] + cmd + process = await asyncio.create_subprocess_exec( + *cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = await process.communicate() + if process.returncode != 0: + return False, _format_failed_profiler_command( + cmd, self.profiler_name, stdout, stderr + ) + else: + return True, open(profile_file_path, "rb").read() + + +class MemoryProfilingManager: + def __init__(self, profile_dir_path: str): + self.profile_dir_path = Path(profile_dir_path) / "memray" + self.profile_dir_path.mkdir(exist_ok=True) + self.profiler_name = "memray" + + async def get_profile_result( + self, + pid: int, + profiler_filename: str, + format: str = "flamegraph", + leaks: bool = False, + ) -> (bool, str): + """ + Convert the Memray profile result to specified format. + + Args: + pid: The process ID (PID) associated with the profiling operation. + profiler_filename: The filename of the profiler output to + be processed. + format (str, optional): The format of the profile result. + Default is "flamegraph". + leaks (bool, optional): If True, include memory leak information in + the profile result. + + Returns: + Tuple[bool, str]: A tuple containing a boolean indicating the success + of the operation and a string with the processed profile result + or an error message. 
+ """ + memray = shutil.which(self.profiler_name) + if memray is None: + return False, "Failed to execute: memray is not installed" + + profile_file_path = self.profile_dir_path / profiler_filename + if not Path(profile_file_path).is_file(): + return False, f"Failed to execute: process {pid} has not been profiled" + + profiler_name, _ = os.path.splitext(profiler_filename) + profile_visualize_path = self.profile_dir_path / f"{profiler_name}.html" + if format == "flamegraph": + visualize_cmd = [ + memray, + "flamegraph", + "-o", + profile_visualize_path, + "-f", + ] + elif format == "table": + visualize_cmd = [ + memray, + "table", + "-o", + profile_visualize_path, + "-f", + ] + else: + return ( + False, + f"Failed to execute: Report with format: {format} is not supported", + ) + + if leaks: + visualize_cmd.append("--leaks") + visualize_cmd.append(profile_file_path) + + process = await asyncio.create_subprocess_exec( + *visualize_cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = await process.communicate() + if process.returncode != 0: + return False, _format_failed_profiler_command( + visualize_cmd, self.profiler_name, stdout, stderr + ) + + return True, open(profile_visualize_path, "rb").read() + + async def attach_profiler( + self, + pid: int, + native: bool = False, + trace_python_allocators: bool = False, + verbose: bool = False, + ) -> (bool, str): + """ + Attach a Memray profiler to a specified process. + + Args: + pid: The process ID (PID) of the target process which + the profiler attached to. + native (bool, optional): If True, includes native (C/C++) stack frames. + Default is False. + trace_python_allocators (bool, optional): If True, includes Python + stack frames. Default is False. + verbose (bool, optional): If True, enables verbose output. + Default is False. + + Returns: + Tuple[bool, str]: A tuple containing a boolean indicating the success + of the operation and a string of a sucess message or an error message. + """ + memray = shutil.which(self.profiler_name) + if memray is None: + return False, None, "Failed to execute: memray is not installed" + + timestamp = datetime.now().strftime("%Y%m%d%H%M%S") + profiler_filename = f"{pid}_memory_profiling_{timestamp}.bin" + profile_file_path = self.profile_dir_path / profiler_filename + cmd = [memray, "attach", str(pid), "-o", profile_file_path] + + if native: + cmd.append("--native") + if trace_python_allocators: + cmd.append("--trace-python-allocators") + if verbose: + cmd.append("--verbose") + if await _can_passwordless_sudo(): + cmd = ["sudo", "-n"] + cmd + + process = await asyncio.create_subprocess_exec( + *cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + + stdout, stderr = await process.communicate() + if process.returncode != 0: + return ( + False, + None, + _format_failed_profiler_command( + cmd, self.profiler_name, stdout, stderr + ), + ) + else: + return ( + True, + profiler_filename, + f"Success attaching memray to process {pid}", + ) + + async def detach_profiler( + self, + pid: int, + verbose: bool = False, + ) -> (bool, str): + """ + Detach a profiler from a specified process. + + Args: + pid: The process ID (PID) of the target process the + profiler detached from. + verbose (bool, optional): If True, enables verbose output. + Default is False. + + Returns: + Tuple[bool, str]: A tuple containing a boolean indicating the success + of the operation and a string of a success message or an error message. 
+ """ + memray = shutil.which(self.profiler_name) + if memray is None: + return False, "Failed to execute: memray is not installed" + + cmd = [memray, "detach"] + if verbose: + cmd.append("--verbose") + cmd.append(str(pid)) + + if await _can_passwordless_sudo(): + cmd = ["sudo", "-n"] + cmd + + process = await asyncio.create_subprocess_exec( + *cmd, + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, + ) + stdout, stderr = await process.communicate() + if process.returncode != 0: + return False, _format_failed_profiler_command( + cmd, self.profiler_name, stdout, stderr + ) + else: + return True, f"Success detaching memray from process {pid}" diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/reporter_agent.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/reporter_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..d58ed27fce40581e51a74024fccde28433a101e5 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/reporter_agent.py @@ -0,0 +1,1307 @@ +import asyncio +import datetime +import json +import logging +import os +import socket +import sys +import traceback +from collections import defaultdict +from concurrent.futures import ThreadPoolExecutor +from typing import List, Optional, Tuple, TypedDict, Union + +from opencensus.stats import stats as stats_module +from prometheus_client.core import REGISTRY + +import ray +import ray._private.prometheus_exporter as prometheus_exporter +import ray._private.services +import ray._private.utils +import ray.dashboard.modules.reporter.reporter_consts as reporter_consts +import ray.dashboard.utils as dashboard_utils +from ray._private import utils +from ray._private.metrics_agent import Gauge, MetricsAgent, Record +from ray._private.ray_constants import DEBUG_AUTOSCALING_STATUS, env_integer +from ray._raylet import WorkerID +from ray.core.generated import reporter_pb2, reporter_pb2_grpc +from ray.dashboard import k8s_utils +from ray.dashboard.consts import ( + CLUSTER_TAG_KEYS, + COMPONENT_METRICS_TAG_KEYS, + GCS_RPC_TIMEOUT_SECONDS, + GPU_TAG_KEYS, + NODE_TAG_KEYS, +) +from ray.dashboard.modules.reporter.profile_manager import ( + CpuProfilingManager, + MemoryProfilingManager, +) + +import psutil + +logger = logging.getLogger(__name__) + +enable_gpu_usage_check = True + +# Are we in a K8s pod? +IN_KUBERNETES_POD = "KUBERNETES_SERVICE_HOST" in os.environ +# Flag to enable showing disk usage when running in a K8s pod, +# disk usage defined as the result of running psutil.disk_usage("/") +# in the Ray container. +ENABLE_K8S_DISK_USAGE = os.environ.get("RAY_DASHBOARD_ENABLE_K8S_DISK_USAGE") == "1" +# Try to determine if we're in a container. +IN_CONTAINER = os.path.exists("/sys/fs/cgroup") +# Using existence of /sys/fs/cgroup as the criterion is consistent with +# Ray's existing resource logic, see e.g. ray._private.utils.get_num_cpus(). 
+ +# NOTE: Executor in this head is intentionally constrained to just 1 thread by +# default to limit its concurrency, therefore reducing potential for +# GIL contention +RAY_DASHBOARD_REPORTER_AGENT_TPE_MAX_WORKERS = env_integer( + "RAY_DASHBOARD_REPORTER_AGENT_TPE_MAX_WORKERS", 1 +) + + +def recursive_asdict(o): + if isinstance(o, tuple) and hasattr(o, "_asdict"): + return recursive_asdict(o._asdict()) + + if isinstance(o, (tuple, list)): + L = [] + for k in o: + L.append(recursive_asdict(k)) + return L + + if isinstance(o, dict): + D = {k: recursive_asdict(v) for k, v in o.items()} + return D + + return o + + +def jsonify_asdict(o) -> str: + return json.dumps(dashboard_utils.to_google_style(recursive_asdict(o))) + + +# A list of gauges to record and export metrics. +METRICS_GAUGES = { + # CPU metrics + "node_cpu_utilization": Gauge( + "node_cpu_utilization", + "Total CPU usage on a ray node", + "percentage", + NODE_TAG_KEYS, + ), + "node_cpu_count": Gauge( + "node_cpu_count", + "Total CPUs available on a ray node", + "cores", + NODE_TAG_KEYS, + ), + # Memory metrics + "node_mem_used": Gauge( + "node_mem_used", + "Memory usage on a ray node", + "bytes", + NODE_TAG_KEYS, + ), + "node_mem_available": Gauge( + "node_mem_available", + "Memory available on a ray node", + "bytes", + NODE_TAG_KEYS, + ), + "node_mem_total": Gauge( + "node_mem_total", + "Total memory on a ray node", + "bytes", + NODE_TAG_KEYS, + ), + "node_mem_shared_bytes": Gauge( + "node_mem_shared_bytes", + "Total shared memory usage on a ray node", + "bytes", + NODE_TAG_KEYS, + ), + # GPU metrics + "node_gpus_available": Gauge( + "node_gpus_available", + "Total GPUs available on a ray node", + "percentage", + GPU_TAG_KEYS, + ), + "node_gpus_utilization": Gauge( + "node_gpus_utilization", + "Total GPUs usage on a ray node", + "percentage", + GPU_TAG_KEYS, + ), + "node_gram_used": Gauge( + "node_gram_used", + "Total GPU RAM usage on a ray node", + "bytes", + GPU_TAG_KEYS, + ), + "node_gram_available": Gauge( + "node_gram_available", + "Total GPU RAM available on a ray node", + "bytes", + GPU_TAG_KEYS, + ), + # Disk I/O metrics + "node_disk_io_read": Gauge( + "node_disk_io_read", + "Total read from disk", + "bytes", + NODE_TAG_KEYS, + ), + "node_disk_io_write": Gauge( + "node_disk_io_write", + "Total written to disk", + "bytes", + NODE_TAG_KEYS, + ), + "node_disk_io_read_count": Gauge( + "node_disk_io_read_count", + "Total read ops from disk", + "io", + NODE_TAG_KEYS, + ), + "node_disk_io_write_count": Gauge( + "node_disk_io_write_count", + "Total write ops to disk", + "io", + NODE_TAG_KEYS, + ), + "node_disk_io_read_speed": Gauge( + "node_disk_io_read_speed", + "Disk read speed", + "bytes/sec", + NODE_TAG_KEYS, + ), + "node_disk_io_write_speed": Gauge( + "node_disk_io_write_speed", + "Disk write speed", + "bytes/sec", + NODE_TAG_KEYS, + ), + "node_disk_read_iops": Gauge( + "node_disk_read_iops", + "Disk read iops", + "iops", + NODE_TAG_KEYS, + ), + "node_disk_write_iops": Gauge( + "node_disk_write_iops", + "Disk write iops", + "iops", + NODE_TAG_KEYS, + ), + # Disk usage metrics + "node_disk_usage": Gauge( + "node_disk_usage", + "Total disk usage (bytes) on a ray node", + "bytes", + NODE_TAG_KEYS, + ), + "node_disk_free": Gauge( + "node_disk_free", + "Total disk free (bytes) on a ray node", + "bytes", + NODE_TAG_KEYS, + ), + "node_disk_utilization_percentage": Gauge( + "node_disk_utilization_percentage", + "Total disk utilization (percentage) on a ray node", + "percentage", + NODE_TAG_KEYS, + ), + # Network metrics + 
"node_network_sent": Gauge( + "node_network_sent", + "Total network sent", + "bytes", + NODE_TAG_KEYS, + ), + "node_network_received": Gauge( + "node_network_received", + "Total network received", + "bytes", + NODE_TAG_KEYS, + ), + "node_network_send_speed": Gauge( + "node_network_send_speed", + "Network send speed", + "bytes/sec", + NODE_TAG_KEYS, + ), + "node_network_receive_speed": Gauge( + "node_network_receive_speed", + "Network receive speed", + "bytes/sec", + NODE_TAG_KEYS, + ), + # Component metrics + "component_cpu_percentage": Gauge( + "component_cpu_percentage", + "Total CPU usage of the components on a node.", + "percentage", + COMPONENT_METRICS_TAG_KEYS, + ), + "component_mem_shared_bytes": Gauge( + "component_mem_shared_bytes", + "SHM usage of all components of the node. " + "It is equivalent to the top command's SHR column.", + "bytes", + COMPONENT_METRICS_TAG_KEYS, + ), + "component_rss_mb": Gauge( + "component_rss_mb", + "RSS usage of all components on the node.", + "MB", + COMPONENT_METRICS_TAG_KEYS, + ), + "component_uss_mb": Gauge( + "component_uss_mb", + "USS usage of all components on the node.", + "MB", + COMPONENT_METRICS_TAG_KEYS, + ), + "component_num_fds": Gauge( + "component_num_fds", + "Number of open fds of all components on the node (Not available on Windows).", + "count", + COMPONENT_METRICS_TAG_KEYS, + ), + # Cluster metrics + "cluster_active_nodes": Gauge( + "cluster_active_nodes", + "Active nodes on the cluster", + "count", + CLUSTER_TAG_KEYS, + ), + "cluster_failed_nodes": Gauge( + "cluster_failed_nodes", + "Failed nodes on the cluster", + "count", + CLUSTER_TAG_KEYS, + ), + "cluster_pending_nodes": Gauge( + "cluster_pending_nodes", + "Pending nodes on the cluster", + "count", + CLUSTER_TAG_KEYS, + ), +} + +PSUTIL_PROCESS_ATTRS = ( + [ + "pid", + "create_time", + "cpu_percent", + "cpu_times", + "cmdline", + "memory_info", + "memory_full_info", + ] + + ["num_fds"] + if sys.platform != "win32" + else [] +) + +MB = 1024 * 1024 + +# Types +Percentage = int +Megabytes = int + + +# gpu utilization for nvidia gpu from a single process +class ProcessGPUInfo(TypedDict): + pid: int + gpu_memory_usage: Megabytes + + +# gpu utilization for nvidia gpu +class GpuUtilizationInfo(TypedDict): + index: int + name: str + uuid: str + utilization_gpu: Optional[Percentage] + memory_used: Megabytes + memory_total: Megabytes + processes_pids: Optional[List[ProcessGPUInfo]] + + +class ReporterAgent( + dashboard_utils.DashboardAgentModule, reporter_pb2_grpc.ReporterServiceServicer +): + """A monitor process for monitoring Ray nodes. + + Attributes: + dashboard_agent: The DashboardAgent object contains global config + """ + + def __init__(self, dashboard_agent): + """Initialize the reporter object.""" + super().__init__(dashboard_agent) + + if IN_KUBERNETES_POD or IN_CONTAINER: + # psutil does not give a meaningful logical cpu count when in a K8s pod, or + # in a container in general. + # Use ray._private.utils for this instead. + logical_cpu_count = ray._private.utils.get_num_cpus( + override_docker_cpu_warning=True + ) + # (Override the docker warning to avoid dashboard log spam.) + + # The dashboard expects a physical CPU count as well. + # This is not always meaningful in a container, but we will go ahead + # and give the dashboard what it wants using psutil. 
+ physical_cpu_count = psutil.cpu_count(logical=False) + else: + logical_cpu_count = psutil.cpu_count() + physical_cpu_count = psutil.cpu_count(logical=False) + self._cpu_counts = (logical_cpu_count, physical_cpu_count) + self._gcs_aio_client = dashboard_agent.gcs_aio_client + self._ip = dashboard_agent.ip + self._log_dir = dashboard_agent.log_dir + self._is_head_node = self._ip == dashboard_agent.gcs_address.split(":")[0] + self._hostname = socket.gethostname() + # (pid, created_time) -> psutil.Process + self._workers = {} + # psutil.Process of the parent. + self._raylet_proc = None + # psutil.Process of the current process. + self._agent_proc = None + # The last reported worker proc names (e.g., ray::*). + self._latest_worker_proc_names = set() + self._network_stats_hist = [(0, (0.0, 0.0))] # time, (sent, recv) + self._disk_io_stats_hist = [ + (0, (0.0, 0.0, 0, 0)) + ] # time, (bytes read, bytes written, read ops, write ops) + self._metrics_collection_disabled = dashboard_agent.metrics_collection_disabled + self._metrics_agent = None + self._session_name = dashboard_agent.session_name + if not self._metrics_collection_disabled: + try: + stats_exporter = prometheus_exporter.new_stats_exporter( + prometheus_exporter.Options( + namespace="ray", + port=dashboard_agent.metrics_export_port, + address="127.0.0.1" if self._ip == "127.0.0.1" else "", + ) + ) + except Exception: + # TODO(SongGuyang): Catch the exception here because there is + # port conflict issue which brought from static port. We should + # remove this after we find better port resolution. + logger.exception( + "Failed to start prometheus stats exporter. Agent will stay " + "alive but disable the stats." + ) + stats_exporter = None + + self._metrics_agent = MetricsAgent( + stats_module.stats.view_manager, + stats_module.stats.stats_recorder, + stats_exporter, + ) + if self._metrics_agent.proxy_exporter_collector: + # proxy_exporter_collector is None + # if Prometheus server is not started. 
+ REGISTRY.register(self._metrics_agent.proxy_exporter_collector) + self._key = ( + f"{reporter_consts.REPORTER_PREFIX}" f"{self._dashboard_agent.node_id}" + ) + + self._executor = ThreadPoolExecutor( + max_workers=RAY_DASHBOARD_REPORTER_AGENT_TPE_MAX_WORKERS, + thread_name_prefix="reporter_agent_executor", + ) + + async def GetTraceback(self, request, context): + pid = request.pid + native = request.native + p = CpuProfilingManager(self._log_dir) + success, output = await p.trace_dump(pid, native=native) + return reporter_pb2.GetTracebackReply(output=output, success=success) + + async def CpuProfiling(self, request, context): + pid = request.pid + duration = request.duration + format = request.format + native = request.native + p = CpuProfilingManager(self._log_dir) + success, output = await p.cpu_profile( + pid, format=format, duration=duration, native=native + ) + return reporter_pb2.CpuProfilingReply(output=output, success=success) + + async def MemoryProfiling(self, request, context): + pid = request.pid + format = request.format + leaks = request.leaks + duration = request.duration + native = request.native + trace_python_allocators = request.trace_python_allocators + p = MemoryProfilingManager(self._log_dir) + success, profiler_filename, output = await p.attach_profiler( + pid, native=native, trace_python_allocators=trace_python_allocators + ) + if not success: + return reporter_pb2.MemoryProfilingReply(output=output, success=success) + + # add 1 second sleep for memray overhead + await asyncio.sleep(duration + 1) + success, output = await p.detach_profiler(pid) + warning = None if success else output + success, output = await p.get_profile_result( + pid, profiler_filename=profiler_filename, format=format, leaks=leaks + ) + return reporter_pb2.MemoryProfilingReply( + output=output, success=success, warning=warning + ) + + async def ReportOCMetrics(self, request, context): + # Do nothing if metrics collection is disabled. + if self._metrics_collection_disabled: + return reporter_pb2.ReportOCMetricsReply() + + # This function receives a GRPC containing OpenCensus (OC) metrics + # from a Ray process, then exposes those metrics to Prometheus. + try: + worker_id = WorkerID(request.worker_id) + worker_id = None if worker_id.is_nil() else worker_id.hex() + self._metrics_agent.proxy_export_metrics(request.metrics, worker_id) + except Exception: + logger.error(traceback.format_exc()) + return reporter_pb2.ReportOCMetricsReply() + + @staticmethod + def _get_cpu_percent(in_k8s: bool): + if in_k8s: + return k8s_utils.cpu_percent() + else: + return psutil.cpu_percent() + + @staticmethod + def _get_gpu_usage(): + import ray._private.thirdparty.pynvml as pynvml + + global enable_gpu_usage_check + if not enable_gpu_usage_check: + return [] + gpu_utilizations = [] + + def decode(b: Union[str, bytes]) -> str: + if isinstance(b, bytes): + return b.decode("utf-8") # for python3, to unicode + return b + + try: + pynvml.nvmlInit() + except Exception as e: + logger.debug(f"pynvml failed to retrieve GPU information: {e}") + + # On machines without GPUs, pynvml.nvmlInit() can run subprocesses that + # spew to stderr. Then with log_to_driver=True, we get log spew from every + # single raylet. To avoid this, disable the GPU usage check on + # certain errors. 
+ # https://github.com/ray-project/ray/issues/14305 + # https://github.com/ray-project/ray/pull/21686 + if type(e).__name__ == "NVMLError_DriverNotLoaded": + enable_gpu_usage_check = False + return gpu_utilizations + + num_gpus = pynvml.nvmlDeviceGetCount() + for i in range(num_gpus): + gpu_handle = pynvml.nvmlDeviceGetHandleByIndex(i) + memory_info = pynvml.nvmlDeviceGetMemoryInfo(gpu_handle) + utilization = None + try: + utilization_info = pynvml.nvmlDeviceGetUtilizationRates(gpu_handle) + utilization = int(utilization_info.gpu) + except pynvml.NVMLError as e: + logger.debug(f"pynvml failed to retrieve GPU utilization: {e}") + + # processes pids + processes_pids = None + try: + nv_comp_processes = pynvml.nvmlDeviceGetComputeRunningProcesses( + gpu_handle + ) + nv_graphics_processes = pynvml.nvmlDeviceGetGraphicsRunningProcesses( + gpu_handle + ) + processes_pids = [ + ProcessGPUInfo( + pid=int(nv_process.pid), + gpu_memory_usage=int(nv_process.usedGpuMemory) // MB + if nv_process.usedGpuMemory + else 0, + ) + for nv_process in (nv_comp_processes + nv_graphics_processes) + ] + except pynvml.NVMLError as e: + logger.debug(f"pynvml failed to retrieve GPU processes: {e}") + + info = GpuUtilizationInfo( + index=i, + name=decode(pynvml.nvmlDeviceGetName(gpu_handle)), + uuid=decode(pynvml.nvmlDeviceGetUUID(gpu_handle)), + utilization_gpu=utilization, + memory_used=int(memory_info.used) // MB, + memory_total=int(memory_info.total) // MB, + processes_pids=processes_pids, + ) + gpu_utilizations.append(info) + pynvml.nvmlShutdown() + + return gpu_utilizations + + @staticmethod + def _get_boot_time(): + if IN_KUBERNETES_POD: + # Return start time of container entrypoint + return psutil.Process(pid=1).create_time() + else: + return psutil.boot_time() + + @staticmethod + def _get_network_stats(): + ifaces = [ + v for k, v in psutil.net_io_counters(pernic=True).items() if k[0] == "e" + ] + + sent = sum((iface.bytes_sent for iface in ifaces)) + recv = sum((iface.bytes_recv for iface in ifaces)) + return sent, recv + + @staticmethod + def _get_mem_usage(): + total = ray._private.utils.get_system_memory() + used = ray._private.utils.get_used_memory() + available = total - used + percent = round(used / total, 3) * 100 + return total, available, percent, used + + @staticmethod + def _get_disk_usage(): + if IN_KUBERNETES_POD and not ENABLE_K8S_DISK_USAGE: + # If in a K8s pod, disable disk display by passing in dummy values. + return { + "/": psutil._common.sdiskusage(total=1, used=0, free=1, percent=0.0) + } + if sys.platform == "win32": + root = psutil.disk_partitions()[0].mountpoint + else: + root = os.sep + tmp = ray._private.utils.get_user_temp_dir() + return { + "/": psutil.disk_usage(root), + tmp: psutil.disk_usage(tmp), + } + + @staticmethod + def _get_disk_io_stats(): + stats = psutil.disk_io_counters() + # stats can be None or {} if the machine is diskless. + # https://psutil.readthedocs.io/en/latest/#psutil.disk_io_counters + if not stats: + return (0, 0, 0, 0) + else: + return ( + stats.read_bytes, + stats.write_bytes, + stats.read_count, + stats.write_count, + ) + + def _get_agent_proc(self) -> psutil.Process: + # Agent is the current process. + # This method is not necessary, but we have it for mock testing. 
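+        # A test can, for instance, monkeypatch this method to substitute a
+        # stub (hypothetical sketch):
+        #   agent._get_agent_proc = lambda: FakeProcess(cpu_percent=0.0)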
+ return psutil.Process() + + def _generate_worker_key(self, proc: psutil.Process) -> Tuple[int, float]: + return (proc.pid, proc.create_time()) + + def _get_workers(self): + raylet_proc = self._get_raylet_proc() + + if raylet_proc is None: + return [] + else: + workers = {} + if sys.platform == "win32": + # windows, get the child process not the runner + for child in raylet_proc.children(): + if child.children(): + child = child.children()[0] + workers[self._generate_worker_key(child)] = child + else: + workers = { + self._generate_worker_key(proc): proc + for proc in raylet_proc.children() + } + + # We should keep `raylet_proc.children()` in `self` because + # when `cpu_percent` is first called, it returns the meaningless 0. + # See more: https://github.com/ray-project/ray/issues/29848 + keys_to_pop = [] + # Add all new workers. + for key, worker in workers.items(): + if key not in self._workers: + self._workers[key] = worker + + # Pop out stale workers. + for key in self._workers: + if key not in workers: + keys_to_pop.append(key) + for k in keys_to_pop: + self._workers.pop(k) + + # Remove the current process (reporter agent), which is also a child of + # the Raylet. + self._workers.pop(self._generate_worker_key(self._get_agent_proc())) + + result = [] + for w in self._workers.values(): + try: + if w.status() == psutil.STATUS_ZOMBIE: + continue + except psutil.NoSuchProcess: + # the process may have terminated due to race condition. + continue + + result.append(w.as_dict(attrs=PSUTIL_PROCESS_ATTRS)) + return result + + def _get_raylet_proc(self): + try: + if not self._raylet_proc: + curr_proc = psutil.Process() + # The dashboard agent is a child of the raylet process. + # It is not necessarily the direct child (python-windows + # typically uses a py.exe runner to run python), so search + # up for a process named 'raylet' + candidate = curr_proc.parent() + while candidate: + if "raylet" in candidate.name(): + break + candidate = candidate.parent() + self._raylet_proc = candidate + + if self._raylet_proc is not None: + if self._raylet_proc.pid == 1: + return None + if self._raylet_proc.status() == psutil.STATUS_ZOMBIE: + return None + return self._raylet_proc + except (psutil.AccessDenied, ProcessLookupError): + pass + return None + + def _get_raylet(self): + raylet_proc = self._get_raylet_proc() + if raylet_proc is None: + return {} + else: + return raylet_proc.as_dict(attrs=PSUTIL_PROCESS_ATTRS) + + def _get_agent(self): + # Current proc == agent proc + if not self._agent_proc: + self._agent_proc = psutil.Process() + return self._agent_proc.as_dict(attrs=PSUTIL_PROCESS_ATTRS) + + def _get_load_avg(self): + if sys.platform == "win32": + cpu_percent = psutil.cpu_percent() + load = (cpu_percent, cpu_percent, cpu_percent) + else: + load = os.getloadavg() + if self._cpu_counts[0] > 0: + per_cpu_load = tuple((round(x / self._cpu_counts[0], 2) for x in load)) + else: + per_cpu_load = None + return load, per_cpu_load + + @staticmethod + def _compute_speed_from_hist(hist): + while len(hist) > 7: + hist.pop(0) + then, prev_stats = hist[0] + now, now_stats = hist[-1] + time_delta = now - then + return tuple((y - x) / time_delta for x, y in zip(prev_stats, now_stats)) + + def _get_shm_usage(self): + """Return the shm usage. + + If shm doesn't exist (e.g., MacOS), it returns None. 
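+
+        For example, on Linux psutil.virtual_memory() exposes a ``shared``
+        field (in bytes), while on macOS the attribute is absent, so this
+        method returns None there.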
+ """ + mem = psutil.virtual_memory() + if not hasattr(mem, "shared"): + return None + return mem.shared + + def _collect_stats(self): + now = dashboard_utils.to_posix_time(datetime.datetime.utcnow()) + network_stats = self._get_network_stats() + self._network_stats_hist.append((now, network_stats)) + network_speed_stats = self._compute_speed_from_hist(self._network_stats_hist) + + disk_stats = self._get_disk_io_stats() + self._disk_io_stats_hist.append((now, disk_stats)) + disk_speed_stats = self._compute_speed_from_hist(self._disk_io_stats_hist) + + return { + "now": now, + "hostname": self._hostname, + "ip": self._ip, + "cpu": self._get_cpu_percent(IN_KUBERNETES_POD), + "cpus": self._cpu_counts, + "mem": self._get_mem_usage(), + # Unit is in bytes. None if + "shm": self._get_shm_usage(), + "workers": self._get_workers(), + "raylet": self._get_raylet(), + "agent": self._get_agent(), + "bootTime": self._get_boot_time(), + "loadAvg": self._get_load_avg(), + "disk": self._get_disk_usage(), + "disk_io": disk_stats, + "disk_io_speed": disk_speed_stats, + "gpus": self._get_gpu_usage(), + "network": network_stats, + "network_speed": network_speed_stats, + # Deprecated field, should be removed with frontend. + "cmdline": self._get_raylet().get("cmdline", []), + } + + def _generate_reseted_stats_record(self, component_name: str) -> List[Record]: + """Return a list of Record that will reset + the system metrics of a given component name. + + Args: + component_name: a component name for a given stats. + + Returns: + a list of Record instances of all values 0. + """ + tags = {"ip": self._ip, "Component": component_name} + + records = [] + records.append( + Record( + gauge=METRICS_GAUGES["component_cpu_percentage"], + value=0.0, + tags=tags, + ) + ) + records.append( + Record( + gauge=METRICS_GAUGES["component_mem_shared_bytes"], + value=0.0, + tags=tags, + ) + ) + records.append( + Record( + gauge=METRICS_GAUGES["component_rss_mb"], + value=0.0, + tags=tags, + ) + ) + records.append( + Record( + gauge=METRICS_GAUGES["component_uss_mb"], + value=0.0, + tags=tags, + ) + ) + records.append( + Record( + gauge=METRICS_GAUGES["component_num_fds"], + value=0, + tags=tags, + ) + ) + return records + + def _generate_system_stats_record( + self, stats: List[dict], component_name: str, pid: Optional[str] = None + ) -> List[Record]: + """Generate a list of Record class from a given component names. + + Args: + stats: a list of stats dict generated by `psutil.as_dict`. + If empty, it will create the metrics of a given "component_name" + which has all 0 values. + component_name: a component name for a given stats. + pid: optionally provided pids. + + Returns: + a list of Record class that will be exposed to Prometheus. 
+ """ + total_cpu_percentage = 0.0 + total_rss = 0.0 + total_uss = 0.0 + total_shm = 0.0 + total_num_fds = 0 + + for stat in stats: + total_cpu_percentage += float(stat.get("cpu_percent", 0.0)) # noqa + memory_info = stat.get("memory_info") + if memory_info: + mem = stat["memory_info"] + total_rss += float(mem.rss) / 1.0e6 + if hasattr(mem, "shared"): + total_shm += float(mem.shared) + mem_full_info = stat.get("memory_full_info") + if mem_full_info is not None: + total_uss += float(mem_full_info.uss) / 1.0e6 + total_num_fds += int(stat.get("num_fds", 0)) + + tags = {"ip": self._ip, "Component": component_name} + if pid: + tags["pid"] = pid + + records = [] + records.append( + Record( + gauge=METRICS_GAUGES["component_cpu_percentage"], + value=total_cpu_percentage, + tags=tags, + ) + ) + records.append( + Record( + gauge=METRICS_GAUGES["component_mem_shared_bytes"], + value=total_shm, + tags=tags, + ) + ) + records.append( + Record( + gauge=METRICS_GAUGES["component_rss_mb"], + value=total_rss, + tags=tags, + ) + ) + if total_uss > 0.0: + records.append( + Record( + gauge=METRICS_GAUGES["component_uss_mb"], + value=total_uss, + tags=tags, + ) + ) + records.append( + Record( + gauge=METRICS_GAUGES["component_num_fds"], + value=total_num_fds, + tags=tags, + ) + ) + + return records + + def generate_worker_stats_record(self, worker_stats: List[dict]) -> List[Record]: + """Generate a list of Record class for worker proceses. + + This API automatically sets the component_name of record as + the name of worker processes. I.e., ray::* so that we can report + per task/actor (grouped by a func/class name) resource usages. + + Args: + stats: a list of stats dict generated by `psutil.as_dict` + for worker processes. + """ + # worekr cmd name (ray::*) -> stats dict. + proc_name_to_stats = defaultdict(list) + for stat in worker_stats: + cmdline = stat.get("cmdline") + # All ray processes start with ray:: + if cmdline and len(cmdline) > 0 and cmdline[0].startswith("ray::"): + proc_name = cmdline[0] + proc_name_to_stats[proc_name].append(stat) + # We will lose worker stats that don't follow the ray worker proc + # naming convention. Theoretically, there should be no data loss here + # because all worker processes are renamed to ray::. + + records = [] + for proc_name, stats in proc_name_to_stats.items(): + records.extend(self._generate_system_stats_record(stats, proc_name)) + + # Reset worker metrics that are from finished processes. 
+ new_proc_names = set(proc_name_to_stats.keys()) + stale_procs = self._latest_worker_proc_names - new_proc_names + self._latest_worker_proc_names = new_proc_names + + for stale_proc_name in stale_procs: + records.extend(self._generate_reseted_stats_record(stale_proc_name)) + + return records + + def _to_records(self, stats, cluster_stats) -> List[Record]: + records_reported = [] + ip = stats["ip"] + is_head_node = str(self._is_head_node).lower() + + # Common tags for node-level metrics + node_tags = {"ip": ip, "IsHeadNode": is_head_node} + + # -- Instance count of cluster -- + # Only report cluster stats on head node + if "autoscaler_report" in cluster_stats and self._is_head_node: + active_nodes = cluster_stats["autoscaler_report"]["active_nodes"] + for node_type, active_node_count in active_nodes.items(): + records_reported.append( + Record( + gauge=METRICS_GAUGES["cluster_active_nodes"], + value=active_node_count, + tags={"node_type": node_type}, + ) + ) + + failed_nodes = cluster_stats["autoscaler_report"]["failed_nodes"] + failed_nodes_dict = {} + for node_ip, node_type in failed_nodes: + if node_type in failed_nodes_dict: + failed_nodes_dict[node_type] += 1 + else: + failed_nodes_dict[node_type] = 1 + + for node_type, failed_node_count in failed_nodes_dict.items(): + records_reported.append( + Record( + gauge=METRICS_GAUGES["cluster_failed_nodes"], + value=failed_node_count, + tags={"node_type": node_type}, + ) + ) + + pending_nodes = cluster_stats["autoscaler_report"]["pending_nodes"] + pending_nodes_dict = {} + for node_ip, node_type, status_message in pending_nodes: + if node_type in pending_nodes_dict: + pending_nodes_dict[node_type] += 1 + else: + pending_nodes_dict[node_type] = 1 + + for node_type, pending_node_count in pending_nodes_dict.items(): + records_reported.append( + Record( + gauge=METRICS_GAUGES["cluster_pending_nodes"], + value=pending_node_count, + tags={"node_type": node_type}, + ) + ) + + # -- CPU per node -- + cpu_usage = float(stats["cpu"]) + cpu_record = Record( + gauge=METRICS_GAUGES["node_cpu_utilization"], + value=cpu_usage, + tags=node_tags, + ) + + cpu_count, _ = stats["cpus"] + cpu_count_record = Record( + gauge=METRICS_GAUGES["node_cpu_count"], value=cpu_count, tags=node_tags + ) + + # -- Mem per node -- + mem_total, mem_available, _, mem_used = stats["mem"] + mem_used_record = Record( + gauge=METRICS_GAUGES["node_mem_used"], value=mem_used, tags=node_tags + ) + mem_available_record = Record( + gauge=METRICS_GAUGES["node_mem_available"], + value=mem_available, + tags=node_tags, + ) + mem_total_record = Record( + gauge=METRICS_GAUGES["node_mem_total"], value=mem_total, tags=node_tags + ) + + shm_used = stats["shm"] + if shm_used: + node_mem_shared = Record( + gauge=METRICS_GAUGES["node_mem_shared_bytes"], + value=shm_used, + tags=node_tags, + ) + records_reported.append(node_mem_shared) + + # The output example of GpuUtilizationInfo. + """ + {'index': 0, + 'uuid': 'GPU-36e1567d-37ed-051e-f8ff-df807517b396', + 'name': 'NVIDIA A10G', + 'utilization_gpu': 1, + 'memory_used': 0, + 'memory_total': 22731} + """ + # -- GPU per node -- + gpus = stats["gpus"] + gpus_available = len(gpus) + + if gpus_available: + for gpu in gpus: + gpus_utilization, gram_used, gram_total = 0, 0, 0 + # Consume GPU may not report its utilization. 
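+ # (A None utilization reading is simply treated as 0 below rather + # than dropping the record.)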
+ if gpu["utilization_gpu"] is not None: + gpus_utilization += gpu["utilization_gpu"] + gram_used += gpu["memory_used"] + gram_total += gpu["memory_total"] + gpu_index = gpu.get("index") + gpu_name = gpu.get("name") + + gram_available = gram_total - gram_used + + if gpu_index is not None: + gpu_tags = {**node_tags, "GpuIndex": str(gpu_index)} + if gpu_name: + gpu_tags["GpuDeviceName"] = gpu_name + + # There's only 1 GPU per each index, so we record 1 here. + gpus_available_record = Record( + gauge=METRICS_GAUGES["node_gpus_available"], + value=1, + tags=gpu_tags, + ) + gpus_utilization_record = Record( + gauge=METRICS_GAUGES["node_gpus_utilization"], + value=gpus_utilization, + tags=gpu_tags, + ) + gram_used_record = Record( + gauge=METRICS_GAUGES["node_gram_used"], + value=gram_used, + tags=gpu_tags, + ) + gram_available_record = Record( + gauge=METRICS_GAUGES["node_gram_available"], + value=gram_available, + tags=gpu_tags, + ) + records_reported.extend( + [ + gpus_available_record, + gpus_utilization_record, + gram_used_record, + gram_available_record, + ] + ) + + # -- Disk per node -- + disk_io_stats = stats["disk_io"] + disk_read_record = Record( + gauge=METRICS_GAUGES["node_disk_io_read"], + value=disk_io_stats[0], + tags=node_tags, + ) + disk_write_record = Record( + gauge=METRICS_GAUGES["node_disk_io_write"], + value=disk_io_stats[1], + tags=node_tags, + ) + disk_read_count_record = Record( + gauge=METRICS_GAUGES["node_disk_io_read_count"], + value=disk_io_stats[2], + tags=node_tags, + ) + disk_write_count_record = Record( + gauge=METRICS_GAUGES["node_disk_io_write_count"], + value=disk_io_stats[3], + tags=node_tags, + ) + disk_io_speed_stats = stats["disk_io_speed"] + disk_read_speed_record = Record( + gauge=METRICS_GAUGES["node_disk_io_read_speed"], + value=disk_io_speed_stats[0], + tags=node_tags, + ) + disk_write_speed_record = Record( + gauge=METRICS_GAUGES["node_disk_io_write_speed"], + value=disk_io_speed_stats[1], + tags=node_tags, + ) + disk_read_iops_record = Record( + gauge=METRICS_GAUGES["node_disk_read_iops"], + value=disk_io_speed_stats[2], + tags=node_tags, + ) + disk_write_iops_record = Record( + gauge=METRICS_GAUGES["node_disk_write_iops"], + value=disk_io_speed_stats[3], + tags=node_tags, + ) + used = stats["disk"]["/"].used + free = stats["disk"]["/"].free + disk_utilization = float(used / (used + free)) * 100 + disk_usage_record = Record( + gauge=METRICS_GAUGES["node_disk_usage"], value=used, tags=node_tags + ) + disk_free_record = Record( + gauge=METRICS_GAUGES["node_disk_free"], value=free, tags=node_tags + ) + disk_utilization_percentage_record = Record( + gauge=METRICS_GAUGES["node_disk_utilization_percentage"], + value=disk_utilization, + tags=node_tags, + ) + + # -- Network speed (send/receive) stats per node -- + network_stats = stats["network"] + network_sent_record = Record( + gauge=METRICS_GAUGES["node_network_sent"], + value=network_stats[0], + tags=node_tags, + ) + network_received_record = Record( + gauge=METRICS_GAUGES["node_network_received"], + value=network_stats[1], + tags=node_tags, + ) + + # -- Network speed (send/receive) per node -- + network_speed_stats = stats["network_speed"] + network_send_speed_record = Record( + gauge=METRICS_GAUGES["node_network_send_speed"], + value=network_speed_stats[0], + tags=node_tags, + ) + network_receive_speed_record = Record( + gauge=METRICS_GAUGES["node_network_receive_speed"], + value=network_speed_stats[1], + tags=node_tags, + ) + + """ + Record system stats. + """ + + # Record component metrics. 
+ raylet_stats = stats["raylet"] + if raylet_stats: + raylet_pid = str(raylet_stats["pid"]) + records_reported.extend( + self._generate_system_stats_record( + [raylet_stats], "raylet", pid=raylet_pid + ) + ) + workers_stats = stats["workers"] + records_reported.extend(self.generate_worker_stats_record(workers_stats)) + agent_stats = stats["agent"] + if agent_stats: + agent_pid = str(agent_stats["pid"]) + records_reported.extend( + self._generate_system_stats_record( + [agent_stats], "agent", pid=agent_pid + ) + ) + + # TODO(sang): Record GCS metrics. + # NOTE: Dashboard metrics is recorded within the dashboard because + # it can be deployed as a standalone instance. It shouldn't + # depend on the agent. + + records_reported.extend( + [ + cpu_record, + cpu_count_record, + mem_used_record, + mem_available_record, + mem_total_record, + disk_read_record, + disk_write_record, + disk_read_count_record, + disk_write_count_record, + disk_read_speed_record, + disk_write_speed_record, + disk_read_iops_record, + disk_write_iops_record, + disk_usage_record, + disk_free_record, + disk_utilization_percentage_record, + network_sent_record, + network_received_record, + network_send_speed_record, + network_receive_speed_record, + ] + ) + + return records_reported + + async def _run_loop(self, publisher): + """Get any changes to the log files and push updates to kv.""" + loop = utils.get_or_create_event_loop() + + while True: + try: + # Fetch autoscaler debug status + autoscaler_status_json_bytes: Optional[bytes] = None + if self._is_head_node: + autoscaler_status_json_bytes = ( + await self._gcs_aio_client.internal_kv_get( + DEBUG_AUTOSCALING_STATUS.encode(), + None, + timeout=GCS_RPC_TIMEOUT_SECONDS, + ) + ) + + # NOTE: Stats collection is executed inside the thread-pool + # executor (TPE) to avoid blocking the Agent's event-loop + json_payload = await loop.run_in_executor( + self._executor, + self._compose_stats_payload, + autoscaler_status_json_bytes, + ) + + await publisher.publish_resource_usage(self._key, json_payload) + + except Exception: + logger.exception("Error publishing node physical stats.") + + await asyncio.sleep(reporter_consts.REPORTER_UPDATE_INTERVAL_MS / 1000) + + def _compose_stats_payload( + self, cluster_autoscaling_stats_json: Optional[bytes] + ) -> str: + stats = self._collect_stats() + + # Report stats only when metrics collection is enabled. 
+ if not self._metrics_collection_disabled: + cluster_stats = ( + json.loads(cluster_autoscaling_stats_json.decode()) + if cluster_autoscaling_stats_json + else {} + ) + + records = self._to_records(stats, cluster_stats) + + self._metrics_agent.record_and_export( + records, + global_tags={ + "Version": ray.__version__, + "SessionName": self._session_name, + }, + ) + + self._metrics_agent.clean_all_dead_worker_metrics() + + return jsonify_asdict(stats) + + async def run(self, server): + if server: + reporter_pb2_grpc.add_ReporterServiceServicer_to_server(self, server) + + await self._run_loop(self._dashboard_agent.publisher) + + @staticmethod + def is_minimal_module(): + return False diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/reporter_consts.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/reporter_consts.py new file mode 100644 index 0000000000000000000000000000000000000000..b3254fd3e51e5c7aa4454033af1802d9f89a1217 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/reporter_consts.py @@ -0,0 +1,7 @@ +import ray._private.ray_constants as ray_constants + +REPORTER_PREFIX = "RAY_REPORTER:" +# The reporter will report its statistics this often (milliseconds). +REPORTER_UPDATE_INTERVAL_MS = ray_constants.env_integer( + "REPORTER_UPDATE_INTERVAL_MS", 5000 +) diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/reporter_head.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/reporter_head.py new file mode 100644 index 0000000000000000000000000000000000000000..30295c805f0b3b0c29604f54839c44a58168fb4c --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/reporter/reporter_head.py @@ -0,0 +1,723 @@ +import asyncio +import json +import logging +from concurrent.futures import ThreadPoolExecutor +from typing import List, Optional, Tuple + +import aiohttp.web + +from ray import NodeID +import ray.dashboard.optional_utils as dashboard_optional_utils +import ray.dashboard.utils as dashboard_utils +from ray._private.metrics_agent import PrometheusServiceDiscoveryWriter +from ray._private.ray_constants import ( + DEBUG_AUTOSCALING_ERROR, + DEBUG_AUTOSCALING_STATUS, + DEBUG_AUTOSCALING_STATUS_LEGACY, + GLOBAL_GRPC_OPTIONS, + KV_NAMESPACE_CLUSTER, + KV_NAMESPACE_DASHBOARD, + env_integer, +) +import ray.dashboard.consts as dashboard_consts +from ray._private.usage.usage_constants import CLUSTER_METADATA_KEY +from ray._private.utils import init_grpc_channel +from ray.autoscaler._private.commands import debug_status +from ray.core.generated import reporter_pb2, reporter_pb2_grpc +from ray.dashboard.consts import GCS_RPC_TIMEOUT_SECONDS +from ray.dashboard.state_aggregator import StateAPIManager +from ray.util.state.common import ListApiOptions +from ray.util.state.state_manager import StateDataSourceClient + +logger = logging.getLogger(__name__) +routes = dashboard_optional_utils.DashboardHeadRouteTable + +EMOJI_WARNING = "⚠️" +WARNING_FOR_MULTI_TASK_IN_A_WORKER = ( + "Warning: This task is running in a worker process that is running multiple tasks. " + "This can happen if you are profiling a task right as it finishes or if you " + "are using the Async Actor or Threaded Actors pattern.
" + "The information that follows may come from any of these tasks:" +) +SVG_STYLE = """<style> + svg { + width: 100%; + height: 100%; + } +</style>\n""" + +# NOTE: Executor in this head is intentionally constrained to just 1 thread by +# default to limit its concurrency, therefore reducing potential for +# GIL contention +RAY_DASHBOARD_REPORTER_HEAD_TPE_MAX_WORKERS = env_integer( + "RAY_DASHBOARD_REPORTER_HEAD_TPE_MAX_WORKERS", 1 +) + + +class ReportHead(dashboard_utils.DashboardHeadModule): + def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig): + super().__init__(config) + self._ray_config = None + # TODO(fyrestone): Avoid using ray.state in dashboard, it's not + # asynchronous and will lead to low performance. ray disconnect() + # will be hang when the ray.state is connected and the GCS is exit. + # Please refer to: https://github.com/ray-project/ray/issues/16328 + self.service_discovery = PrometheusServiceDiscoveryWriter( + self.gcs_address, self.temp_dir + ) + self._state_api = None + self._executor = ThreadPoolExecutor( + max_workers=RAY_DASHBOARD_REPORTER_HEAD_TPE_MAX_WORKERS, + thread_name_prefix="reporter_head_executor", + ) + + # Fetched from GCS only once on startup in run(). It's static throughout the + # the cluster's lifetime. + self.cluster_metadata = None + + @routes.get("/api/v0/cluster_metadata") + async def get_cluster_metadata(self, req): + return dashboard_optional_utils.rest_response( + success=True, message="", **self.cluster_metadata + ) + + @routes.get("/api/cluster_status") + async def get_cluster_status(self, req): + """Returns status information about the cluster. + + Currently contains two fields: + autoscaling_status (str)-- a status message from the autoscaler. + autoscaling_error (str)-- an error message from the autoscaler if + anything has gone wrong during autoscaling. + + These fields are both read from the GCS, it's expected that the + autoscaler writes them there. + """ + # TODO(rickyx): We should be able to get the cluster status from the + # autoscaler directly with V2. And we should be able to return structured data + # rather than a string. + + return_formatted_output = req.query.get("format", "0") == "1" + + (legacy_status, formatted_status_string, error) = await asyncio.gather( + *[ + self.gcs_aio_client.internal_kv_get( + key.encode(), namespace=None, timeout=GCS_RPC_TIMEOUT_SECONDS + ) + for key in [ + DEBUG_AUTOSCALING_STATUS_LEGACY, + DEBUG_AUTOSCALING_STATUS, + DEBUG_AUTOSCALING_ERROR, + ] + ] + ) + + formatted_status = ( + json.loads(formatted_status_string.decode()) + if formatted_status_string + else {} + ) + + if not return_formatted_output: + return dashboard_optional_utils.rest_response( + success=True, + message="Got cluster status.", + autoscaling_status=legacy_status.decode() if legacy_status else None, + autoscaling_error=error.decode() if error else None, + cluster_status=formatted_status if formatted_status else None, + ) + else: + return dashboard_optional_utils.rest_response( + success=True, + message="Got formatted cluster status.", + cluster_status=debug_status( + formatted_status_string, error, address=self.gcs_address + ), + ) + + async def get_task_ids_running_in_a_worker(self, worker_id: str) -> List[str]: + """ + Retrieves the task IDs of running tasks associated with a specific worker. + + Args: + worker_id: The ID of the worker. + + Returns: + List[str]: A list containing the task IDs + of all the running tasks associated with the worker. 
+ """ + option = ListApiOptions( + filters=[("worker_id", "=", worker_id), ("state", "=", "RUNNING")], + detail=True, + timeout=10, + ) + # Call the state API to get all tasks in a worker + tasks_in_a_worker_result = await self._state_api.list_tasks(option=option) + tasks_in_a_worker = tasks_in_a_worker_result.result + + # Get task_id from each task in a worker + task_ids_in_a_worker = [ + task.get("task_id") + for task in tasks_in_a_worker + if task and "task_id" in task + ] + return task_ids_in_a_worker + + async def get_worker_details_for_running_task( + self, task_id: str, attempt_number: int + ) -> Tuple[Optional[int], Optional[str]]: + """ + Retrieves worker details for a specific task and attempt number. + + Args: + task_id: The ID of the task. + attempt_number: The attempt number of the task. + + Returns: + Tuple[Optional[int], Optional[str]]: A tuple + containing the worker's PID (process ID), + and worker's ID. + + Raises: + ValueError: If the task attempt is not running or + the state APi is not initialized. + """ + if self._state_api is None: + raise ValueError("The state API is not initialized yet. Please retry.") + option = ListApiOptions( + filters=[ + ("task_id", "=", task_id), + ("attempt_number", "=", attempt_number), + ], + detail=True, + timeout=10, + ) + + result = await self._state_api.list_tasks(option=option) + tasks = result.result + if not tasks: + return None, None + + pid = tasks[0]["worker_pid"] + worker_id = tasks[0]["worker_id"] + + state = tasks[0]["state"] + if state != "RUNNING": + raise ValueError( + f"The task attempt is not running: the current state is {state}." + ) + + return pid, worker_id + + @routes.get("/task/traceback") + async def get_task_traceback(self, req) -> aiohttp.web.Response: + """ + Retrieves the traceback information for a specific task. + Note that one worker process works on one task at a time + or one worker works on multiple async tasks. + + Args: + req (aiohttp.web.Request): The HTTP request object. + + Params: + task_id: The ID of the task. + attempt_number: The attempt number of the task. + node_id: The ID of the node. + + Returns: + aiohttp.web.Response: The HTTP response containing + the traceback information. + + Raises: + ValueError: If the "task_id" parameter + is missing in the request query. + ValueError: If the "attempt_number" parameter + is missing in the request query. + ValueError: If the worker begins working on + another task during the traceback retrieval. + aiohttp.web.HTTPInternalServerError: If there is + an internal server error during the traceback retrieval. 
+ """ + + if "task_id" not in req.query: + raise ValueError("task_id is required") + if "attempt_number" not in req.query: + raise ValueError("task's attempt number is required") + if "node_id" not in req.query: + raise ValueError("node_id is required") + + task_id = req.query.get("task_id") + attempt_number = req.query.get("attempt_number") + node_id_hex = req.query.get("node_id") + + addrs = await self._get_stub_address_by_node_id(NodeID.from_hex(node_id_hex)) + if not addrs: + raise aiohttp.web.HTTPInternalServerError( + text=f"Failed to get agent address for node {node_id_hex}" + ) + node_id, ip, http_port, grpc_port = addrs + reporter_stub = self._make_stub(f"{ip}:{grpc_port}") + + # Default not using `--native` for profiling + native = req.query.get("native", False) == "1" + + try: + (pid, _) = await self.get_worker_details_for_running_task( + task_id, attempt_number + ) + except ValueError as e: + raise aiohttp.web.HTTPInternalServerError(text=str(e)) + + logger.info( + "Sending stack trace request to {}:{} with native={}".format( + ip, pid, native + ) + ) + reply = await reporter_stub.GetTraceback( + reporter_pb2.GetTracebackRequest(pid=pid, native=native) + ) + + """ + In order to truly confirm whether there are any other tasks + running during the profiling, we need to retrieve all tasks + that are currently running or have finished, and then parse + the task events (i.e., their start and finish times) to check + for any potential overlap. However, this process can be + quite extensive, so here we will make our best efforts + to check for any overlapping tasks. + Therefore, we will check if the task is still running + """ + try: + (_, worker_id) = await self.get_worker_details_for_running_task( + task_id, attempt_number + ) + + except ValueError as e: + raise aiohttp.web.HTTPInternalServerError(text=str(e)) + if not reply.success: + return aiohttp.web.HTTPInternalServerError(text=reply.output) + + logger.info("Returning stack trace, size {}".format(len(reply.output))) + + task_ids_in_a_worker = await self.get_task_ids_running_in_a_worker(worker_id) + return aiohttp.web.Response( + text=WARNING_FOR_MULTI_TASK_IN_A_WORKER + + str(task_ids_in_a_worker) + + "\n" + + reply.output + if len(task_ids_in_a_worker) > 1 + else reply.output + ) + + @routes.get("/task/cpu_profile") + async def get_task_cpu_profile(self, req) -> aiohttp.web.Response: + """ + Retrieves the CPU profile for a specific task. + Note that one worker process works on one task at a time + or one worker works on multiple async tasks. + + Args: + req (aiohttp.web.Request): The HTTP request object. + + Returns: + aiohttp.web.Response: The HTTP response containing the CPU profile data. + + Raises: + ValueError: If the "task_id" parameter is + missing in the request query. + ValueError: If the "attempt_number" parameter is + missing in the request query. + ValueError: If the maximum duration allowed is exceeded. + ValueError: If the worker begins working on + another task during the profile retrieval. + aiohttp.web.HTTPInternalServerError: If there is + an internal server error during the profile retrieval. + aiohttp.web.HTTPInternalServerError: If the CPU Flame + Graph information for the task is not found. 
+ """ + if "task_id" not in req.query: + raise ValueError("task_id is required") + if "attempt_number" not in req.query: + raise ValueError("task's attempt number is required") + if "node_id" not in req.query: + raise ValueError("node_id is required") + + task_id = req.query.get("task_id") + attempt_number = req.query.get("attempt_number") + node_id_hex = req.query.get("node_id") + + duration_s = int(req.query.get("duration", 5)) + if duration_s > 60: + raise ValueError(f"The max duration allowed is 60 seconds: {duration_s}.") + format = req.query.get("format", "flamegraph") + + # Default not using `--native` for profiling + native = req.query.get("native", False) == "1" + addrs = await self._get_stub_address_by_node_id(NodeID.from_hex(node_id_hex)) + if not addrs: + raise aiohttp.web.HTTPInternalServerError( + text=f"Failed to get agent address for node {node_id_hex}" + ) + node_id, ip, http_port, grpc_port = addrs + reporter_stub = self._make_stub(f"{ip}:{grpc_port}") + + try: + (pid, _) = await self.get_worker_details_for_running_task( + task_id, attempt_number + ) + except ValueError as e: + raise aiohttp.web.HTTPInternalServerError(text=str(e)) + + logger.info( + f"Sending CPU profiling request to {ip}:{grpc_port}, pid {pid}, for {task_id} with native={native}" + ) + + reply = await reporter_stub.CpuProfiling( + reporter_pb2.CpuProfilingRequest( + pid=pid, duration=duration_s, format=format, native=native + ) + ) + + """ + In order to truly confirm whether there are any other tasks + running during the profiling, we need to retrieve all tasks + that are currently running or have finished, and then parse + the task events (i.e., their start and finish times) to check + for any potential overlap. However, this process can be quite + extensive, so here we will make our best efforts to check + for any overlapping tasks. Therefore, we will check if + the task is still running + """ + try: + (_, worker_id) = await self.get_worker_details_for_running_task( + task_id, attempt_number + ) + except ValueError as e: + raise aiohttp.web.HTTPInternalServerError(text=str(e)) + + if not reply.success: + return aiohttp.web.HTTPInternalServerError(text=reply.output) + logger.info("Returning profiling response, size {}".format(len(reply.output))) + + task_ids_in_a_worker = await self.get_task_ids_running_in_a_worker(worker_id) + return aiohttp.web.Response( + body='<p style="color: #E37400;">{} {} </br> </p> </br>'.format( + EMOJI_WARNING, + WARNING_FOR_MULTI_TASK_IN_A_WORKER + str(task_ids_in_a_worker), + ) + + SVG_STYLE + + (reply.output) + if len(task_ids_in_a_worker) > 1 + else SVG_STYLE + reply.output, + headers={"Content-Type": "text/html"}, + ) + + @routes.get("/worker/traceback") + async def get_traceback(self, req) -> aiohttp.web.Response: + """ + Params: + pid: Required. The PID of the worker. + ip: Required. The IP address of the node. 
+ + """ + pid = req.query.get("pid") + ip = req.query.get("ip") + if not pid: + raise ValueError("pid is required") + if not ip: + raise ValueError("ip is required") + + addrs = await self._get_stub_address_by_ip(ip) + if not addrs: + raise aiohttp.web.HTTPInternalServerError( + text=f"Failed to get agent address for node at IP {ip}" + ) + node_id, ip, http_port, grpc_port = addrs + reporter_stub = self._make_stub(f"{ip}:{grpc_port}") + # Default not using `--native` for profiling + native = req.query.get("native", False) == "1" + logger.info( + f"Sending stack trace request to {ip}:{grpc_port}, pid {pid}, with native={native}" + ) + pid = int(pid) + reply = await reporter_stub.GetTraceback( + reporter_pb2.GetTracebackRequest(pid=pid, native=native) + ) + if reply.success: + logger.info("Returning stack trace, size {}".format(len(reply.output))) + return aiohttp.web.Response(text=reply.output) + else: + return aiohttp.web.HTTPInternalServerError(text=reply.output) + + @routes.get("/worker/cpu_profile") + async def cpu_profile(self, req) -> aiohttp.web.Response: + """ + Params: + pid: Required. The PID of the worker. + ip: Required. The IP address of the node. + """ + pid = req.query.get("pid") + ip = req.query.get("ip") + if not pid: + raise ValueError("pid is required") + if not ip: + raise ValueError("ip is required") + + addrs = await self._get_stub_address_by_ip(ip) + if not addrs: + raise aiohttp.web.HTTPInternalServerError( + text=f"Failed to get agent address for node at IP {ip}" + ) + node_id, ip, http_port, grpc_port = addrs + reporter_stub = self._make_stub(f"{ip}:{grpc_port}") + + pid = int(pid) + duration_s = int(req.query.get("duration", 5)) + if duration_s > 60: + raise ValueError(f"The max duration allowed is 60 seconds: {duration_s}.") + format = req.query.get("format", "flamegraph") + + # Default not using `--native` for profiling + native = req.query.get("native", False) == "1" + logger.info( + f"Sending CPU profiling request to {ip}:{grpc_port}, pid {pid}, with native={native}" + ) + reply = await reporter_stub.CpuProfiling( + reporter_pb2.CpuProfilingRequest( + pid=pid, duration=duration_s, format=format, native=native + ) + ) + if reply.success: + logger.info( + "Returning profiling response, size {}".format(len(reply.output)) + ) + return aiohttp.web.Response( + body=reply.output, + headers={ + "Content-Type": "image/svg+xml" + if format == "flamegraph" + else "text/plain" + }, + ) + else: + return aiohttp.web.HTTPInternalServerError(text=reply.output) + + @routes.get("/memory_profile") + async def memory_profile(self, req) -> aiohttp.web.Response: + """ + Retrieves the memory profile for a specific worker or task. + Note that for tasks, one worker process works on one task at a time + or one worker works on multiple async tasks. + + Args: + req (aiohttp.web.Request): The HTTP request object. + + Returns: + aiohttp.web.Response: The HTTP response containing the memory profile data. + + Params (1): + pid: The PID of the worker. + ip: The IP address of the node. + Params (2): + task_id: The ID of the task. + attempt_number: The attempt number of the task. + node_id: The ID of the node. + + Raises: + aiohttp.web.HTTPInternalServerError: If no stub + found from the given IP value + aiohttp.web.HTTPInternalServerError: If the + "task_id" parameter exists but either "attempt_number" + or "node id" is missing in the request query. + aiohttp.web.HTTPInternalServerError: If the maximum + duration allowed is exceeded. 
+ aiohttp.web.HTTPInternalServerError: If, when requesting task + profiling, the worker begins working on another task + during the profile retrieval. + aiohttp.web.HTTPInternalServerError: If there is + an internal server error during the profile retrieval. + """ + is_task = "task_id" in req.query + + # Whether is_task or not, we need to get ip and grpc_port. + if is_task: + if "attempt_number" not in req.query: + return aiohttp.web.HTTPInternalServerError( + text=( + "Failed to execute task profiling: " + "task's attempt number is required" + ) + ) + if "node_id" not in req.query: + return aiohttp.web.HTTPInternalServerError( + text=( + "Failed to execute task profiling: " + "task's node id is required" + ) + ) + + task_id = req.query.get("task_id") + attempt_number = req.query.get("attempt_number") + try: + (pid, _) = await self.get_worker_details_for_running_task( + task_id, attempt_number + ) + except ValueError as e: + raise aiohttp.web.HTTPInternalServerError(text=str(e)) + node_id_hex = req.query.get("node_id") + addrs = await self._get_stub_address_by_node_id( + NodeID.from_hex(node_id_hex) + ) + if not addrs: + return aiohttp.web.HTTPInternalServerError( + text=f"Failed to execute: no agent address found for node {node_id_hex}" + ) + _, ip, _, grpc_port = addrs + else: + pid = int(req.query["pid"]) + ip = req.query.get("ip") + addrs = await self._get_stub_address_by_ip(ip) + if not addrs: + return aiohttp.web.HTTPInternalServerError( + text=f"Failed to execute: no agent address found for node IP {ip}" + ) + _, ip, _, grpc_port = addrs + + assert pid is not None + ip_port = f"{ip}:{grpc_port}" + + duration_s = int(req.query.get("duration", 10)) + + # Default not using `--native`, `--leaks` and `--format` for profiling + format = req.query.get("format", "flamegraph") + native = req.query.get("native", False) == "1" + leaks = req.query.get("leaks", False) == "1" + trace_python_allocators = req.query.get("trace_python_allocators", False) == "1" + + reporter_stub = self._make_stub(ip_port) + + logger.info( + f"Sending memory profiling request to {ip}:{grpc_port}, pid {pid}, with native={native}" + ) + + reply = await reporter_stub.MemoryProfiling( + reporter_pb2.MemoryProfilingRequest( + pid=pid, + format=format, + leaks=leaks, + duration=duration_s, + native=native, + trace_python_allocators=trace_python_allocators, + ) + ) + + task_ids_in_a_worker = None + warning = reply.warning if reply.warning else "" + if is_task: + """ + In order to truly confirm whether there are any other tasks + running during the profiling, Ray needs to retrieve all tasks + that are currently running or have finished, and then parse + the task events (i.e., their start and finish times) to check + for any potential overlap. However, this process can be quite + extensive, so Ray makes its best effort to check + for any overlapping tasks. Therefore, Ray checks if + the task is still running.
+ """ + try: + (_, worker_id) = await self.get_worker_details_for_running_task( + task_id, attempt_number + ) + except ValueError as e: + raise aiohttp.web.HTTPInternalServerError(text=str(e)) + + task_ids_in_a_worker = await self.get_task_ids_running_in_a_worker( + worker_id + ) + if len(task_ids_in_a_worker) > 1: + warning += ( + "\n" + + WARNING_FOR_MULTI_TASK_IN_A_WORKER + + str(task_ids_in_a_worker) + ) + + if not reply.success: + return aiohttp.web.HTTPInternalServerError(text=reply.output) + logger.info("Returning profiling response, size {}".format(len(reply.output))) + + return aiohttp.web.Response( + body='<p style="color: #E37400;">{} {} </br> </p> </br>'.format( + EMOJI_WARNING, warning + ) + + (reply.output) + if warning != "" + else reply.output, + headers={"Content-Type": "text/html"}, + ) + + async def _get_stub_address_by_node_id( + self, node_id: NodeID + ) -> Optional[Tuple[NodeID, str, int, int]]: + """ + Given a NodeID, get agent port from InternalKV. + + returns a tuple of (ip, http_port, grpc_port). + + If not found, return None. + """ + agent_addr_json = await self.gcs_aio_client.internal_kv_get( + f"{dashboard_consts.DASHBOARD_AGENT_ADDR_NODE_ID_PREFIX}{node_id.hex()}".encode(), + namespace=KV_NAMESPACE_DASHBOARD, + timeout=GCS_RPC_TIMEOUT_SECONDS, + ) + if not agent_addr_json: + return None + ip, http_port, grpc_port = json.loads(agent_addr_json) + return node_id, ip, http_port, grpc_port + + async def _get_stub_address_by_ip( + self, ip: str + ) -> Optional[Tuple[NodeID, str, int, int]]: + agent_addr_json = await self.gcs_aio_client.internal_kv_get( + f"{dashboard_consts.DASHBOARD_AGENT_ADDR_IP_PREFIX}{ip}".encode(), + namespace=KV_NAMESPACE_DASHBOARD, + timeout=GCS_RPC_TIMEOUT_SECONDS, + ) + if not agent_addr_json: + return None + node_id, http_port, grpc_port = json.loads(agent_addr_json) + return node_id, ip, http_port, grpc_port + + def _make_stub( + self, ip_port: str + ) -> Optional[reporter_pb2_grpc.ReporterServiceStub]: + options = GLOBAL_GRPC_OPTIONS + channel = init_grpc_channel(ip_port, options=options, asynchronous=True) + return reporter_pb2_grpc.ReporterServiceStub(channel) + + async def run(self, server): + self._state_api_data_source_client = StateDataSourceClient( + self.aiogrpc_gcs_channel, self.gcs_aio_client + ) + # Set up the state API in order to fetch task information. + # This is only used to get task info. If we have Task APIs in GcsClient we can + # remove this. + # TODO(ryw): unify the StateAPIManager in reporter_head and state_head. + self._state_api = StateAPIManager( + self._state_api_data_source_client, + self._executor, + ) + + # Need daemon True to avoid dashboard hangs at exit. 
+ self.service_discovery.daemon = True + self.service_discovery.start() + + cluster_metadata = await self.gcs_aio_client.internal_kv_get( + CLUSTER_METADATA_KEY, + namespace=KV_NAMESPACE_CLUSTER, + ) + self.cluster_metadata = json.loads(cluster_metadata.decode("utf-8")) + + @staticmethod + def is_minimal_module(): + return False diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/state/__init__.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/state/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/state/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/state/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1688016b53a0d2dbdf9794e6806ec7c74d01d677 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/state/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/state/__pycache__/state_head.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/state/__pycache__/state_head.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..dc251693c0fb7c20753c6949db1dfaea75d272fa Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/state/__pycache__/state_head.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/state/state_head.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/state/state_head.py new file mode 100644 index 0000000000000000000000000000000000000000..824fe3026525167cd787d43b6b0c54855fad0155 --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/state/state_head.py @@ -0,0 +1,396 @@ +import asyncio +import logging +from concurrent.futures import ThreadPoolExecutor +from dataclasses import asdict +from datetime import datetime +from typing import AsyncIterable, Optional + +import aiohttp.web +from aiohttp.web import Response + +import ray.dashboard.optional_utils as dashboard_optional_utils +import ray.dashboard.utils as dashboard_utils +from ray import ActorID +from ray._private.ray_constants import env_integer +from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag +from ray.core.generated.gcs_pb2 import ActorTableData +from ray.dashboard.consts import ( + RAY_STATE_SERVER_MAX_HTTP_REQUEST, + RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED, + RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME, +) +from ray.dashboard.datacenter import DataSource +from ray.dashboard.modules.log.log_manager import LogsManager +from ray.dashboard.state_aggregator import StateAPIManager +from ray.dashboard.state_api_utils import ( + do_reply, + handle_list_api, + handle_summary_api, + options_from_req, +) +from ray.dashboard.utils import Change, RateLimitedModule +from ray.util.state.common import DEFAULT_LOG_LIMIT, DEFAULT_RPC_TIMEOUT, GetLogOptions +from ray.util.state.exception import DataSourceUnavailable +from ray.util.state.state_manager import StateDataSourceClient + +logger = logging.getLogger(__name__) +routes = dashboard_optional_utils.DashboardHeadRouteTable + +# NOTE: Executor in this head is intentionally constrained to just 1 thread by +# default to limit its concurrency, therefore reducing potential for +# GIL contention +RAY_DASHBOARD_STATE_HEAD_TPE_MAX_WORKERS = env_integer( + 
"RAY_DASHBOARD_STATE_HEAD_TPE_MAX_WORKERS", 1 +) + + +class StateHead(dashboard_utils.DashboardHeadModule, RateLimitedModule): + """Module to obtain state information from the Ray cluster. + + It is responsible for state observability APIs such as + ray.list_actors(), ray.get_actor(), ray.summary_actors(). + """ + + def __init__( + self, + config: dashboard_utils.DashboardHeadModuleConfig, + ): + """Initialize for handling RESTful requests from State API Client""" + dashboard_utils.DashboardHeadModule.__init__(self, config) + # We don't allow users to configure too high a rate limit + RateLimitedModule.__init__( + self, + min( + RAY_STATE_SERVER_MAX_HTTP_REQUEST, + RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED, + ), + ) + self._state_api_data_source_client = None + self._state_api = None + self._log_api = None + + self._executor = ThreadPoolExecutor( + max_workers=RAY_DASHBOARD_STATE_HEAD_TPE_MAX_WORKERS, + thread_name_prefix="state_head_executor", + ) + + DataSource.nodes.signal.append(self._update_raylet_stubs) + DataSource.agents.signal.append(self._update_agent_stubs) + + async def limit_handler_(self): + return do_reply( + success=False, + error_message=( + "Max number of in-progress requests=" + f"{self.max_num_call_} reached. " + "To set a higher limit, set environment variable: " + f"export {RAY_STATE_SERVER_MAX_HTTP_REQUEST_ENV_NAME}='xxx'. " + f"Max allowed = {RAY_STATE_SERVER_MAX_HTTP_REQUEST_ALLOWED}" + ), + result=None, + ) + + async def _update_raylet_stubs(self, change: Change): + """Callback that's called when a new raylet is added to Datasource. + + Datasource is a api-server-specific module that's updated whenever + api server adds/removes a new node. + + Args: + change: The change object. Whenever a new node is added + or removed, this callback is invoked. + When new node is added: information is in `change.new`. + When a node is removed: information is in `change.old`. + When a node id is overwritten by a new node with the same node id: + `change.old` contains the old node info, and + `change.new` contains the new node info. + """ + if change.old: + # When a node is deleted from the DataSource or it is overwritten. + node_id, node_info = change.old + self._state_api_data_source_client.unregister_raylet_client(node_id) + if change.new: + # When a new node information is written to DataSource. + node_id, node_info = change.new + self._state_api_data_source_client.register_raylet_client( + node_id, + node_info["nodeManagerAddress"], + int(node_info["nodeManagerPort"]), + int(node_info["runtimeEnvAgentPort"]), + ) + + async def _update_agent_stubs(self, change: Change): + """Callback that's called when a new agent is added to Datasource.""" + if change.old: + node_id, _ = change.old + self._state_api_data_source_client.unregister_agent_client(node_id) + if change.new: + # When a new node information is written to DataSource. 
+ node_id, (node_ip, http_port, grpc_port) = change.new + self._state_api_data_source_client.register_agent_client( + node_id, + node_ip, + grpc_port, + ) + + @routes.get("/api/v0/actors") + @RateLimitedModule.enforce_max_concurrent_calls + async def list_actors(self, req: aiohttp.web.Request) -> aiohttp.web.Response: + record_extra_usage_tag(TagKey.CORE_STATE_API_LIST_ACTORS, "1") + return await handle_list_api(self._state_api.list_actors, req) + + @routes.get("/api/v0/jobs") + @RateLimitedModule.enforce_max_concurrent_calls + async def list_jobs(self, req: aiohttp.web.Request) -> aiohttp.web.Response: + record_extra_usage_tag(TagKey.CORE_STATE_API_LIST_JOBS, "1") + try: + result = await self._state_api.list_jobs(option=options_from_req(req)) + return do_reply( + success=True, + error_message="", + result=asdict(result), + ) + except DataSourceUnavailable as e: + return do_reply(success=False, error_message=str(e), result=None) + + @routes.get("/api/v0/nodes") + @RateLimitedModule.enforce_max_concurrent_calls + async def list_nodes(self, req: aiohttp.web.Request) -> aiohttp.web.Response: + record_extra_usage_tag(TagKey.CORE_STATE_API_LIST_NODES, "1") + return await handle_list_api(self._state_api.list_nodes, req) + + @routes.get("/api/v0/placement_groups") + @RateLimitedModule.enforce_max_concurrent_calls + async def list_placement_groups( + self, req: aiohttp.web.Request + ) -> aiohttp.web.Response: + record_extra_usage_tag(TagKey.CORE_STATE_API_LIST_PLACEMENT_GROUPS, "1") + return await handle_list_api(self._state_api.list_placement_groups, req) + + @routes.get("/api/v0/workers") + @RateLimitedModule.enforce_max_concurrent_calls + async def list_workers(self, req: aiohttp.web.Request) -> aiohttp.web.Response: + record_extra_usage_tag(TagKey.CORE_STATE_API_LIST_WORKERS, "1") + return await handle_list_api(self._state_api.list_workers, req) + + @routes.get("/api/v0/tasks") + @RateLimitedModule.enforce_max_concurrent_calls + async def list_tasks(self, req: aiohttp.web.Request) -> aiohttp.web.Response: + record_extra_usage_tag(TagKey.CORE_STATE_API_LIST_TASKS, "1") + return await handle_list_api(self._state_api.list_tasks, req) + + @routes.get("/api/v0/objects") + @RateLimitedModule.enforce_max_concurrent_calls + async def list_objects(self, req: aiohttp.web.Request) -> aiohttp.web.Response: + record_extra_usage_tag(TagKey.CORE_STATE_API_LIST_OBJECTS, "1") + return await handle_list_api(self._state_api.list_objects, req) + + @routes.get("/api/v0/runtime_envs") + @RateLimitedModule.enforce_max_concurrent_calls + async def list_runtime_envs(self, req: aiohttp.web.Request) -> aiohttp.web.Response: + record_extra_usage_tag(TagKey.CORE_STATE_API_LIST_RUNTIME_ENVS, "1") + return await handle_list_api(self._state_api.list_runtime_envs, req) + + @routes.get("/api/v0/logs") + @RateLimitedModule.enforce_max_concurrent_calls + async def list_logs(self, req: aiohttp.web.Request) -> aiohttp.web.Response: + """Return a list of log files on a given node id. + + Unlike other list APIs that display all existing resources in the cluster, + this API always requires a node id or node ip to be specified. + """ + record_extra_usage_tag(TagKey.CORE_STATE_API_LIST_LOGS, "1") + glob_filter = req.query.get("glob", "*") + node_id = req.query.get("node_id", None) + node_ip = req.query.get("node_ip", None) + timeout = int(req.query.get("timeout", DEFAULT_RPC_TIMEOUT)) + + if not node_id and not node_ip: + return do_reply( + success=False, + error_message=( + "Neither node id nor node ip is provided. 
" + "Please provide at least one of them." + ), + result=None, + ) + + node_id = node_id or self._log_api.ip_to_node_id(node_ip) + if not node_id: + return do_reply( + success=False, + error_message=( + f"Cannot find matching node_id for a given node ip {node_ip}" + ), + result=None, + ) + + try: + result = await self._log_api.list_logs( + node_id, timeout, glob_filter=glob_filter + ) + except DataSourceUnavailable as e: + return do_reply( + success=False, + error_message=str(e), + result=None, + ) + + return do_reply(success=True, error_message="", result=result) + + @routes.get("/api/v0/logs/{media_type}") + @RateLimitedModule.enforce_max_concurrent_calls + async def get_logs(self, req: aiohttp.web.Request): + """ + Fetches logs from the given criteria. + + Output format is from the query parameter `format`. + - `leading_1` (default): Each chunk of data is prepended with a char `1` if the + chunk is successful, or `0` if the chunk is failed. After a `0` and its + error message, the stream is closed. + - `text`: Plain text format. Returns the original log data as-is. If an + exception occurs, yields `[get_logs] Fetch log error` with error message and + closes the stream. + + Note: all formats always return 200 even if the log fetching fails. + """ + record_extra_usage_tag(TagKey.CORE_STATE_API_GET_LOG, "1") + options = GetLogOptions( + timeout=int(req.query.get("timeout", DEFAULT_RPC_TIMEOUT)), + node_id=req.query.get("node_id", None), + node_ip=req.query.get("node_ip", None), + media_type=req.match_info.get("media_type", "file"), + filename=req.query.get("filename", None), + actor_id=req.query.get("actor_id", None), + task_id=req.query.get("task_id", None), + submission_id=req.query.get("submission_id", None), + pid=req.query.get("pid", None), + lines=req.query.get("lines", DEFAULT_LOG_LIMIT), + interval=req.query.get("interval", None), + suffix=req.query.get("suffix", "out"), + attempt_number=req.query.get("attempt_number", 0), + ) + + output_format = req.query.get("format", "leading_1") + logger.info(f"Streaming logs with format {output_format} options: {options}") + + async def get_actor_fn(actor_id: ActorID) -> Optional[ActorTableData]: + actor_info_dict = await self.gcs_aio_client.get_all_actor_info( + actor_id=actor_id + ) + if len(actor_info_dict) == 0: + return None + return actor_info_dict[actor_id] + + async def formatter_text(response, async_gen: AsyncIterable[bytes]): + try: + async for logs in async_gen: + await response.write(logs) + except asyncio.CancelledError: + # This happens when the client side closes the connection. + # Force close the connection and do no-op. + response.force_close() + raise + except Exception as e: + logger.exception("Error while streaming logs") + await response.write(f"[get_logs] Fetch log error: {e}".encode()) + + async def formatter_leading_1(response, async_gen: AsyncIterable[bytes]): + # NOTE: The first byte indicates the success / failure of individual + # stream. If the first byte is b"1", it means the stream was successful. + # If it is b"0", it means it is failed. + try: + async for logs in async_gen: + logs_to_stream = bytearray(b"1") + logs_to_stream.extend(logs) + await response.write(bytes(logs_to_stream)) + except asyncio.CancelledError: + # This happens when the client side closes the connection. + # Fofce close the connection and do no-op. 
+ response.force_close() + raise + except Exception as e: + logger.exception("Error while streaming logs") + error_msg = bytearray(b"0") + error_msg.extend( + f"Closing HTTP stream due to internal server error.\n{e}".encode() + ) + await response.write(bytes(error_msg)) + + response = aiohttp.web.StreamResponse() + response.content_type = "text/plain" + await response.prepare(req) + + logs_gen = self._log_api.stream_logs(options, get_actor_fn) + if output_format == "text": + await formatter_text(response, logs_gen) + elif output_format == "leading_1": + await formatter_leading_1(response, logs_gen) + else: + raise ValueError( + f"Unsupported format: {output_format}, use 'text' or " "'leading_1'" + ) + await response.write_eof() + return response + + @routes.get("/api/v0/tasks/summarize") + @RateLimitedModule.enforce_max_concurrent_calls + async def summarize_tasks(self, req: aiohttp.web.Request) -> aiohttp.web.Response: + record_extra_usage_tag(TagKey.CORE_STATE_API_SUMMARIZE_TASKS, "1") + return await handle_summary_api(self._state_api.summarize_tasks, req) + + @routes.get("/api/v0/actors/summarize") + @RateLimitedModule.enforce_max_concurrent_calls + async def summarize_actors(self, req: aiohttp.web.Request) -> aiohttp.web.Response: + record_extra_usage_tag(TagKey.CORE_STATE_API_SUMMARIZE_ACTORS, "1") + return await handle_summary_api(self._state_api.summarize_actors, req) + + @routes.get("/api/v0/objects/summarize") + @RateLimitedModule.enforce_max_concurrent_calls + async def summarize_objects(self, req: aiohttp.web.Request) -> aiohttp.web.Response: + record_extra_usage_tag(TagKey.CORE_STATE_API_SUMMARIZE_OBJECTS, "1") + return await handle_summary_api(self._state_api.summarize_objects, req) + + @routes.get("/api/v0/tasks/timeline") + @RateLimitedModule.enforce_max_concurrent_calls + async def tasks_timeline(self, req: aiohttp.web.Request) -> aiohttp.web.Response: + job_id = req.query.get("job_id") + download = req.query.get("download") + result = await self._state_api.generate_task_timeline(job_id) + if download == "1": + # Support download if specified. + now_str = datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + content_disposition = ( + f'attachment; filename="timeline-{job_id}-{now_str}.json"' + ) + headers = {"Content-Disposition": content_disposition} + else: + headers = None + return Response(text=result, content_type="application/json", headers=headers) + + @routes.get("/api/v0/delay/{delay_s}") + async def delayed_response(self, req: aiohttp.web.Request): + """Testing only. 
Response after a specified delay.""" + delay = int(req.match_info.get("delay_s", 10)) + await asyncio.sleep(delay) + return do_reply( + success=True, + error_message="", + result={}, + partial_failure_warning=None, + ) + + async def run(self, server): + gcs_channel = self.aiogrpc_gcs_channel + self._state_api_data_source_client = StateDataSourceClient( + gcs_channel, self.gcs_aio_client + ) + self._state_api = StateAPIManager( + self._state_api_data_source_client, + self._executor, + ) + self._log_api = LogsManager(self._state_api_data_source_client) + + @staticmethod + def is_minimal_module(): + return False diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/train/__init__.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/train/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/train/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/train/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e52ea7426de8def02ae237cbf6cb1a988ca2fc1c Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/train/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/train/__pycache__/train_head.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/train/__pycache__/train_head.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..f34e6b6e1a8b05471c48f661b7b8e66b9c609aa0 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/train/__pycache__/train_head.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/train/train_head.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/train/train_head.py new file mode 100644 index 0000000000000000000000000000000000000000..64d59bf776676c3e8471be475612853462a9655d --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/train/train_head.py @@ -0,0 +1,209 @@ +import logging +from typing import List + +from aiohttp.web import Request, Response + +import ray +import ray.dashboard.optional_utils as dashboard_optional_utils +import ray.dashboard.utils as dashboard_utils +from ray.core.generated import gcs_service_pb2_grpc +from ray.dashboard.datacenter import DataOrganizer +from ray.dashboard.modules.job.common import JobInfoStorageClient +from ray.dashboard.modules.job.utils import find_jobs_by_job_ids +from ray.util.annotations import DeveloperAPI + +logger = logging.getLogger(__name__) +logger.setLevel(logging.INFO) + +routes = dashboard_optional_utils.DashboardHeadRouteTable + + +class TrainHead(dashboard_utils.DashboardHeadModule): + def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig): + super().__init__(config) + self._train_stats_actor = None + self._job_info_client = None + self._gcs_actor_info_stub = None + + @routes.get("/api/train/v2/runs") + @dashboard_optional_utils.init_ray_and_catch_exceptions() + @DeveloperAPI + async def get_train_runs(self, req: Request) -> Response: + try: + from ray.train._internal.state.schema import TrainRunsResponse + except ImportError: + logger.exception( + "Train is not installed. Please run `pip install ray[train]` " + "when setting up Ray on your cluster." 
+ ) + return Response( + status=500, + text="Train is not installed. Please run `pip install ray[train]` " + "when setting up Ray on your cluster.", + ) + + stats_actor = await self.get_train_stats_actor() + + if stats_actor is None: + return Response( + status=500, + text=( + "Train state data is not available. Please make sure Ray Train " + "is running and that the Train state actor is enabled by setting " + 'the RAY_TRAIN_ENABLE_STATE_TRACKING environment variable to "1".' + ), + ) + else: + try: + train_runs = await stats_actor.get_all_train_runs.remote() + train_runs_with_details = ( + await self._add_actor_status_and_update_run_status(train_runs) + ) + # Sort train runs in reverse chronological order + train_runs_with_details = sorted( + train_runs_with_details, + key=lambda run: run.start_time_ms, + reverse=True, + ) + job_details = await find_jobs_by_job_ids( + self.gcs_aio_client, + self._job_info_client, + [run.job_id for run in train_runs_with_details], + ) + for run in train_runs_with_details: + run.job_details = job_details.get(run.job_id) + details = TrainRunsResponse(train_runs=train_runs_with_details) + except ray.exceptions.RayTaskError as e: + # Task failures are sometimes due to GCS + # failure. When the GCS fails, we expect it to take longer + # to recover. + return Response( + status=503, + text=( + "Failed to get a response from the train stats actor. " + f"The GCS may be down, please retry later: {e}" + ), + ) + + return Response( + text=details.json(), + content_type="application/json", + ) + + async def _add_actor_status_and_update_run_status(self, train_runs): + from ray.train._internal.state.schema import ( + ActorStatusEnum, + RunStatusEnum, + TrainRunInfoWithDetails, + TrainWorkerInfoWithDetails, + ) + + train_runs_with_details: List[TrainRunInfoWithDetails] = [] + + for train_run in train_runs.values(): + worker_infos_with_details: List[TrainWorkerInfoWithDetails] = [] + + actor_ids = [worker.actor_id for worker in train_run.workers] + + logger.info(f"Getting all actor info from GCS (actor_ids={actor_ids})") + + train_run_actors = await DataOrganizer.get_actor_infos( + actor_ids=actor_ids, + ) + + for worker_info in train_run.workers: + actor = train_run_actors.get(worker_info.actor_id, None) + # Add hardware metrics to API response + if actor: + gpus = [ + gpu + for gpu in actor["gpus"] + if worker_info.pid + in [process["pid"] for process in gpu["processesPids"]] + ] + # Need to convert processesPids into a proper list. + # It's some weird ImmutableList structure. + # We also convert the list of processes into a single item since + # an actor is only a single process and cannot match multiple + # processes. + formatted_gpus = [ + { + **gpu, + "processInfo": [ + process + for process in gpu["processesPids"] + if process["pid"] == worker_info.pid + ][0], + } + for gpu in gpus + ] + + worker_info_with_details = TrainWorkerInfoWithDetails.parse_obj( + { + **worker_info.dict(), + "status": actor["state"], + "processStats": actor["processStats"], + "gpus": formatted_gpus, + } + ) + else: + worker_info_with_details = TrainWorkerInfoWithDetails.parse_obj( + worker_info.dict() + ) + + worker_infos_with_details.append(worker_info_with_details) + + train_run_with_details = TrainRunInfoWithDetails.parse_obj( + {**train_run.dict(), "workers": worker_infos_with_details} + ) + + # The train run can be unexpectedly terminated before the final run + # status was updated.
This could be due to errors outside of the training + # function (e.g., system failure or user interruption) that crashed the + # train controller. + # We need to detect this case and mark the train run as ABORTED. + actor = train_run_actors.get(train_run.controller_actor_id) + controller_actor_status = actor.get("state") if actor else None + if ( + controller_actor_status == ActorStatusEnum.DEAD + and train_run.run_status == RunStatusEnum.RUNNING + ): + train_run_with_details.run_status = RunStatusEnum.ABORTED + train_run_with_details.status_detail = ( + "Terminated due to system errors or killed by the user." + ) + + train_runs_with_details.append(train_run_with_details) + + return train_runs_with_details + + @staticmethod + def is_minimal_module(): + return False + + async def run(self, server): + if not self._job_info_client: + self._job_info_client = JobInfoStorageClient(self.gcs_aio_client) + + gcs_channel = self.aiogrpc_gcs_channel + self._gcs_actor_info_stub = gcs_service_pb2_grpc.ActorInfoGcsServiceStub( + gcs_channel + ) + + async def get_train_stats_actor(self): + """ + Gets the train stats actor and caches it as an instance variable. + """ + try: + from ray.train._internal.state.state_actor import get_state_actor + + if self._train_stats_actor is None: + self._train_stats_actor = get_state_actor() + + return self._train_stats_actor + except ImportError: + logger.exception( + "Train is not installed. Please run `pip install ray[train]` " + "when setting up Ray on your cluster." + ) + return None diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/usage_stats/__init__.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/usage_stats/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/usage_stats/__pycache__/__init__.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/usage_stats/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..68242715734f2a6f26a429baa57b2c29fdf0a43e Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/usage_stats/__pycache__/__init__.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/usage_stats/__pycache__/usage_stats_head.cpython-311.pyc b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/usage_stats/__pycache__/usage_stats_head.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..4460e2aa14c1f245aa071259d0b36e38b5896645 Binary files /dev/null and b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/usage_stats/__pycache__/usage_stats_head.cpython-311.pyc differ diff --git a/.venv/lib/python3.11/site-packages/ray/dashboard/modules/usage_stats/usage_stats_head.py b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/usage_stats/usage_stats_head.py new file mode 100644 index 0000000000000000000000000000000000000000..5fdc802d0ebae6cda361f7804c6b480ca7b88d5a --- /dev/null +++ b/.venv/lib/python3.11/site-packages/ray/dashboard/modules/usage_stats/usage_stats_head.py @@ -0,0 +1,214 @@ +import asyncio +import logging +import os +import random +from concurrent.futures import ThreadPoolExecutor + +import requests + +import ray +import ray._private.usage.usage_lib as ray_usage_lib +import ray.dashboard.utils as dashboard_utils +from ray._private.utils import get_or_create_event_loop +from 
+
+logger = logging.getLogger(__name__)
+
+
+class UsageStatsHead(dashboard_utils.DashboardHeadModule):
+    def __init__(self, config: dashboard_utils.DashboardHeadModuleConfig):
+        super().__init__(config)
+        self.usage_stats_enabled = ray_usage_lib.usage_stats_enabled()
+        self.usage_stats_prompt_enabled = ray_usage_lib.usage_stats_prompt_enabled()
+        self.cluster_config_to_report = None
+        self.client = ray_usage_lib.UsageReportClient()
+        # The total number of reports that succeeded.
+        self.total_success = 0
+        # The total number of reports that failed.
+        self.total_failed = 0
+        # The sequence number of the report. It increments whenever a new report is sent.
+        self.seq_no = 0
+
+        self._dashboard_url_base = f"http://{self.http_host}:{self.http_port}"
+        # We want to record stats for anyone who has run Ray with Grafana or
+        # Prometheus at any point in time during a Ray session.
+        self._grafana_ran_before = False
+        self._prometheus_ran_before = False
+
+    if ray._private.utils.check_dashboard_dependencies_installed():
+        import aiohttp
+
+        import ray.dashboard.optional_utils
+
+        routes = ray.dashboard.optional_utils.DashboardHeadRouteTable
+
+        @routes.get("/usage_stats_enabled")
+        async def get_usage_stats_enabled(self, req) -> aiohttp.web.Response:
+            return ray.dashboard.optional_utils.rest_response(
+                success=True,
+                message="Fetched usage stats enabled",
+                usage_stats_enabled=self.usage_stats_enabled,
+                usage_stats_prompt_enabled=self.usage_stats_prompt_enabled,
+            )
+
+        @routes.get("/cluster_id")
+        async def get_cluster_id(self, req) -> aiohttp.web.Response:
+            return ray.dashboard.optional_utils.rest_response(
+                success=True,
+                message="Fetched cluster id",
+                cluster_id=self.gcs_client.cluster_id.hex(),
+            )
+
+    def _check_grafana_running(self):
+        from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag
+
+        if self._grafana_ran_before:
+            return
+
+        grafana_running = False
+        try:
+            resp = requests.get(f"{self._dashboard_url_base}/api/grafana_health")
+            if resp.status_code == 200:
+                json = resp.json()
+                grafana_running = (
+                    json["result"] is True and json["data"]["grafanaHost"] != "DISABLED"
+                )
+        except Exception:
+            pass
+
+        record_extra_usage_tag(
+            TagKey.DASHBOARD_METRICS_GRAFANA_ENABLED,
+            str(grafana_running),
+        )
+
+        if grafana_running:
+            # Don't need to update the tag ever again.
+            self._grafana_ran_before = True
+
+    def _check_prometheus_running(self):
+        from ray._private.usage.usage_lib import TagKey, record_extra_usage_tag
+
+        if self._prometheus_ran_before:
+            return
+
+        prometheus_running = False
+        try:
+            resp = requests.get(f"{self._dashboard_url_base}/api/prometheus_health")
+            if resp.status_code == 200:
+                json = resp.json()
+                prometheus_running = json["result"] is True
+        except Exception:
+            pass
+
+        record_extra_usage_tag(
+            TagKey.DASHBOARD_METRICS_PROMETHEUS_ENABLED,
+            str(prometheus_running),
+        )
+
+        if prometheus_running:
+            # Don't need to update the tag ever again.
+            self._prometheus_ran_before = True
+
+    def _fetch_and_record_extra_usage_stats_data(self):
+        logger.debug("Recording dashboard metrics extra telemetry data...")
+        self._check_grafana_running()
+        self._check_prometheus_running()
+
+    def _report_usage_sync(self):
+        """
+        - Always write usage_stats.json regardless of report success/failure.
+        - If the report fails, the error message is written to usage_stats.json.
+        - If the file write fails, the error will only appear in dashboard.log;
+          usage_stats.json won't be written.
+        """
+        if not self.usage_stats_enabled:
+            return
+
+        try:
+            self._fetch_and_record_extra_usage_stats_data()
+
+            data = ray_usage_lib.generate_report_data(
+                self.cluster_config_to_report,
+                self.total_success,
+                self.total_failed,
+                self.seq_no,
+                self.gcs_address,
+                self.gcs_client.cluster_id.hex(),
+            )
+
+            error = None
+            try:
+                self.client.report_usage_data(
+                    ray_usage_lib._usage_stats_report_url(), data
+                )
+            except Exception as e:
+                logger.info(f"Usage report request failed: {e}")
+                error = str(e)
+                self.total_failed += 1
+            else:
+                self.total_success += 1
+            finally:
+                self.seq_no += 1
+
+            data = ray_usage_lib.generate_write_data(data, error)
+            self.client.write_usage_data(data, self.session_dir)
+        except Exception as e:
+            logger.exception(e)
+            logger.info(f"Usage report failed: {e}")
+
+    async def _report_usage_async(self):
+        if not self.usage_stats_enabled:
+            return
+
+        loop = get_or_create_event_loop()
+        with ThreadPoolExecutor(max_workers=1) as executor:
+            await loop.run_in_executor(executor, lambda: self._report_usage_sync())
+
+    def _report_disabled_usage_sync(self):
+        assert not self.usage_stats_enabled
+
+        try:
+            if ray_usage_lib.is_ray_init_cluster(self.gcs_client):
+                return
+
+            data = ray_usage_lib.generate_disabled_report_data()
+            self.client.report_usage_data(ray_usage_lib._usage_stats_report_url(), data)
+        except Exception as e:
+            logger.debug(f"Disabled usage report failed: {e}")
+
+    async def _report_disabled_usage_async(self):
+        assert not self.usage_stats_enabled
+
+        loop = get_or_create_event_loop()
+        with ThreadPoolExecutor(max_workers=1) as executor:
+            await loop.run_in_executor(
+                executor, lambda: self._report_disabled_usage_sync()
+            )
+
+    @async_loop_forever(ray_usage_lib._usage_stats_report_interval_s())
+    async def periodically_report_usage(self):
+        await self._report_usage_async()
+
+    async def run(self, server):
+        self.cluster_config_to_report = ray_usage_lib.get_cluster_config_to_report(
+            os.path.expanduser("~/ray_bootstrap_config.yaml")
+        )
+        if not self.usage_stats_enabled:
+            logger.info("Usage reporting is disabled.")
+            await self._report_disabled_usage_async()
+            return
+        else:
+            logger.info("Usage reporting is enabled.")
+            # Wait for 1 minute to send the first report
+            # so the autoscaler has a chance to set DEBUG_AUTOSCALING_STATUS.
+            await asyncio.sleep(min(60, ray_usage_lib._usage_stats_report_interval_s()))
+            await self._report_usage_async()
+            # Add a random offset before the second report to remove sample bias.
+            await asyncio.sleep(
+                random.randint(0, ray_usage_lib._usage_stats_report_interval_s())
+            )
+            await asyncio.gather(self.periodically_report_usage())
+
+    @staticmethod
+    def is_minimal_module():
+        return True