peninsula123's picture
feat(main): stream metrics fetch
828386c
Raw
History Blame Contribute Delete
2.21 kB
from dataclasses import dataclass
import httpx
@dataclass(frozen=True)
class RuntimeSnapshot:
prompt_tps: float | None
decode_tps: float | None
requests_processing: int | None
requests_deferred: int | None
active_slots: int
slot_context_tokens: tuple[int, ...]
slot_context_size: int | None
def parse_prometheus_value(text: str, name: str) -> float | None:
prefix = f"{name} "
for line in text.splitlines():
if line.startswith(prefix):
return float(line.removeprefix(prefix))
return None
def fetch_runtime_snapshot(
    client: httpx.Client,
    metrics_url: str,
    slots_url: str,
) -> RuntimeSnapshot:
    """Query the metrics and slots endpoints and build a RuntimeSnapshot.

    Two GET requests are issued through *client*: first *metrics_url*,
    then *slots_url*. Each response is status-checked with
    ``raise_for_status()`` right after it is received, so a failing metrics
    request raises before the slots request is sent.

    The metrics body is read as text and searched with
    ``parse_prometheus_value``. The slots body is decoded as JSON and is
    iterated as a sequence of mappings (each entry is accessed with
    ``.get`` and by key).

    Args:
        client: HTTP client used for both requests.
        metrics_url: URL of the Prometheus-style metrics endpoint.
        slots_url: URL of the JSON slots endpoint.

    Returns:
        A RuntimeSnapshot populated from the two responses. Metric-derived
        fields are None when the corresponding metric line is missing.

    Raises:
        Whatever ``raise_for_status()`` raises for an error response
        (``httpx.HTTPStatusError`` for an httpx client), plus any error
        from JSON decoding or numeric conversion of the payloads.
    """
    metrics_reply = client.get(metrics_url)
    metrics_reply.raise_for_status()
    slots_reply = client.get(slots_url)
    slots_reply.raise_for_status()

    exposition = metrics_reply.text
    slots = slots_reply.json()

    # Only entries whose flag is the literal True count as active; truthy
    # non-bool values are deliberately excluded by the identity check.
    busy_slots = []
    for entry in slots:
        if entry.get("is_processing") is True:
            busy_slots.append(entry)

    processing_raw = parse_prometheus_value(
        exposition, "llamacpp:requests_processing"
    )
    deferred_raw = parse_prometheus_value(
        exposition, "llamacpp:requests_deferred"
    )
    prompt_rate = parse_prometheus_value(
        exposition, "llamacpp:prompt_tokens_seconds"
    )
    decode_rate = parse_prometheus_value(
        exposition, "llamacpp:predicted_tokens_seconds"
    )

    # The parser yields floats; the request counters are exposed as ints.
    processing_count = None if processing_raw is None else int(processing_raw)
    deferred_count = None if deferred_raw is None else int(deferred_raw)

    # Active slots that do not report a prompt token count are skipped.
    prompt_token_counts = tuple(
        int(entry["n_prompt_tokens"])
        for entry in busy_slots
        if entry.get("n_prompt_tokens") is not None
    )

    # The context size is taken from the first listed slot only.
    context_size = None
    if slots:
        first_n_ctx = slots[0].get("n_ctx")
        if first_n_ctx is not None:
            context_size = int(first_n_ctx)

    return RuntimeSnapshot(
        prompt_tps=prompt_rate,
        decode_tps=decode_rate,
        requests_processing=processing_count,
        requests_deferred=deferred_count,
        active_slots=len(busy_slots),
        slot_context_tokens=prompt_token_counts,
        slot_context_size=context_size,
    )