File size: 2,212 Bytes
828386c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
from dataclasses import dataclass
import httpx


@dataclass(frozen=True)
class RuntimeSnapshot:
    prompt_tps: float | None
    decode_tps: float | None
    requests_processing: int | None
    requests_deferred: int | None
    active_slots: int
    slot_context_tokens: tuple[int, ...]
    slot_context_size: int | None


def parse_prometheus_value(text: str, name: str) -> float | None:
    prefix = f"{name} "
    for line in text.splitlines():
        if line.startswith(prefix):
            return float(line.removeprefix(prefix))
    return None

def fetch_runtime_snapshot(
    client: httpx.Client,
    metrics_url: str,
    slots_url: str,
) -> RuntimeSnapshot:
    """Query the metrics and slots endpoints and build a RuntimeSnapshot.

    Two GET requests are issued in order (metrics first, then slots), and
    ``raise_for_status()`` is called on each response right after it arrives.
    The metrics body is read as text and scanned with
    ``parse_prometheus_value``; the slots body is decoded as JSON and
    iterated as a sequence of mapping-like slot entries.

    Args:
        client: HTTP client used for both GET requests.
        metrics_url: URL whose response body is the metrics text.
        slots_url: URL whose response body is the JSON slot listing.

    Returns:
        A snapshot in which the counters and rates are ``None`` when the
        corresponding metric line is absent, ``active_slots`` counts the
        entries whose ``"is_processing"`` is exactly ``True``,
        ``slot_context_tokens`` holds ``"n_prompt_tokens"`` of those entries
        (skipping ones where it is missing or ``None``), and
        ``slot_context_size`` is ``"n_ctx"`` of the first entry, if any.

    Raises:
        Whatever ``raise_for_status()`` raises for an error response, plus
        any error from JSON decoding or numeric conversion.
    """

    def as_optional_int(value: float | None) -> int | None:
        # Truncate a parsed metric to int while letting "missing" through.
        return None if value is None else int(value)

    metrics_reply = client.get(metrics_url)
    metrics_reply.raise_for_status()

    slots_reply = client.get(slots_url)
    slots_reply.raise_for_status()

    exposition = metrics_reply.text
    slot_entries = slots_reply.json()

    # Only entries explicitly flagged as processing count as active.
    busy_entries = [
        entry for entry in slot_entries if entry.get("is_processing") is True
    ]

    processing_raw = parse_prometheus_value(
        exposition, "llamacpp:requests_processing"
    )
    deferred_raw = parse_prometheus_value(
        exposition, "llamacpp:requests_deferred"
    )
    prompt_rate = parse_prometheus_value(
        exposition, "llamacpp:prompt_tokens_seconds"
    )
    decode_rate = parse_prometheus_value(
        exposition, "llamacpp:predicted_tokens_seconds"
    )

    processing_count = as_optional_int(processing_raw)
    deferred_count = as_optional_int(deferred_raw)

    tokens_per_busy_slot = tuple(
        int(entry["n_prompt_tokens"])
        for entry in busy_entries
        if entry.get("n_prompt_tokens") is not None
    )

    # The context size is taken from the first listed entry only.
    context_size = None
    if slot_entries and slot_entries[0].get("n_ctx") is not None:
        context_size = int(slot_entries[0]["n_ctx"])

    return RuntimeSnapshot(
        prompt_tps=prompt_rate,
        decode_tps=decode_rate,
        requests_processing=processing_count,
        requests_deferred=deferred_count,
        active_slots=len(busy_entries),
        slot_context_tokens=tokens_per_busy_slot,
        slot_context_size=context_size,
    )