File size: 2,103 Bytes
7b4b748
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
from __future__ import annotations

from dataclasses import dataclass


@dataclass(frozen=True)
class InferenceMetrics:
    """One row of inference measurements, as consumed by ``summarize_inference``.

    ``frozen=True`` makes instances read-only after construction. The class
    only stores values; it does not derive any of them itself.
    """

    # Latency of the sample; milliseconds per the field name.
    latency_ms: float
    # Presumably time-to-first-token in ms (TTFT); None when not available.
    ttft_ms: float | None
    # Presumably time-between-tokens in ms -- TODO confirm; None when not available.
    tbt_ms: float | None
    # Token counts for the sample's input and output.
    input_tokens: int
    output_tokens: int
    # Presumably the number of streamed chunks received -- verify against caller.
    stream_chunks: int
    # Optional throughput figure; defaults to None when the caller does not supply one.
    tokens_per_sec: float | None = None


def compute_tokens_per_sec(
    output_tokens: int,
    latency_ms: float,
    ttft_ms: float | None,
) -> float | None:
    """Return output-token throughput in tokens per second, rounded to 2 decimals.

    The divisor is the decode window: ``latency_ms - ttft_ms`` when ``ttft_ms``
    is given and satisfies ``0 <= ttft_ms < latency_ms``; otherwise the whole
    ``latency_ms`` is used.

    Args:
        output_tokens: Number of generated tokens.
        latency_ms: Total latency in milliseconds.
        ttft_ms: Time before generation output started, in milliseconds, or
            ``None`` when unknown.

    Returns:
        Tokens per second, or ``None`` when ``output_tokens`` or ``latency_ms``
        is zero or negative.
    """
    if output_tokens <= 0 or latency_ms <= 0:
        return None

    ttft_is_usable = ttft_ms is not None and 0 <= ttft_ms < latency_ms
    window_ms = latency_ms - ttft_ms if ttft_is_usable else latency_ms

    # Defensive fallback: never divide by a non-positive window.
    if window_ms <= 0:
        window_ms = latency_ms

    window_seconds = window_ms / 1000.0
    return round(output_tokens / window_seconds, 2)


def _percentile(values: list[float], pct: float) -> float:
    if not values:
        return 0.0
    ordered = sorted(values)
    index = max(0, min(len(ordered) - 1, int(round((len(ordered) - 1) * pct))))
    return ordered[index]


def summarize_inference(rows: list[InferenceMetrics]) -> dict[str, float | int | None]:
    """Condense a list of inference rows into a flat summary dictionary.

    ``ttft_ms``, ``tbt_ms`` and ``tokens_per_sec`` are taken from the last row
    only; the latency percentiles and token averages cover every row.

    Args:
        rows: Measurement rows, with the most recent one last.

    Returns:
        A dict with the keys ``samples``, ``ttft_ms``, ``tbt_ms``,
        ``tokens_per_sec``, ``latency_p50_ms``, ``latency_p95_ms``,
        ``avg_input_tokens`` and ``avg_output_tokens``. When ``rows`` is empty
        the count and numeric aggregates are zero and the last-row fields are
        ``None``.
    """

    def _two_places(value):
        # Optional measurements stay None instead of being rounded.
        return None if value is None else round(value, 2)

    # Start from the "no data" summary; this also fixes the key order.
    summary = {
        "samples": 0,
        "ttft_ms": None,
        "tbt_ms": None,
        "tokens_per_sec": None,
        "latency_p50_ms": 0.0,
        "latency_p95_ms": 0.0,
        "avg_input_tokens": 0.0,
        "avg_output_tokens": 0.0,
    }
    if not rows:
        return summary

    count = len(rows)
    newest = rows[-1]
    latency_values = [entry.latency_ms for entry in rows]
    input_total = sum(entry.input_tokens for entry in rows)
    output_total = sum(entry.output_tokens for entry in rows)

    summary["samples"] = count
    summary["ttft_ms"] = _two_places(newest.ttft_ms)
    summary["tbt_ms"] = _two_places(newest.tbt_ms)
    summary["tokens_per_sec"] = newest.tokens_per_sec
    summary["latency_p50_ms"] = round(_percentile(latency_values, 0.50), 2)
    summary["latency_p95_ms"] = round(_percentile(latency_values, 0.95), 2)
    summary["avg_input_tokens"] = round(input_total / count, 1)
    summary["avg_output_tokens"] = round(output_total / count, 1)
    return summary