File size: 4,961 Bytes
32a197f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""
KernelX Intelligence Layer — Latency Benchmark

Measures end-to-end inference latency of the quantized Strategist model
on the target CPU hardware. Reports mean, P50, P95, P99, and max latency.

Usage:
    python -m training.inference.benchmark_latency \
        --model training/models/strategist_merged/strategist-q4km.gguf \
        --samples 200
"""

import argparse
import json
import sys
import time
from pathlib import Path

import numpy as np

# Make the `training` package importable when this file is run directly:
# parents[2] is the directory two levels above this file's own directory
# (presumably the repository root — confirm against the project layout).
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from training.data.preprocess import FEATURE_NAMES, format_state, load_config
from training.inference.strategy_engine import build_inference_prompt, parse_output

# Project configuration, loaded once at import time. benchmark() reads
# CONFIG["model"]["target_inference_ms"] as the P95 pass/fail threshold.
CONFIG = load_config()


def _load_benchmark_records(test_data_path, n_samples):
    """Return at most ``n_samples`` records to build benchmark prompts from.

    When ``test_data_path`` is given, one JSON object is read per non-blank
    line of that file; otherwise ``n_samples`` synthetic records are generated.
    Each record is a dict with ``"state"``, ``"pid"`` and ``"cpu"`` keys.
    """
    if test_data_path:
        # Close the file deterministically instead of relying on garbage
        # collection, and decode as UTF-8 regardless of the platform locale.
        with open(test_data_path, encoding="utf-8") as handle:
            records = [json.loads(line) for line in handle if line.strip()]
    else:
        # Synthetic test data
        records = []
        for i in range(n_samples):
            state = [float(i % 16)]  # cpu
            state += [120.0, 120.0, 120.0]  # priorities
            state += [20.0 + i * 0.1, 28.0 + i * 0.01, 8.0 + i * 0.001]  # symlog'd
            state += [16.0, float(i % 50), float(5 + i % 30)]  # cpus, csw, wt_us
            records.append({"state": state, "pid": 1000 + i, "cpu": i % 16})

    return records[:n_samples]


def benchmark(
    model_path: str,
    test_data_path: str = None,
    n_samples: int = 200,
    n_threads: int = 4,
    temperature: float = 0.2,
    max_tokens: int = 64,
    warmup: int = 5,
):
    """Run latency benchmark on the quantized GGUF model.

    Loads the model with ``llama_cpp``, runs ``warmup`` untimed completions,
    then times one completion per prompt and prints a latency report with a
    PASS/FAIL verdict against ``CONFIG["model"]["target_inference_ms"]``.

    Args:
        model_path: Path to the GGUF model file.
        test_data_path: Optional JSONL file with one record per line; each
            record needs ``"state"``, ``"pid"`` and ``"cpu"`` keys. When
            omitted, synthetic records are generated.
        n_samples: Maximum number of prompts to benchmark.
        n_threads: Thread count passed to ``Llama``.
        temperature: Sampling temperature passed to every completion call.
        max_tokens: Completion token limit passed to every completion call
            (the CLI in ``main()`` defaults ``--max-tokens`` to 8 instead).
        warmup: Number of untimed completions run before measuring, cycling
            through the prompts.

    Returns:
        A dict with ``"mean_ms"``, ``"p95_ms"``, ``"p99_ms"`` and
        ``"format_ok_pct"``.

    Raises:
        ValueError: If there is nothing to benchmark (empty test file or
            ``n_samples`` <= 0).
    """
    from llama_cpp import Llama

    print(f"Loading model: {model_path}")
    llm = Llama(model_path=model_path, n_ctx=512, n_threads=n_threads, verbose=False)

    # Build test prompts
    records = _load_benchmark_records(test_data_path, n_samples)
    prompts = [
        build_inference_prompt(rec["state"], rec["pid"], rec["cpu"])
        for rec in records
    ]
    if not prompts:
        # Without this guard the warmup modulo and the statistics below fail
        # with an opaque ZeroDivisionError / NumPy error.
        raise ValueError(
            "No prompts to benchmark: test data is empty or n_samples <= 0"
        )

    # Warmup
    print(f"Warming up ({warmup} iterations) ...")
    for i in range(warmup):
        llm(prompts[i % len(prompts)], max_tokens=max_tokens, temperature=temperature)

    # Benchmark
    print(f"\nBenchmarking {len(prompts)} samples ...")
    latencies = []
    format_ok = 0
    token_counts = []

    for prompt in prompts:
        # Time only the completion call itself; parsing happens off the clock.
        start = time.perf_counter()
        output = llm(prompt, max_tokens=max_tokens, temperature=temperature)
        elapsed = time.perf_counter() - start
        latencies.append(elapsed)

        text = output["choices"][0]["text"]
        token_counts.append(output["usage"]["completion_tokens"])

        action_val = parse_output(text)
        # NOTE(review): parse_output's failure value is not visible here; the
        # None guard makes an unparseable reply count as a format miss rather
        # than raise TypeError — confirm against strategy_engine.
        if action_val is not None and -1.0 <= action_val <= 1.0:
            format_ok += 1

    latencies_ms = np.array(latencies) * 1000

    # Compute each statistic once and reuse it for both the report and the
    # returned summary.
    mean_ms = np.mean(latencies_ms)
    p95 = np.percentile(latencies_ms, 95)
    p99 = np.percentile(latencies_ms, 99)
    format_ok_pct = format_ok / len(prompts) * 100

    # Report
    target = CONFIG["model"]["target_inference_ms"]
    rule = "=" * 50
    print(f"\n{rule}")
    print(" KernelX Latency Benchmark")
    print(rule)
    print(f" Model:    {Path(model_path).name}")
    print(f" Threads:  {n_threads}")
    print(f" Samples:  {len(prompts)}")
    print(f" Target:   <{target}ms")
    print(rule)
    print(f" Mean:     {mean_ms:>8.1f} ms")
    print(f" Median:   {np.median(latencies_ms):>8.1f} ms")
    print(f" P95:      {p95:>8.1f} ms")
    print(f" P99:      {p99:>8.1f} ms")
    print(f" Max:      {np.max(latencies_ms):>8.1f} ms")
    print(f" Min:      {np.min(latencies_ms):>8.1f} ms")
    print(f" Std:      {np.std(latencies_ms):>8.1f} ms")
    print(rule)
    print(f" Tokens/s: {np.sum(token_counts) / np.sum(latencies):>8.1f}")
    print(f" Format OK:{format_ok}/{len(prompts)} ({format_ok_pct:.1f}%)")
    print(rule)

    if p95 <= target:
        print(f" VERDICT:  PASS (P95 {p95:.1f}ms <= {target}ms)")
    else:
        print(f" VERDICT:  FAIL (P95 {p95:.1f}ms > {target}ms)")

    return {
        "mean_ms": float(mean_ms),
        "p95_ms": float(p95),
        "p99_ms": float(p99),
        "format_ok_pct": format_ok_pct,
    }


def main():
    """Parse command-line options and run ``benchmark`` with them.

    ``--model`` is required; every other option falls back to the default
    listed below.
    """
    cli = argparse.ArgumentParser(description="Benchmark KernelX inference latency")
    cli.add_argument("--model", required=True, help="Path to GGUF model")
    cli.add_argument("--test-data", default=None, help="Test JSONL (optional)")

    # The remaining options are plain numeric knobs: (flag, type, default).
    # NOTE(review): --max-tokens defaults to 8 here, while benchmark()'s own
    # max_tokens default is 64 — confirm which value is intended.
    numeric_options = (
        ("--samples", int, 200),
        ("--threads", int, 4),
        ("--temperature", float, 0.2),
        ("--max-tokens", int, 8),
        ("--warmup", int, 5),
    )
    for flag, caster, fallback in numeric_options:
        cli.add_argument(flag, type=caster, default=fallback)

    opts = cli.parse_args()

    # Map the CLI namespace onto benchmark()'s keyword parameters.
    settings = {
        "model_path": opts.model,
        "test_data_path": opts.test_data,
        "n_samples": opts.samples,
        "n_threads": opts.threads,
        "temperature": opts.temperature,
        "max_tokens": opts.max_tokens,
        "warmup": opts.warmup,
    }
    benchmark(**settings)


# Run the CLI only when executed as a script or via `python -m`, not on import.
if __name__ == "__main__":
    main()