File size: 4,961 Bytes

32a197f

"""
KernelX Intelligence Layer — Latency Benchmark

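Measures end-to-end inference latency of the quantized Strategist model
on the target CPU hardware. Reports mean, median (P50), P95, P99, max, min,
and standard deviation of latency, plus token throughput, the share of
well-formed outputs, and a PASS/FAIL verdict against the configured P95 target.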

Usage:
    python -m training.inference.benchmark_latency \
        --model training/models/strategist_merged/strategist-q4km.gguf \
        --samples 200
"""

import argparse
import json
import sys
import time
from pathlib import Path

import numpy as np

# Put the directory two levels above this file's folder at the front of
# sys.path so the absolute `training.*` imports below resolve even when the
# file is run directly rather than via `python -m` from the repository root.
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from training.data.preprocess import FEATURE_NAMES, format_state, load_config
from training.inference.strategy_engine import build_inference_prompt, parse_output

# Project configuration, loaded once at import time. benchmark() reads
# CONFIG["model"]["target_inference_ms"] as the P95 latency target.
CONFIG = load_config()


def _synthetic_records(n_samples: int) -> list:
    """Return ``n_samples`` deterministic synthetic records for benchmarking."""
    records = []
    for i in range(n_samples):
        state = [float(i % 16)]  # cpu
        state += [120.0, 120.0, 120.0]  # priorities
        state += [20.0 + i * 0.1, 28.0 + i * 0.01, 8.0 + i * 0.001]  # symlog'd
        state += [16.0, float(i % 50), float(5 + i % 30)]  # cpus, csw, wt_us
        records.append({"state": state, "pid": 1000 + i, "cpu": i % 16})
    return records


def _load_records(test_data_path: str, n_samples: int) -> list:
    """Read at most ``n_samples`` JSON records from a JSONL file, skipping blank lines."""
    records = []
    # The context manager guarantees the handle is closed even if a line
    # fails to parse; reading stops as soon as enough records are collected.
    with open(test_data_path, encoding="utf-8") as handle:
        for line in handle:
            if not line.strip():
                continue
            records.append(json.loads(line))
            if len(records) == n_samples:
                break
    return records


def benchmark(
    model_path: str,
    test_data_path: str = None,
    n_samples: int = 200,
    n_threads: int = 4,
    temperature: float = 0.2,
    max_tokens: int = 64,
    warmup: int = 5,
) -> dict:
    """Run latency benchmark on the quantized GGUF model.

    Prompts are built from ``test_data_path`` (a JSONL file whose records
    provide ``"state"``, ``"pid"`` and ``"cpu"``) or, when no path is given,
    from deterministic synthetic records. After ``warmup`` untimed calls,
    each prompt is timed once and a summary report is printed.

    Args:
        model_path: Path to the GGUF model file.
        test_data_path: Optional JSONL file with test records; ``None`` (or
            an empty string) selects synthetic data.
        n_samples: Maximum number of prompts to benchmark; must be >= 1.
        n_threads: Thread count passed to the model.
        temperature: Sampling temperature passed to every call.
        max_tokens: Generation cap passed to every call.
        warmup: Number of untimed warmup calls before measuring.

    Returns:
        Dict with ``mean_ms``, ``p95_ms``, ``p99_ms`` and ``format_ok_pct``.

    Raises:
        ValueError: If ``n_samples`` is less than 1 or the test data file
            contains no records.
    """
    if n_samples < 1:
        raise ValueError(f"n_samples must be >= 1, got {n_samples}")

    # Prepare all inputs before the (expensive) model load so that bad
    # arguments or unreadable test data fail fast.
    if test_data_path:
        records = _load_records(test_data_path, n_samples)
    else:
        records = _synthetic_records(n_samples)
    if not records:
        raise ValueError(f"No records found in test data file: {test_data_path}")

    prompts = [
        build_inference_prompt(rec["state"], rec["pid"], rec["cpu"])
        for rec in records
    ]

    # Imported lazily so the module can be imported without llama_cpp installed.
    from llama_cpp import Llama

    print(f"Loading model: {model_path}")
    llm = Llama(model_path=model_path, n_ctx=512, n_threads=n_threads, verbose=False)

    # Warmup: untimed calls, cycling through the prompts if warmup > len(prompts).
    print(f"Warming up ({warmup} iterations) ...")
    for i in range(warmup):
        llm(prompts[i % len(prompts)], max_tokens=max_tokens, temperature=temperature)

    # Benchmark
    print(f"\nBenchmarking {len(prompts)} samples ...")
    latencies = []
    format_ok = 0
    token_counts = []

    for prompt in prompts:
        # Only the model call is inside the timed region.
        start = time.perf_counter()
        output = llm(prompt, max_tokens=max_tokens, temperature=temperature)
        elapsed = time.perf_counter() - start
        latencies.append(elapsed)

        text = output["choices"][0]["text"]
        token_counts.append(output["usage"]["completion_tokens"])

        # An output counts as well-formed when the parsed action lies in [-1, 1].
        action_val = parse_output(text)
        if -1.0 <= action_val <= 1.0:
            format_ok += 1

    latencies_ms = np.array(latencies) * 1000

    # Compute each reused statistic once.
    mean_ms = float(np.mean(latencies_ms))
    p95, p99 = np.percentile(latencies_ms, [95, 99])
    format_ok_pct = format_ok / len(prompts) * 100

    # Report
    target = CONFIG["model"]["target_inference_ms"]
    rule = "=" * 50
    print(f"\n{rule}")
    print(" KernelX Latency Benchmark")
    print(rule)
    print(f" Model:    {Path(model_path).name}")
    print(f" Threads:  {n_threads}")
    print(f" Samples:  {len(prompts)}")
    print(f" Target:   <{target}ms")
    print(rule)
    print(f" Mean:     {mean_ms:>8.1f} ms")
    print(f" Median:   {np.median(latencies_ms):>8.1f} ms")
    print(f" P95:      {p95:>8.1f} ms")
    print(f" P99:      {p99:>8.1f} ms")
    print(f" Max:      {np.max(latencies_ms):>8.1f} ms")
    print(f" Min:      {np.min(latencies_ms):>8.1f} ms")
    print(f" Std:      {np.std(latencies_ms):>8.1f} ms")
    print(rule)
    print(f" Tokens/s: {np.sum(token_counts) / np.sum(latencies):>8.1f}")
    print(f" Format OK:{format_ok}/{len(prompts)} ({format_ok_pct:.1f}%)")
    print(rule)

    # The verdict is based on P95 against the configured target.
    if p95 <= target:
        print(f" VERDICT:  PASS (P95 {p95:.1f}ms <= {target}ms)")
    else:
        print(f" VERDICT:  FAIL (P95 {p95:.1f}ms > {target}ms)")

    return {
        "mean_ms": mean_ms,
        "p95_ms": float(p95),
        "p99_ms": float(p99),
        "format_ok_pct": format_ok_pct,
    }


def main():
    """Parse command-line options and hand them to ``benchmark``.

    ``--model`` is the only required option. Note that the CLI default for
    ``--max-tokens`` is 8, which differs from ``benchmark``'s own default
    of 64 for ``max_tokens``.
    """
    cli = argparse.ArgumentParser(description="Benchmark KernelX inference latency")
    cli.add_argument("--model", required=True, help="Path to GGUF model")
    cli.add_argument("--test-data", default=None, help="Test JSONL (optional)")

    # The remaining options share one shape: (flag, type, default).
    numeric_options = (
        ("--samples", int, 200),
        ("--threads", int, 4),
        ("--temperature", float, 0.2),
        ("--max-tokens", int, 8),
        ("--warmup", int, 5),
    )
    for flag, caster, fallback in numeric_options:
        cli.add_argument(flag, type=caster, default=fallback)

    opts = cli.parse_args()

    # Map parsed options onto benchmark()'s keyword parameters.
    call_kwargs = {
        "model_path": opts.model,
        "test_data_path": opts.test_data,
        "n_samples": opts.samples,
        "n_threads": opts.threads,
        "temperature": opts.temperature,
        "max_tokens": opts.max_tokens,
        "warmup": opts.warmup,
    }
    benchmark(**call_kwargs)


# Run the CLI only when this file is executed as a script (or via
# `python -m`), not when it is imported as a module.
if __name__ == "__main__":
    main()