| """ |
| KernelX Intelligence Layer — Latency Benchmark |
| |
| Measures end-to-end inference latency of the quantized Strategist model |
| on the target CPU hardware. Reports mean, P50, P95, P99, and max latency. |
| |
| Usage: |
| python -m training.inference.benchmark_latency \ |
| --model training/models/strategist_merged/strategist-q4km.gguf \ |
| --samples 200 |
| """ |
|
|
| import argparse |
| import json |
| import sys |
| import time |
| from pathlib import Path |
|
|
| import numpy as np |
|
|
| sys.path.insert(0, str(Path(__file__).resolve().parents[2])) |
| from training.data.preprocess import FEATURE_NAMES, format_state, load_config |
| from training.inference.strategy_engine import build_inference_prompt, parse_output |
|
|
# Project configuration, loaded once when this module is imported.
# benchmark() reads CONFIG["model"]["target_inference_ms"] as the latency target.
CONFIG = load_config()
|
|
|
|
def _load_benchmark_records(test_data_path, n_samples):
    """Return at most ``n_samples`` records, read from JSONL or synthesized."""
    if test_data_path:
        # Context manager guarantees the file handle is closed; blank lines
        # in the JSONL file are skipped.
        with open(test_data_path, encoding="utf-8") as handle:
            records = [json.loads(line) for line in handle if line.strip()]
    else:
        # No test data supplied: synthesize deterministic 10-element state
        # vectors that vary with the sample index.
        records = []
        for i in range(n_samples):
            state = [float(i % 16)]
            state += [120.0, 120.0, 120.0]
            state += [20.0 + i * 0.1, 28.0 + i * 0.01, 8.0 + i * 0.001]
            state += [16.0, float(i % 50), float(5 + i % 30)]
            records.append({"state": state, "pid": 1000 + i, "cpu": i % 16})
    return records[:n_samples]


def benchmark(
    model_path: str,
    test_data_path: str = None,
    n_samples: int = 200,
    n_threads: int = 4,
    temperature: float = 0.2,
    max_tokens: int = 64,
    warmup: int = 5,
):
    """Run latency benchmark on the quantized GGUF model.

    Builds one prompt per record, runs ``warmup`` untimed completions, then
    times one completion per prompt and prints a latency report. The printed
    verdict compares the P95 latency against
    ``CONFIG["model"]["target_inference_ms"]``.

    Args:
        model_path: Path to the GGUF model file passed to ``llama_cpp.Llama``.
        test_data_path: Optional JSONL file with one JSON object per line;
            each object must provide ``"state"``, ``"pid"`` and ``"cpu"``.
            When falsy, synthetic records are generated instead.
        n_samples: Maximum number of records to benchmark (must be >= 1).
        n_threads: Thread count passed to ``llama_cpp.Llama``.
        temperature: Sampling temperature passed to each completion call.
        max_tokens: Completion token limit passed to each completion call.
        warmup: Number of untimed completions run before measuring.

    Returns:
        Dict with ``mean_ms``, ``p95_ms``, ``p99_ms`` and ``format_ok_pct``
        (percentage of outputs whose parsed value lies in [-1.0, 1.0]).

    Raises:
        ValueError: If ``n_samples`` is less than 1, or the test data file
            contains no records.
    """
    # Validate and prepare the inputs first, so bad arguments fail fast
    # instead of after the (expensive) model load.
    if n_samples < 1:
        raise ValueError(f"n_samples must be >= 1, got {n_samples}")

    records = _load_benchmark_records(test_data_path, n_samples)
    if not records:
        raise ValueError(f"No benchmark records found in {test_data_path}")

    prompts = [
        build_inference_prompt(rec["state"], rec["pid"], rec["cpu"])
        for rec in records
    ]

    # Kept as a function-scope import (as in the original layout), placed
    # after input validation.
    from llama_cpp import Llama

    print(f"Loading model: {model_path}")
    llm = Llama(model_path=model_path, n_ctx=512, n_threads=n_threads, verbose=False)

    # Untimed warm-up runs; cycle through the prompts if warmup exceeds them.
    print(f"Warming up ({warmup} iterations) ...")
    for i in range(warmup):
        llm(prompts[i % len(prompts)], max_tokens=max_tokens, temperature=temperature)

    print(f"\nBenchmarking {len(prompts)} samples ...")
    latencies = []
    format_ok = 0
    token_counts = []

    for prompt in prompts:
        # Only the completion call itself is inside the timed region.
        start = time.perf_counter()
        output = llm(prompt, max_tokens=max_tokens, temperature=temperature)
        elapsed = time.perf_counter() - start
        latencies.append(elapsed)

        text = output["choices"][0]["text"]
        token_counts.append(output["usage"]["completion_tokens"])

        action_val = parse_output(text)
        if -1.0 <= action_val <= 1.0:
            format_ok += 1

    latencies_ms = np.array(latencies) * 1000

    # Compute each statistic once and reuse it for the report and the result.
    mean_ms = np.mean(latencies_ms)
    p95 = np.percentile(latencies_ms, 95)
    p99 = np.percentile(latencies_ms, 99)
    format_ok_pct = format_ok / len(prompts) * 100

    target = CONFIG["model"]["target_inference_ms"]
    rule = "=" * 50
    print(f"\n{rule}")
    print(" KernelX Latency Benchmark")
    print(rule)
    print(f" Model: {Path(model_path).name}")
    print(f" Threads: {n_threads}")
    print(f" Samples: {len(prompts)}")
    print(f" Target: <{target}ms")
    print(rule)
    print(f" Mean: {mean_ms:>8.1f} ms")
    print(f" Median: {np.median(latencies_ms):>8.1f} ms")
    print(f" P95: {p95:>8.1f} ms")
    print(f" P99: {p99:>8.1f} ms")
    print(f" Max: {np.max(latencies_ms):>8.1f} ms")
    print(f" Min: {np.min(latencies_ms):>8.1f} ms")
    print(f" Std: {np.std(latencies_ms):>8.1f} ms")
    print(rule)
    print(f" Tokens/s: {np.sum(token_counts) / np.sum(latencies):>8.1f}")
    print(f" Format OK:{format_ok}/{len(prompts)} ({format_ok_pct:.1f}%)")
    print(rule)

    if p95 <= target:
        print(f" VERDICT: PASS (P95 {p95:.1f}ms <= {target}ms)")
    else:
        print(f" VERDICT: FAIL (P95 {p95:.1f}ms > {target}ms)")

    return {
        "mean_ms": float(mean_ms),
        "p95_ms": float(p95),
        "p99_ms": float(p99),
        "format_ok_pct": format_ok_pct,
    }
|
|
|
|
def main():
    """Parse the command-line options and hand them to ``benchmark``."""
    cli = argparse.ArgumentParser(description="Benchmark KernelX inference latency")
    cli.add_argument("--model", required=True, help="Path to GGUF model")
    cli.add_argument("--test-data", default=None, help="Test JSONL (optional)")

    # Typed tuning knobs, registered in a fixed order: (flag, type, default).
    # NOTE(review): --max-tokens defaults to 8 here while benchmark() defaults
    # to 64 -- confirm which value is intended.
    tuning_options = (
        ("--samples", int, 200),
        ("--threads", int, 4),
        ("--temperature", float, 0.2),
        ("--max-tokens", int, 8),
        ("--warmup", int, 5),
    )
    for flag, value_type, fallback in tuning_options:
        cli.add_argument(flag, type=value_type, default=fallback)

    opts = cli.parse_args()

    run_kwargs = {
        "model_path": opts.model,
        "test_data_path": opts.test_data,
        "n_samples": opts.samples,
        "n_threads": opts.threads,
        "temperature": opts.temperature,
        "max_tokens": opts.max_tokens,
        "warmup": opts.warmup,
    }
    benchmark(**run_kwargs)
|
|
|
|
# Run the CLI only when this file is executed as a script (or via
# ``python -m``), not when it is imported as a module.
if __name__ == "__main__":
    main()
|
|