# Source: kernelx-strategist/training/inference/benchmark_latency.py
# Author: Rayugacodes
# Commit: 32a197f (verified) — "Training pipeline scripts"
"""
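KernelX Intelligence Layer — Latency Benchmark

Measures end-to-end inference latency of the quantized Strategist model
on the target CPU hardware. Reports mean, median (P50), P95, P99, max, min,
and standard deviation of latency, plus token throughput and the share of
outputs that parse to a valid action, and prints a PASS/FAIL verdict for
P95 against the configured latency target.

Usage:
    python -m training.inference.benchmark_latency \
        --model training/models/strategist_merged/strategist-q4km.gguf \
        --samples 200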
"""
import argparse
import json
import sys
import time
from pathlib import Path
import numpy as np
# Prepend the directory two levels above this file's folder (presumably the
# repository root — confirm against the checkout layout) to sys.path so the
# absolute ``training.*`` imports below can be resolved.
sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
from training.data.preprocess import FEATURE_NAMES, format_state, load_config
from training.inference.strategy_engine import build_inference_prompt, parse_output
# Project configuration, loaded once at import time. benchmark() reads
# CONFIG["model"]["target_inference_ms"] as its pass/fail latency target.
CONFIG = load_config()
def _load_records(test_data_path: str) -> list:
    """Read one JSON object per non-blank line from a JSONL file."""
    # The context manager closes the handle even if a line fails to parse.
    with open(test_data_path, encoding="utf-8") as handle:
        return [json.loads(line) for line in handle if line.strip()]


def _synthetic_records(n_samples: int) -> list:
    """Build ``n_samples`` deterministic synthetic records for prompt construction."""
    records = []
    for i in range(n_samples):
        state = [float(i % 16)]  # cpu
        state += [120.0, 120.0, 120.0]  # priorities
        state += [20.0 + i * 0.1, 28.0 + i * 0.01, 8.0 + i * 0.001]  # symlog'd
        state += [16.0, float(i % 50), float(5 + i % 30)]  # cpus, csw, wt_us
        records.append({"state": state, "pid": 1000 + i, "cpu": i % 16})
    return records


def benchmark(
    model_path: str,
    test_data_path: str = None,
    n_samples: int = 200,
    n_threads: int = 4,
    temperature: float = 0.2,
    max_tokens: int = 64,
    warmup: int = 5,
):
    """Run latency benchmark on the quantized GGUF model.

    Loads the model with llama-cpp-python, builds one prompt per record,
    runs ``warmup`` untimed completions, then times one completion per
    prompt and prints a latency report with a PASS/FAIL verdict.

    Args:
        model_path: Path to the GGUF model file.
        test_data_path: Optional JSONL file; each record must provide the
            ``"state"``, ``"pid"`` and ``"cpu"`` keys. When omitted,
            synthetic records are generated instead.
        n_samples: Maximum number of records to benchmark (and the number of
            synthetic records generated when no test file is given).
        n_threads: Thread count passed to ``Llama``.
        temperature: Sampling temperature passed to each completion call.
        max_tokens: Completion token limit passed to each completion call.
        warmup: Number of untimed completions run before measuring.

    Returns:
        dict with ``mean_ms``, ``p95_ms``, ``p99_ms`` (latencies in
        milliseconds) and ``format_ok_pct`` (percentage of outputs whose
        parsed value lies in [-1.0, 1.0]).

    Raises:
        ValueError: If there are no samples to benchmark (empty test file or
            ``n_samples`` of zero).
    """
    # Imported lazily so the module can be imported without llama-cpp-python.
    from llama_cpp import Llama

    print(f"Loading model: {model_path}")
    llm = Llama(model_path=model_path, n_ctx=512, n_threads=n_threads, verbose=False)

    # Build test prompts
    if test_data_path:
        records = _load_records(test_data_path)
    else:
        records = _synthetic_records(n_samples)
    records = records[:n_samples]

    prompts = [
        build_inference_prompt(rec["state"], rec["pid"], rec["cpu"])
        for rec in records
    ]
    if not prompts:
        # Fail early with a clear message; otherwise the warmup modulo and the
        # statistics below would fail on an empty sample set.
        raise ValueError(
            "No samples to benchmark: the test data is empty or n_samples is 0"
        )

    # Warmup (results discarded); cycles through the prompts if warmup > len(prompts).
    print(f"Warming up ({warmup} iterations) ...")
    for i in range(warmup):
        llm(prompts[i % len(prompts)], max_tokens=max_tokens, temperature=temperature)

    # Benchmark
    print(f"\nBenchmarking {len(prompts)} samples ...")
    latencies = []
    format_ok = 0
    token_counts = []
    for prompt in prompts:
        # Only the completion call itself is inside the timed window.
        start = time.perf_counter()
        output = llm(prompt, max_tokens=max_tokens, temperature=temperature)
        elapsed = time.perf_counter() - start
        latencies.append(elapsed)

        text = output["choices"][0]["text"]
        token_counts.append(output["usage"]["completion_tokens"])
        action_val = parse_output(text)
        if -1.0 <= action_val <= 1.0:
            format_ok += 1

    latencies_ms = np.array(latencies) * 1000
    mean_ms = np.mean(latencies_ms)
    p95 = np.percentile(latencies_ms, 95)
    p99 = np.percentile(latencies_ms, 99)
    format_ok_pct = format_ok / len(prompts) * 100

    # Report
    target = CONFIG["model"]["target_inference_ms"]
    print(f"\n{'='*50}")
    print(" KernelX Latency Benchmark")
    print(f"{'='*50}")
    print(f" Model: {Path(model_path).name}")
    print(f" Threads: {n_threads}")
    print(f" Samples: {len(prompts)}")
    print(f" Target: <{target}ms")
    print(f"{'='*50}")
    print(f" Mean: {mean_ms:>8.1f} ms")
    print(f" Median: {np.median(latencies_ms):>8.1f} ms")
    print(f" P95: {p95:>8.1f} ms")
    print(f" P99: {p99:>8.1f} ms")
    print(f" Max: {np.max(latencies_ms):>8.1f} ms")
    print(f" Min: {np.min(latencies_ms):>8.1f} ms")
    print(f" Std: {np.std(latencies_ms):>8.1f} ms")
    print(f"{'='*50}")
    # Throughput counts completion tokens over total timed seconds.
    print(f" Tokens/s: {np.sum(token_counts) / np.sum(latencies):>8.1f}")
    print(f" Format OK:{format_ok}/{len(prompts)} ({format_ok_pct:.1f}%)")
    print(f"{'='*50}")

    # The verdict is decided on P95, not the mean.
    if p95 <= target:
        print(f" VERDICT: PASS (P95 {p95:.1f}ms <= {target}ms)")
    else:
        print(f" VERDICT: FAIL (P95 {p95:.1f}ms > {target}ms)")

    return {
        "mean_ms": float(mean_ms),
        "p95_ms": float(p95),
        "p99_ms": float(p99),
        "format_ok_pct": format_ok_pct,
    }
def main():
    """Parse the command-line options and hand them to ``benchmark``."""
    cli = argparse.ArgumentParser(description="Benchmark KernelX inference latency")
    cli.add_argument("--model", required=True, help="Path to GGUF model")
    cli.add_argument("--test-data", default=None, help="Test JSONL (optional)")

    # Numeric options share the same shape: (flag, converter, default).
    numeric_options = (
        ("--samples", int, 200),
        ("--threads", int, 4),
        ("--temperature", float, 0.2),
        ("--max-tokens", int, 8),
        ("--warmup", int, 5),
    )
    for flag, converter, fallback in numeric_options:
        cli.add_argument(flag, type=converter, default=fallback)

    opts = cli.parse_args()

    # Map CLI attribute names onto benchmark()'s keyword parameters.
    settings = {
        "model_path": opts.model,
        "test_data_path": opts.test_data,
        "n_samples": opts.samples,
        "n_threads": opts.threads,
        "temperature": opts.temperature,
        "max_tokens": opts.max_tokens,
        "warmup": opts.warmup,
    }
    benchmark(**settings)
# Run the CLI only when this module is executed as a script (including via
# ``python -m``); importing it does not start a benchmark.
if __name__ == "__main__":
    main()