Training pipeline scripts

32a197f verified 28 days ago

4.96 kB

	"""
	KernelX Intelligence Layer — Latency Benchmark

	Measures end-to-end inference latency of the quantized Strategist model
	on the target CPU hardware. Reports mean, P50, P95, P99, and max latency.

	Usage:
	python -m training.inference.benchmark_latency \
	--model training/models/strategist_merged/strategist-q4km.gguf \
	--samples 200
	"""

	import argparse
	import json
	import sys
	import time
	from pathlib import Path

	import numpy as np

	# Prepend parents[2] of this file's resolved path (two levels above the
	# directory containing this file -- presumably the repository root) to
	# sys.path so the absolute `training.*` imports below can be resolved.
	sys.path.insert(0, str(Path(__file__).resolve().parents[2]))
	from training.data.preprocess import FEATURE_NAMES, format_state, load_config
	from training.inference.strategy_engine import build_inference_prompt, parse_output

	# Loaded once at import time; benchmark() reads
	# CONFIG["model"]["target_inference_ms"] as its pass/fail latency target.
	CONFIG = load_config()


	def _load_records(test_data_path, n_samples):
	    """Return the benchmark records, truncated with ``records[:n_samples]``.

	    When ``test_data_path`` is truthy, each non-blank line of that file is
	    parsed as one JSON record. Otherwise ``n_samples`` synthetic records are
	    generated, each with ``state``, ``pid`` and ``cpu`` keys.
	    """
	    if test_data_path:
	        # Context manager so the handle is closed even if a line fails to parse.
	        with open(test_data_path, encoding="utf-8") as handle:
	            records = [json.loads(line) for line in handle if line.strip()]
	    else:
	        # Synthetic test data
	        records = []
	        for i in range(n_samples):
	            state = [float(i % 16)]  # cpu
	            state += [120.0, 120.0, 120.0]  # priorities
	            state += [20.0 + i * 0.1, 28.0 + i * 0.01, 8.0 + i * 0.001]  # symlog'd
	            state += [16.0, float(i % 50), float(5 + i % 30)]  # cpus, csw, wt_us
	            records.append({"state": state, "pid": 1000 + i, "cpu": i % 16})

	    return records[:n_samples]


	def benchmark(
	    model_path: str,
	    test_data_path: str = None,
	    n_samples: int = 200,
	    n_threads: int = 4,
	    temperature: float = 0.2,
	    max_tokens: int = 64,
	    warmup: int = 5,
	):
	    """Run latency benchmark on the quantized GGUF model.

	    Builds one prompt per record, makes ``warmup`` untimed calls, then times
	    one model call per prompt and prints a latency report. The verdict
	    compares P95 latency against ``CONFIG["model"]["target_inference_ms"]``.

	    Args:
	        model_path: Path of the GGUF model handed to ``llama_cpp.Llama``.
	        test_data_path: Optional JSONL file; each record must provide
	            ``state``, ``pid`` and ``cpu``. Synthetic records are generated
	            when it is omitted.
	        n_samples: Upper bound on the number of prompts that are timed.
	        n_threads: Thread count passed to ``Llama``.
	        temperature: Sampling temperature used for every call.
	        max_tokens: Completion token limit used for every call.
	        warmup: Number of untimed calls made before measuring.

	    Returns:
	        Dict with ``mean_ms``, ``p95_ms``, ``p99_ms`` and ``format_ok_pct``.

	    Raises:
	        ValueError: If there is nothing to benchmark (no records after
	            truncation to ``n_samples``).
	    """
	    records = _load_records(test_data_path, n_samples)
	    if not records:
	        # Without this guard the warmup loop divides by len(prompts) == 0 and
	        # the statistics below operate on an empty array.
	        raise ValueError(
	            "No samples to benchmark: n_samples must be positive and the "
	            "test data must contain at least one record"
	        )

	    # Build test prompts
	    prompts = [
	        build_inference_prompt(rec["state"], rec["pid"], rec["cpu"])
	        for rec in records
	    ]

	    # Imported and loaded only after the inputs are validated, so bad
	    # arguments fail before the cost of loading the model is paid.
	    from llama_cpp import Llama

	    print(f"Loading model: {model_path}")
	    llm = Llama(model_path=model_path, n_ctx=512, n_threads=n_threads, verbose=False)

	    # Warmup
	    print(f"Warming up ({warmup} iterations) ...")
	    for i in range(warmup):
	        llm(prompts[i % len(prompts)], max_tokens=max_tokens, temperature=temperature)

	    # Benchmark
	    print(f"\nBenchmarking {len(prompts)} samples ...")
	    latencies = []
	    token_counts = []
	    format_ok = 0

	    for prompt in prompts:
	        start = time.perf_counter()
	        output = llm(prompt, max_tokens=max_tokens, temperature=temperature)
	        latencies.append(time.perf_counter() - start)

	        token_counts.append(output["usage"]["completion_tokens"])

	        action_val = parse_output(output["choices"][0]["text"])
	        # NOTE(review): parse_output's failure value is not visible in this
	        # file; a None result is counted as a format failure instead of
	        # raising TypeError in the comparison -- confirm against its contract.
	        if action_val is not None and -1.0 <= action_val <= 1.0:
	            format_ok += 1

	    latencies_ms = np.array(latencies) * 1000

	    # Each statistic is computed once and reused by the report and the result.
	    mean_ms = np.mean(latencies_ms)
	    p95 = np.percentile(latencies_ms, 95)
	    p99 = np.percentile(latencies_ms, 99)
	    format_ok_pct = format_ok / len(prompts) * 100

	    # Report
	    target = CONFIG["model"]["target_inference_ms"]
	    rule = "=" * 50
	    print(f"\n{rule}")
	    print(" KernelX Latency Benchmark")
	    print(rule)
	    print(f" Model: {Path(model_path).name}")
	    print(f" Threads: {n_threads}")
	    print(f" Samples: {len(prompts)}")
	    print(f" Target: <{target}ms")
	    print(rule)
	    print(f" Mean: {mean_ms:>8.1f} ms")
	    print(f" Median: {np.median(latencies_ms):>8.1f} ms")
	    print(f" P95: {p95:>8.1f} ms")
	    print(f" P99: {p99:>8.1f} ms")
	    print(f" Max: {np.max(latencies_ms):>8.1f} ms")
	    print(f" Min: {np.min(latencies_ms):>8.1f} ms")
	    print(f" Std: {np.std(latencies_ms):>8.1f} ms")
	    print(rule)
	    print(f" Tokens/s: {np.sum(token_counts) / np.sum(latencies):>8.1f}")
	    print(f" Format OK:{format_ok}/{len(prompts)} ({format_ok_pct:.1f}%)")
	    print(rule)

	    if p95 <= target:
	        print(f" VERDICT: PASS (P95 {p95:.1f}ms <= {target}ms)")
	    else:
	        print(f" VERDICT: FAIL (P95 {p95:.1f}ms > {target}ms)")

	    return {
	        "mean_ms": float(mean_ms),
	        "p95_ms": float(p95),
	        "p99_ms": float(p99),
	        "format_ok_pct": format_ok_pct,
	    }


	def main():
	    """Parse the command-line flags and run ``benchmark`` with them."""
	    cli = argparse.ArgumentParser(description="Benchmark KernelX inference latency")
	    cli.add_argument("--model", required=True, help="Path to GGUF model")
	    cli.add_argument("--test-data", default=None, help="Test JSONL (optional)")

	    # Remaining flags differ only in name, converter and default value.
	    numeric_options = (
	        ("--samples", int, 200),
	        ("--threads", int, 4),
	        ("--temperature", float, 0.2),
	        ("--max-tokens", int, 8),
	        ("--warmup", int, 5),
	    )
	    for flag, converter, default in numeric_options:
	        cli.add_argument(flag, type=converter, default=default)

	    opts = cli.parse_args()

	    # Map each parsed flag onto the benchmark() keyword it feeds.
	    call_kwargs = {
	        "model_path": opts.model,
	        "test_data_path": opts.test_data,
	        "n_samples": opts.samples,
	        "n_threads": opts.threads,
	        "temperature": opts.temperature,
	        "max_tokens": opts.max_tokens,
	        "warmup": opts.warmup,
	    }
	    benchmark(**call_kwargs)


	# Run the command-line entry point only when this file is executed as a
	# script or via `python -m`, not when it is imported.
	if __name__ == "__main__":
	    main()