# cpu latency benchmark for the exported onnx. RUN LOCALLY on the target
# laptop. gpu timings are not informative because deployment is cpu-only via
# the rust ort runtime. mirrors privacy-filter/scripts/05_benchmark.py.

import argparse
import json
from pathlib import Path

from transformers import AutoTokenizer

from cleanup.eval.latency import benchmark_latency, benchmark_realistic


def _resolve_model_path(run_dir: Path, which: str) -> Path:
    """Return the path of the exported onnx model inside ``run_dir``.

    ``which == "int8"`` selects ``onnx/int8/model.onnx``; any other value
    selects the fp32 export at ``onnx/model.onnx``.

    Raises:
        FileNotFoundError: if the selected model file does not exist.
    """
    if which == "int8":
        int8_path = run_dir / "onnx" / "int8" / "model.onnx"
        if not int8_path.exists():
            raise FileNotFoundError(f"no int8 onnx at {int8_path}; run scripts/04_export.py")
        return int8_path
    fp32_path = run_dir / "onnx" / "model.onnx"
    if not fp32_path.exists():
        raise FileNotFoundError(f"no fp32 onnx at {fp32_path}; run scripts/04_export.py")
    return fp32_path


def _load_realistic_texts(data_dir: Path, n: int) -> list[str]:
    """Return the ``"raw"`` field of up to ``n`` rows from ``data_dir/test.json``.

    Returns an empty list when the file does not exist, or when ``n`` is
    zero or negative.
    """
    test_path = Path(data_dir) / "test.json"
    if not test_path.exists():
        return []
    # explicit utf-8: the platform default encoding is not utf-8 everywhere,
    # and this script is meant to run on arbitrary target laptops.
    rows = json.loads(test_path.read_text(encoding="utf-8"))
    # clamp so a negative n yields no rows instead of slicing from the end.
    return [r["raw"] for r in rows[: max(n, 0)]]


def main() -> None:
    """Benchmark cpu latency of one exported onnx model and save the results.

    Parses the cli arguments, runs the fixed length sweep, runs the realistic
    mix when test rows are available, writes ``latency_benchmark.json`` into
    the run directory and prints a summary table.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--runs-dir", default="runs")
    parser.add_argument("--run-id", required=True)
    parser.add_argument("--data-dir", default="data/pairs")
    parser.add_argument("--model", choices=["fp32", "int8"], default="fp32")
    parser.add_argument("--threads", type=int, default=4)
    parser.add_argument("--warmup", type=int, default=50)
    parser.add_argument("--measure", type=int, default=500)
    parser.add_argument("--realistic-samples", type=int, default=500)
    args = parser.parse_args()

    run_dir = Path(args.runs_dir) / args.run_id
    model_path = _resolve_model_path(run_dir, args.model)
    print(f"[bench] {args.model} model: {model_path}")
    print(f"[bench] file size {model_path.stat().st_size / 1e6:.1f} MB")

    tokenizer = AutoTokenizer.from_pretrained(run_dir / "model", use_fast=True)

    print("[bench] fixed length sweep")
    sweep = benchmark_latency(
        onnx_path=model_path,
        tokenizer=tokenizer,
        warmup=args.warmup,
        measure=args.measure,
        intra_op_threads=args.threads,
    )

    # the realistic mix is optional: it is skipped (and stored as null in the
    # json) when no test rows could be loaded.
    realistic = None
    texts = _load_realistic_texts(Path(args.data_dir), args.realistic_samples)
    if texts:
        print(f"[bench] realistic mix on {len(texts)} real test rows")
        realistic = benchmark_realistic(
            onnx_path=model_path,
            tokenizer=tokenizer,
            texts=texts,
            intra_op_threads=args.threads,
        )

    out_path = run_dir / "latency_benchmark.json"
    out_path.write_text(
        json.dumps(
            {
                "model": args.model,
                "model_path": str(model_path),
                "model_size_bytes": model_path.stat().st_size,
                "intra_op_threads": args.threads,
                "results_by_length": sweep,
                "realistic_mix": realistic,
            },
            indent=2,
        ),
        encoding="utf-8",
    )
    print(f"[bench] wrote {out_path}")

    print()
    print("length | p50 ms | p95 ms | p99 ms | mean ms")
    for length, stats in sweep.items():
        print(
            f"{length:>6s} | {stats['p50_ms']:>6.2f} | {stats['p95_ms']:>6.2f} | "
            f"{stats['p99_ms']:>6.2f} | {stats['mean_ms']:>7.2f}"
        )
    if realistic:
        print()
        print(
            f"realistic mix ({realistic['samples']} rows, "
            f"p50 length {realistic['token_length_p50']}): "
            f"p50={realistic['p50_ms']:.2f}ms p95={realistic['p95_ms']:.2f}ms "
            f"p99={realistic['p99_ms']:.2f}ms"
        )


if __name__ == "__main__":
    main()