# cpu latency for the exported onnx, mirroring privacy-filter/eval/latency.py.
# RUN THIS LOCALLY on the target laptop. gpu timings are not informative
# because deployment is cpu-only via ort.
#
# two benchmarks:
#   - fixed length sweep: synthesize prompts of N tokens, time N=16..512
#   - realistic mix: use real test rows, time variable-length inputs

import statistics
import time
from pathlib import Path
from typing import Optional

import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer


def _session(onnx_path: Path, intra_op_threads: int) -> ort.InferenceSession:
    """Create a CPU-only onnxruntime session for the model at `onnx_path`.

    Args:
        onnx_path: location of the exported onnx model.
        intra_op_threads: value assigned to `intra_op_num_threads`.

    Returns:
        An inference session restricted to the CPUExecutionProvider, with
        `inter_op_num_threads` fixed at 1 and graph optimizations set to
        ORT_ENABLE_ALL.
    """
    opts = ort.SessionOptions()
    opts.intra_op_num_threads = intra_op_threads
    opts.inter_op_num_threads = 1
    opts.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    return ort.InferenceSession(
        str(onnx_path),
        opts,
        providers=["CPUExecutionProvider"],
    )


def _percentiles(values: list[float]) -> dict[str, float]:
    """Summarize latency samples as p50 / p95 / p99 / mean.

    Args:
        values: per-call latencies in milliseconds.

    Returns:
        Dict with keys `p50_ms`, `p95_ms`, `p99_ms`, `mean_ms`. Every entry is
        0.0 when `values` is empty, so callers never see NaN or an exception.
    """
    if not values:
        return {"p50_ms": 0.0, "p95_ms": 0.0, "p99_ms": 0.0, "mean_ms": 0.0}
    arr = np.asarray(values)
    return {
        "p50_ms": float(np.percentile(arr, 50)),
        "p95_ms": float(np.percentile(arr, 95)),
        "p99_ms": float(np.percentile(arr, 99)),
        "mean_ms": float(np.mean(arr)),
    }


def _one_call_ms(session: ort.InferenceSession, input_ids: np.ndarray) -> float:
    """Run one forward pass and return its wall-clock duration in ms.

    The all-ones attention mask is built before the timer starts, so only
    `session.run` is measured.
    """
    attn = np.ones_like(input_ids)
    t0 = time.perf_counter()
    session.run(
        None,
        {"input_ids": input_ids, "attention_mask": attn},
    )
    return (time.perf_counter() - t0) * 1000.0


def benchmark_latency(
    onnx_path: Path,
    tokenizer,
    warmup: int = 50,
    measure: int = 500,
    intra_op_threads: int = 4,
    lengths: tuple[int, ...] = (16, 32, 64, 128, 256, 512),
) -> dict:
    """Fixed-length sweep: time synthetic inputs of each length in `lengths`.

    Args:
        onnx_path: location of the exported onnx model.
        tokenizer: tokenizer exposing `pad_token_id` and `encode`.
        warmup: untimed calls issued per length before measuring.
        measure: timed calls per length.
        intra_op_threads: forwarded to the session options.
        lengths: sequence lengths (in tokens) to benchmark.

    Returns:
        Dict keyed by `str(length)`; each value is the `_percentiles` summary
        of the timed calls for that length.
    """
    session = _session(onnx_path, intra_op_threads)

    pad = tokenizer.pad_token_id
    if pad is None:
        pad = 0

    # Resolve the filler token once, outside the sweep. Special tokens are
    # disabled so that [0] is the id of "the" itself rather than a BOS/CLS
    # token that the tokenizer would otherwise prepend.
    filler_ids = tokenizer.encode("the", add_special_tokens=False)
    filler = filler_ids[0] if filler_ids else pad

    results = {}
    for length in lengths:
        # synthesize a sequence of `length` tokens (alternating pad and a fixed
        # ascii token id so the model has structure to attend to).
        ids = np.full((1, length), pad, dtype=np.int64)
        ids[0, ::2] = filler

        for _ in range(warmup):
            _one_call_ms(session, ids)

        samples = [_one_call_ms(session, ids) for _ in range(measure)]
        results[str(length)] = _percentiles(samples)
    return results


def benchmark_realistic(
    onnx_path: Path,
    tokenizer,
    texts: list[str],
    intra_op_threads: int = 4,
    warmup: int = 20,
) -> dict:
    """Realistic mix: time one call per text in `texts` at its natural length.

    Args:
        onnx_path: location of the exported onnx model.
        tokenizer: tokenizer whose `encode` accepts `max_length`/`truncation`.
        texts: raw input strings; each is truncated to at most 512 tokens.
        intra_op_threads: forwarded to the session options.
        warmup: number of leading inputs to run untimed before measuring.

    Returns:
        The `_percentiles` summary of per-text latencies, plus `samples`
        (number of timed calls) and `token_length_p50` / `token_length_p95`
        (token-count statistics of the inputs). All values are zero when
        `texts` is empty.
    """
    session = _session(onnx_path, intra_op_threads)
    sequences = [
        np.asarray(
            tokenizer.encode(t, max_length=512, truncation=True),
            dtype=np.int64,
        ).reshape(1, -1)
        for t in texts
    ]

    # warmup uses the first n inputs; they are timed again in the measured pass.
    for seq in sequences[: max(warmup, 0)]:
        _one_call_ms(session, seq)

    samples = [_one_call_ms(session, seq) for seq in sequences]
    token_counts = [seq.shape[1] for seq in sequences]

    out = _percentiles(samples)
    out["samples"] = len(samples)
    if token_counts:
        out["token_length_p50"] = int(statistics.median(token_counts))
        out["token_length_p95"] = int(np.percentile(token_counts, 95))
    else:
        # statistics.median raises on empty data; report zeros instead, in
        # line with what _percentiles does for an empty sample list.
        out["token_length_p50"] = 0
        out["token_length_p95"] = 0
    return out