mumble-cleanup / src /cleanup /eval /latency.py
adikuma's picture
initial upload: cleanup code and 688-pair seed dataset
fd0b01f verified
Raw
History Blame Contribute Delete
3.15 kB
# cpu latency for the exported onnx, mirroring privacy-filter/eval/latency.py.
# RUN THIS LOCALLY on the target laptop. gpu timings are not informative
# because deployment is cpu-only via ort.
#
# two benchmarks:
# - fixed length sweep: synthesize prompts of N tokens, time N=16..512
# - realistic mix: use real test rows, time variable-length inputs
import statistics
import time
from pathlib import Path
from typing import Optional
import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer
def _session(onnx_path: Path, intra_op_threads: int) -> ort.InferenceSession:
    """Create a cpu-only onnxruntime session for the model at `onnx_path`.

    `intra_op_threads` sets the intra-op thread count. The inter-op thread
    count is fixed at 1 and the graph optimization level is ORT_ENABLE_ALL.
    """
    options = ort.SessionOptions()
    options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    options.inter_op_num_threads = 1
    options.intra_op_num_threads = intra_op_threads
    model_file = str(onnx_path)
    return ort.InferenceSession(
        model_file,
        options,
        providers=["CPUExecutionProvider"],
    )
def _percentiles(values: list[float]) -> dict:
if not values:
return {"p50_ms": 0.0, "p95_ms": 0.0, "p99_ms": 0.0, "mean_ms": 0.0}
arr = np.asarray(values)
return {
"p50_ms": float(np.percentile(arr, 50)),
"p95_ms": float(np.percentile(arr, 95)),
"p99_ms": float(np.percentile(arr, 99)),
"mean_ms": float(np.mean(arr)),
}
def _one_call_ms(session: ort.InferenceSession, input_ids: np.ndarray) -> float:
    """Run one inference on `input_ids` and return its duration in ms.

    The attention mask is all ones with the same shape and dtype as
    `input_ids`; it is built before the clock starts.
    """
    mask = np.ones_like(input_ids)
    started = time.perf_counter()
    session.run(None, {"input_ids": input_ids, "attention_mask": mask})
    elapsed_s = time.perf_counter() - started
    # perf_counter reports seconds; callers expect milliseconds.
    return elapsed_s * 1000.0
def benchmark_latency(
    onnx_path: Path,
    tokenizer,
    warmup: int = 50,
    measure: int = 500,
    intra_op_threads: int = 4,
    lengths: tuple = (16, 32, 64, 128, 256, 512),
) -> dict:
    """Fixed-length latency sweep over synthetic inputs.

    For each entry in `lengths`, builds one (1, length) int64 input, runs
    `warmup` untimed-for-reporting calls, then `measure` timed calls.

    Returns a dict mapping str(length) to the summary produced by
    `_percentiles` (p50/p95/p99/mean in milliseconds).
    """
    session = _session(onnx_path, intra_op_threads)
    pad = tokenizer.pad_token_id or 0
    # ask for content tokens only: with default settings encode() may prepend
    # a special token (bos/cls), and [0] would pick that instead of "the".
    # the id does not depend on the length, so resolve it once up front.
    content_ids = tokenizer.encode("the", add_special_tokens=False)
    filler = content_ids[0] if content_ids else pad
    results = {}
    for length in lengths:
        # synthesize a sequence of `length` tokens (alternating the fixed
        # content token and pad so the model has structure to attend to).
        ids = np.full((1, length), pad, dtype=np.int64)
        ids[0, ::2] = filler
        for _ in range(warmup):
            _one_call_ms(session, ids)
        samples = [_one_call_ms(session, ids) for _ in range(measure)]
        results[str(length)] = _percentiles(samples)
    return results
def benchmark_realistic(
    onnx_path: Path,
    tokenizer,
    texts: list[str],
    intra_op_threads: int = 4,
    warmup: int = 20,
) -> dict:
    """Latency over real, variable-length inputs.

    Each text is tokenized (truncated to 512 tokens) into a (1, n) int64
    array and timed once, after a warmup pass over the first `warmup` inputs.

    Returns the `_percentiles` summary of the timings plus:
      - "samples": number of timed inputs
      - "token_length_p50" / "token_length_p95": input length statistics
        (both 0 when `texts` is empty).
    """
    session = _session(onnx_path, intra_op_threads)
    sequences = [
        np.asarray(
            tokenizer.encode(t, max_length=512, truncation=True), dtype=np.int64
        ).reshape(1, -1)
        for t in texts
    ]
    # warmup replays the first `warmup` inputs (fewer if there are not that
    # many); max() keeps a negative warmup from slicing off the tail instead.
    for seq in sequences[: max(warmup, 0)]:
        _one_call_ms(session, seq)
    samples = [_one_call_ms(session, s) for s in sequences]
    lengths = [s.shape[1] for s in sequences]
    out = _percentiles(samples)
    out["samples"] = len(samples)
    if lengths:
        out["token_length_p50"] = int(statistics.median(lengths))
        out["token_length_p95"] = int(np.percentile(lengths, 95))
    else:
        # no inputs: statistics.median raises on empty data, so report zeros,
        # matching what _percentiles returns for an empty sample list.
        out["token_length_p50"] = 0
        out["token_length_p95"] = 0
    return out