# cpu latency for the exported onnx, mirroring privacy-filter/eval/latency.py.
# RUN THIS LOCALLY on the target laptop. gpu timings are not informative
# because deployment is cpu-only via ort.
#
# two benchmarks:
#   - fixed length sweep: synthesize prompts of N tokens, time N=16..512
#   - realistic mix: use real test rows, time variable-length inputs

import statistics
import time
from pathlib import Path
from typing import Optional

import numpy as np
import onnxruntime as ort
from transformers import AutoTokenizer


def _session(onnx_path: Path, intra_op_threads: int) -> ort.InferenceSession:
    """Build a cpu-only onnxruntime session for the model at `onnx_path`.

    `intra_op_threads` sets the intra-op thread count; the inter-op thread
    count is pinned to 1 and all graph optimizations are enabled.
    """
    options = ort.SessionOptions()
    options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL
    options.inter_op_num_threads = 1
    options.intra_op_num_threads = intra_op_threads
    return ort.InferenceSession(
        str(onnx_path),
        sess_options=options,
        providers=["CPUExecutionProvider"],
    )


def _percentiles(values: list[float]) -> dict:
    if not values:
        return {"p50_ms": 0.0, "p95_ms": 0.0, "p99_ms": 0.0, "mean_ms": 0.0}
    arr = np.asarray(values)
    return {
        "p50_ms": float(np.percentile(arr, 50)),
        "p95_ms": float(np.percentile(arr, 95)),
        "p99_ms": float(np.percentile(arr, 99)),
        "mean_ms": float(np.mean(arr)),
    }


def _one_call_ms(session: ort.InferenceSession, input_ids: np.ndarray) -> float:
    """Run one inference on `input_ids` and return the elapsed time in ms.

    The attention mask is all ones with the same shape/dtype as `input_ids`
    and is built before the timer starts; model outputs are discarded.
    """
    mask = np.ones_like(input_ids)
    started = time.perf_counter()
    session.run(None, {"input_ids": input_ids, "attention_mask": mask})
    elapsed_s = time.perf_counter() - started
    return elapsed_s * 1000.0


def benchmark_latency(
    onnx_path: Path,
    tokenizer,
    warmup: int = 50,
    measure: int = 500,
    intra_op_threads: int = 4,
    lengths: tuple = (16, 32, 64, 128, 256, 512),
) -> dict:
    """Fixed-length latency sweep over synthetic inputs.

    For each entry in `lengths`, builds one (1, length) int64 input, runs
    `warmup` untimed-for-reporting calls, then `measure` timed calls, and
    summarizes the timed calls with `_percentiles`.

    Returns a dict mapping `str(length)` to that summary dict.
    """
    session = _session(onnx_path, intra_op_threads)
    pad = tokenizer.pad_token_id or 0
    # hf tokenizers add special tokens by default, so encode("the")[0] would
    # usually be the leading special token ([CLS]/<s>) rather than the word
    # itself. disable them so the filler really is the id of "the".
    # loop-invariant, so resolved once up front.
    filler = tokenizer.encode("the", add_special_tokens=False)[0]
    results = {}
    for length in lengths:
        # synthesize a sequence of `length` tokens (alternating the fixed
        # ascii token id and pad so the model has structure to attend to).
        ids = np.full((1, length), pad, dtype=np.int64)
        ids[0, ::2] = filler
        for _ in range(warmup):
            _one_call_ms(session, ids)
        samples = [_one_call_ms(session, ids) for _ in range(measure)]
        results[str(length)] = _percentiles(samples)
    return results


def benchmark_realistic(
    onnx_path: Path,
    tokenizer,
    texts: list[str],
    intra_op_threads: int = 4,
    warmup: int = 20,
) -> dict:
    """Latency over real, variable-length inputs.

    Each text is tokenized (truncated to 512 tokens) into a (1, n) int64
    array. The first `warmup` sequences are run once without recording, then
    every sequence (including those) is timed once.

    Returns the `_percentiles` summary of the timings plus:
      - `samples`: number of timed calls
      - `token_length_p50` / `token_length_p95`: input length stats (ints)

    With empty `texts` the latency stats and both length stats are 0 rather
    than raising from the median of an empty list.
    """
    session = _session(onnx_path, intra_op_threads)
    sequences = [
        np.asarray(tokenizer.encode(t, max_length=512, truncation=True), dtype=np.int64).reshape(1, -1)
        for t in texts
    ]
    # warmup uses the first n inputs; clamp so a negative warmup means "none"
    # instead of slicing from the end.
    for seq in sequences[: max(warmup, 0)]:
        _one_call_ms(session, seq)
    samples = [_one_call_ms(session, s) for s in sequences]
    lengths = [s.shape[1] for s in sequences]
    out = _percentiles(samples)
    out["samples"] = len(samples)
    if lengths:
        out["token_length_p50"] = int(statistics.median(lengths))
        out["token_length_p95"] = int(np.percentile(lengths, 95))
    else:
        # mirror _percentiles: empty input reports zeros, not an exception.
        out["token_length_p50"] = 0
        out["token_length_p95"] = 0
    return out