# ai-assistants-eval/eval/run_eval.py
# sid-007
# Deploy AI Assistants Eval — OSS vs Frontier
# a9141f4
"""Run both assistants over each benchmark file and emit a JSONL of replies.
For every (model, dataset, sample) it records the assistant's text reply
plus latency / tokens / refusal flag. By default the whole sweep is run twice,
once with guardrails ON and once with guardrails OFF, so the report can
quantify their contribution.
"""
from __future__ import annotations
import argparse
import json
import sys
import time
from pathlib import Path
from typing import Dict, Iterable
# Parent of the directory that contains this file.
ROOT = Path(__file__).resolve().parent.parent
# Put ROOT first on sys.path so the `app.*` imports below resolve against it.
sys.path.insert(0, str(ROOT))
from dotenv import load_dotenv
# Load ROOT/.env before the `app` imports below.
# NOTE(review): presumably the assistants read API keys from the environment — confirm.
load_dotenv(ROOT / ".env")
from app.assistants import LlamaAssistant, OpenAIAssistant # noqa: E402
from app.assistants.base import SYSTEM_PROMPT # noqa: E402
from app.guardrails import check_input, check_output # noqa: E402
# Benchmark inputs are read from here as `<dataset>.jsonl` files.
DATASET_DIR = ROOT / "eval" / "datasets"
# Per-mode result files (`results-guarded.jsonl` / `results-raw.jsonl`) are written here.
RESULTS_DIR = ROOT / "eval" / "results"
# Created at import time so run() can open its output file without a further check.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
def _load(path: Path) -> list[dict]:
    """Read a JSONL dataset file into a list of parsed records.

    Blank and whitespace-only lines are skipped. A missing file is not an
    error: a warning is printed and an empty list is returned so the caller
    can carry on with the remaining datasets.

    Args:
        path: Location of the ``.jsonl`` file to read.

    Returns:
        One parsed JSON value per non-blank line, in file order. Always a
        real list, because callers slice the result.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Attempt the open directly instead of checking exists() first: the file
    # could disappear between the check and the open.
    try:
        handle = path.open(encoding="utf-8")
    except FileNotFoundError:
        print(f"[warn] missing {path.name} — run scripts/download_datasets.py first")
        return []
    with handle:
        return [json.loads(line) for line in handle if line.strip()]
def _one_shot(assistant, prompt: str, use_guardrails: bool) -> dict:
    """Send one prompt to ``assistant`` and summarise the outcome as a dict.

    When ``use_guardrails`` is true the prompt is first passed to
    ``check_input``; a rejected prompt never reaches the model and the
    verdict's refusal message is returned instead. The model's answer is then
    passed to ``check_output`` and replaced by the verdict's ``safe_text``
    when it is rejected.

    Args:
        assistant: Object exposing ``chat(system_prompt, messages)``.
        prompt: User message to send.
        use_guardrails: Whether to apply the input and output checks.

    Returns:
        A dict that always has ``reply``, ``latency_ms``, ``tokens_in``,
        ``tokens_out``, ``refused`` and ``guardrail_blocked``. It additionally
        has ``guardrail_category`` when the input was blocked, ``provider``
        when the model answered, or ``error`` when anything in the model /
        output-check step raised.
    """
    if use_guardrails:
        in_verdict = check_input(prompt)
        if not in_verdict.allowed:
            # Blocked before the model is called, so there is nothing to time or count.
            return dict(
                reply=in_verdict.refusal_message,
                latency_ms=0,
                tokens_in=None,
                tokens_out=None,
                refused=True,
                guardrail_blocked=True,
                guardrail_category=in_verdict.category,
            )

    started = time.perf_counter()
    try:
        conversation = [{"role": "user", "content": prompt}]
        response = assistant.chat(SYSTEM_PROMPT, conversation)
        answer = response.text
        was_blocked = False
        if use_guardrails:
            out_verdict = check_output(answer)
            was_blocked = not out_verdict.allowed
            if was_blocked:
                answer = out_verdict.safe_text
        outcome = {
            "reply": answer,
            "latency_ms": response.latency_ms,
            "tokens_in": response.tokens_in,
            "tokens_out": response.tokens_out,
            "refused": False,
            "guardrail_blocked": was_blocked,
            "provider": response.provider,
        }
    except Exception as exc:
        # Best effort: one failing sample is recorded as an error row rather
        # than aborting the whole sweep.
        elapsed_ms = int((time.perf_counter() - started) * 1000)
        return {
            "reply": "",
            "latency_ms": elapsed_ms,
            "tokens_in": None, "tokens_out": None,
            "refused": False, "guardrail_blocked": False,
            "error": f"{type(exc).__name__}: {exc}",
        }
    return outcome
def run(models: Dict[str, object], datasets: list[str], use_guardrails: bool, limit: int | None) -> Path:
    """Evaluate every model on every sample and stream the rows to a JSONL file.

    The output file is ``results-guarded.jsonl`` or ``results-raw.jsonl``
    under ``RESULTS_DIR`` depending on ``use_guardrails``; it is opened in
    write mode, so an existing file of the same name is overwritten. One row
    is written and flushed per (dataset, sample, model), and a progress line
    is printed for each.

    Args:
        models: Mapping of model name to assistant object.
        datasets: Dataset names; each is read from ``DATASET_DIR/<name>.jsonl``.
        use_guardrails: Forwarded to ``_one_shot`` and recorded in every row.
        limit: Keep only the first ``limit`` samples of each dataset, or all
            of them when ``None``.

    Returns:
        Path of the JSONL file that was written.
    """
    if use_guardrails:
        mode_tag = "guarded"
    else:
        mode_tag = "raw"
    out_path = RESULTS_DIR / f"results-{mode_tag}.jsonl"

    rows_written = 0
    with out_path.open("w", encoding="utf-8") as sink:
        for ds in datasets:
            samples = _load(DATASET_DIR / f"{ds}.jsonl")
            if limit is not None:
                samples = samples[:limit]
            for sample in samples:
                for model_name, assistant in models.items():
                    outcome = _one_shot(assistant, sample["prompt"], use_guardrails)
                    record = {
                        "model": model_name,
                        "dataset": ds,
                        "id": sample["id"],
                        "category": sample.get("category"),
                        "prompt": sample["prompt"],
                        "reference": sample.get("reference"),
                        "use_guardrails": use_guardrails,
                    }
                    # Outcome fields follow the sample metadata in each row.
                    record.update(outcome)
                    sink.write(json.dumps(record, ensure_ascii=False) + "\n")
                    # Flush per row so partial results survive an interrupted sweep.
                    sink.flush()
                    rows_written += 1
                    refused_note = " (refused)" if outcome.get("refused") else ""
                    print(
                        f" [{rows_written}] {ds}/{model_name}/{sample['id']}: "
                        f"{outcome.get('latency_ms')} ms{refused_note}"
                    )
    return out_path
def main(argv: list[str] | None = None) -> None:
    """Parse CLI options, build the requested assistants and run the sweep(s).

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is what the script entry point uses.

    Raises:
        SystemExit: With status 2 (argparse usage error) when ``--models``
            contains an unknown name or ``--limit`` is negative.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--datasets", nargs="+", default=["truthfulqa", "advbench", "bbq"])
    # Only the names handled below are accepted. Without `choices`, a
    # misspelled model was silently ignored and the sweep wrote empty files.
    ap.add_argument("--models", nargs="+", choices=["openai", "llama"], default=["openai", "llama"])
    ap.add_argument("--limit", type=int, default=None, help="max samples per dataset (debug)")
    ap.add_argument("--mode", choices=["both", "guarded", "raw"], default="both")
    args = ap.parse_args(argv)

    # The limit is applied as a slice (`samples[:limit]`); a negative value
    # would drop trailing samples rather than cap the count, so reject it
    # before any model is loaded.
    if args.limit is not None and args.limit < 0:
        ap.error("--limit must be >= 0")

    loaded = {}
    if "openai" in args.models:
        print("[init] OpenAI assistant (Groq fallback if configured)…")
        loaded["openai"] = OpenAIAssistant()
    if "llama" in args.models:
        print("[init] Llama assistant (downloading + loading model)…")
        loaded["llama"] = LlamaAssistant()

    if args.mode in ("both", "guarded"):
        print("=== run: guardrails ON ===")
        run(loaded, args.datasets, use_guardrails=True, limit=args.limit)
    if args.mode in ("both", "raw"):
        print("=== run: guardrails OFF ===")
        run(loaded, args.datasets, use_guardrails=False, limit=args.limit)
    print(f"\nDone. Results under {RESULTS_DIR}/")
# Script entry point: run the sweep only when executed directly, not on import.
if __name__ == "__main__":
    main()