Spaces:
Running
Running
| """Run both assistants over each benchmark file and emit a JSONL of replies. | |
| For every (model, dataset, sample) it records the assistant's text reply | |
| plus latency / tokens / refusal flag. Guardrails are run with both ON and | |
| OFF so the report can quantify their contribution. | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import json | |
| import sys | |
| import time | |
| from pathlib import Path | |
| from typing import Dict, Iterable | |
| ROOT = Path(__file__).resolve().parent.parent | |
| sys.path.insert(0, str(ROOT)) | |
| from dotenv import load_dotenv | |
| load_dotenv(ROOT / ".env") | |
| from app.assistants import LlamaAssistant, OpenAIAssistant # noqa: E402 | |
| from app.assistants.base import SYSTEM_PROMPT # noqa: E402 | |
| from app.guardrails import check_input, check_output # noqa: E402 | |
# Benchmark inputs are read from <ROOT>/eval/datasets; result files are
# written to <ROOT>/eval/results.
DATASET_DIR = ROOT / "eval" / "datasets"
RESULTS_DIR = ROOT / "eval" / "results"
# Created at import time (no error if it already exists) so that run() can
# open its output file inside it without further checks.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)
def _load(path: Path) -> list[dict]:
    """Read a JSONL benchmark file into a list of sample dicts.

    The result is a concrete list (not a lazy iterable) because callers
    slice it to apply a per-dataset sample limit.

    Args:
        path: Location of the ``.jsonl`` dataset file.

    Returns:
        One parsed object per non-blank line, in file order. If ``path``
        does not exist, a warning is printed and an empty list is returned.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    if not path.exists():
        print(f"[warn] missing {path.name} — run scripts/download_datasets.py first")
        return []
    with path.open(encoding="utf-8") as f:
        # Blank / whitespace-only lines are skipped rather than parsed.
        return [json.loads(line) for line in f if line.strip()]
def _one_shot(assistant, prompt: str, use_guardrails: bool) -> dict:
    """Send one prompt to ``assistant`` and summarise the outcome as a dict.

    With ``use_guardrails`` set, the prompt is first passed to
    ``check_input``; a disallowed prompt is answered with the guardrail's
    refusal message and the assistant is never called. Otherwise the
    assistant is called with ``SYSTEM_PROMPT`` and the prompt as a single
    user turn. With guardrails on, the reply text is then passed to
    ``check_output`` and replaced by its ``safe_text`` when disallowed.

    Args:
        assistant: Object exposing ``chat(system_prompt, messages)``.
        prompt: The user prompt to evaluate.
        use_guardrails: Whether to apply the input and output checks.

    Returns:
        A dict that always contains ``reply``, ``latency_ms``,
        ``tokens_in``, ``tokens_out``, ``refused`` and
        ``guardrail_blocked``. Input-blocked results additionally carry
        ``guardrail_category``, completed calls carry ``provider``, and
        failed calls carry ``error``.
    """
    if use_guardrails:
        input_verdict = check_input(prompt)
        if not input_verdict.allowed:
            # Blocked before reaching the model: no latency, no token counts.
            return {
                "reply": input_verdict.refusal_message,
                "latency_ms": 0,
                "tokens_in": None,
                "tokens_out": None,
                "refused": True,
                "guardrail_blocked": True,
                "guardrail_category": input_verdict.category,
            }

    started = time.perf_counter()
    try:
        messages = [{"role": "user", "content": prompt}]
        response = assistant.chat(SYSTEM_PROMPT, messages)
        final_text = response.text
        output_blocked = False
        if use_guardrails:
            output_verdict = check_output(final_text)
            if not output_verdict.allowed:
                output_blocked = True
                final_text = output_verdict.safe_text
        return {
            "reply": final_text,
            "latency_ms": response.latency_ms,
            "tokens_in": response.tokens_in,
            "tokens_out": response.tokens_out,
            "refused": False,
            "guardrail_blocked": output_blocked,
            "provider": response.provider,
        }
    except Exception as exc:
        # Any failure in the call or the output check is recorded on the row
        # instead of aborting the whole benchmark run.
        elapsed_ms = int((time.perf_counter() - started) * 1000)
        return {
            "reply": "",
            "latency_ms": elapsed_ms,
            "tokens_in": None,
            "tokens_out": None,
            "refused": False,
            "guardrail_blocked": False,
            "error": f"{type(exc).__name__}: {exc}",
        }
def run(models: Dict[str, object], datasets: list[str], use_guardrails: bool, limit: int | None) -> Path:
    """Evaluate every model on every sample and write one JSONL row per call.

    The output file is ``results-guarded.jsonl`` or ``results-raw.jsonl``
    under ``RESULTS_DIR``, depending on ``use_guardrails``; it is opened in
    write mode, so an existing file is overwritten.

    Args:
        models: Mapping of model name to assistant object.
        datasets: Dataset names; each is read from ``DATASET_DIR/<name>.jsonl``.
        use_guardrails: Forwarded to ``_one_shot`` and recorded on each row.
        limit: If not ``None``, only the first ``limit`` samples of each
            dataset are used.

    Returns:
        Path of the JSONL file that was written.
    """
    mode_tag = "guarded" if use_guardrails else "raw"
    out_path = RESULTS_DIR / f"results-{mode_tag}.jsonl"
    completed = 0
    with out_path.open("w", encoding="utf-8") as sink:
        for dataset_name in datasets:
            loaded = _load(DATASET_DIR / f"{dataset_name}.jsonl")
            selected = loaded if limit is None else loaded[:limit]
            for sample in selected:
                for model_name, assistant in models.items():
                    outcome = _one_shot(assistant, sample["prompt"], use_guardrails)
                    record = {
                        "model": model_name,
                        "dataset": dataset_name,
                        "id": sample["id"],
                        "category": sample.get("category"),
                        "prompt": sample["prompt"],
                        "reference": sample.get("reference"),
                        "use_guardrails": use_guardrails,
                    }
                    record.update(outcome)
                    sink.write(json.dumps(record, ensure_ascii=False) + "\n")
                    # Flush per row so partial results are on disk if the run
                    # is interrupted.
                    sink.flush()
                    completed += 1
                    refusal_note = " (refused)" if outcome.get("refused") else ""
                    print(f" [{completed}] {dataset_name}/{model_name}/{sample['id']}: {outcome.get('latency_ms')} ms"
                          + refusal_note)
    return out_path
def main() -> None:
    """Parse CLI options, build the requested assistants and run the benchmark.

    Options:
        --datasets: Dataset names to evaluate.
        --models: Which assistants to load; only ``openai`` and ``llama``
            are accepted.
        --limit: Maximum number of samples per dataset (debug aid).
        --mode: ``guarded``, ``raw``, or ``both`` (one pass with guardrails
            on, then one with them off).
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--datasets", nargs="+", default=["truthfulqa", "advbench", "bbq"])
    # Only these two names are handled below. Rejecting anything else up
    # front prevents a mistyped name from running the benchmark with no
    # models and overwriting the result files with empty ones.
    ap.add_argument("--models", nargs="+", choices=["openai", "llama"], default=["openai", "llama"])
    ap.add_argument("--limit", type=int, default=None, help="max samples per dataset (debug)")
    ap.add_argument("--mode", choices=["both", "guarded", "raw"], default="both")
    args = ap.parse_args()

    # Assistants are built once and shared by both passes; insertion order
    # (openai, then llama) fixes the per-sample model order in run().
    loaded = {}
    if "openai" in args.models:
        print("[init] OpenAI assistant (Groq fallback if configured)…")
        loaded["openai"] = OpenAIAssistant()
    if "llama" in args.models:
        print("[init] Llama assistant (downloading + loading model)…")
        loaded["llama"] = LlamaAssistant()

    if args.mode in ("both", "guarded"):
        print("=== run: guardrails ON ===")
        run(loaded, args.datasets, use_guardrails=True, limit=args.limit)
    if args.mode in ("both", "raw"):
        print("=== run: guardrails OFF ===")
        run(loaded, args.datasets, use_guardrails=False, limit=args.limit)
    print(f"\nDone. Results under {RESULTS_DIR}/")
# Run the benchmark only when this file is executed as a script, not when it
# is imported.
if __name__ == "__main__":
    main()