Spaces:

sid-007
/

ai-assistants-eval

Sleeping

File size: 5,042 Bytes

a9141f4

"""Run both assistants over each benchmark file and emit a JSONL of replies.

For every (model, dataset, sample) it records the assistant's text reply
plus latency / tokens / refusal flag. By default the whole benchmark is run
twice, with guardrails ON and then OFF (one JSONL file per mode), so the
report can quantify their contribution.
"""
from __future__ import annotations

import argparse
import json
import sys
import time
from pathlib import Path
from typing import Dict, Iterable

# Repository root: two directory levels above this file's resolved location.
ROOT = Path(__file__).resolve().parent.parent
# Put the root first on sys.path before the `app.*` imports below
# (presumably so they resolve when this file is executed directly as a script).
sys.path.insert(0, str(ROOT))

# Load ROOT/.env before the assistants are imported.
# NOTE(review): presumably the assistants read API credentials from the
# environment — confirm against app.assistants.
from dotenv import load_dotenv
load_dotenv(ROOT / ".env")

from app.assistants import LlamaAssistant, OpenAIAssistant  # noqa: E402
from app.assistants.base import SYSTEM_PROMPT  # noqa: E402
from app.guardrails import check_input, check_output  # noqa: E402

# Benchmark inputs are read from eval/datasets; result files go to eval/results.
DATASET_DIR = ROOT / "eval" / "datasets"
RESULTS_DIR = ROOT / "eval" / "results"
# Import-time side effect: make sure the results directory exists.
RESULTS_DIR.mkdir(parents=True, exist_ok=True)


def _load(path: Path) -> list[dict]:
    """Read a JSONL file and return its records as a list.

    Blank (whitespace-only) lines are skipped. If *path* does not exist a
    warning is printed and an empty list is returned, so the caller simply
    has nothing to iterate for that dataset.

    Args:
        path: Location of the ``.jsonl`` file to read (decoded as UTF-8).

    Returns:
        One parsed JSON value per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON. The
            message is prefixed with ``<file name>:<line number>``.
    """
    if not path.exists():
        print(f"[warn] missing {path.name} — run scripts/download_datasets.py first")
        return []

    records: list[dict] = []
    with path.open(encoding="utf-8") as f:
        for lineno, line in enumerate(f, start=1):
            if not line.strip():
                continue
            try:
                records.append(json.loads(line))
            except json.JSONDecodeError as err:
                # Each line is parsed on its own, so the decoder always
                # reports "line 1". Re-raise the same exception type with the
                # file name and the real line number so the bad record can be
                # located in a large dataset.
                raise json.JSONDecodeError(
                    f"{path.name}:{lineno}: {err.msg}", err.doc, err.pos
                ) from err
    return records


def _one_shot(assistant, prompt: str, use_guardrails: bool) -> dict:
    """Send one prompt to one assistant and summarise the outcome as a dict.

    With guardrails enabled the prompt is screened first; a rejected prompt
    never reaches the assistant and yields a refusal row with zero latency.
    Otherwise the assistant is called with the shared system prompt, and
    (again only with guardrails enabled) its reply is screened and replaced
    by the guardrail's safe text when disallowed.

    Any exception raised while calling the assistant or screening its reply
    is captured in the row's ``error`` field instead of propagating, along
    with the wall-clock time spent before the failure.
    """
    if use_guardrails:
        input_verdict = check_input(prompt)
        if not input_verdict.allowed:
            # Blocked before any model call: no latency or token usage to report.
            return {
                "reply": input_verdict.refusal_message,
                "latency_ms": 0,
                "tokens_in": None,
                "tokens_out": None,
                "refused": True,
                "guardrail_blocked": True,
                "guardrail_category": input_verdict.category,
            }

    started = time.perf_counter()
    try:
        messages = [{"role": "user", "content": prompt}]
        response = assistant.chat(SYSTEM_PROMPT, messages)
        final_text = response.text
        output_blocked = False
        if use_guardrails:
            output_verdict = check_output(final_text)
            output_blocked = not output_verdict.allowed
            if output_blocked:
                final_text = output_verdict.safe_text
        return {
            "reply": final_text,
            "latency_ms": response.latency_ms,
            "tokens_in": response.tokens_in,
            "tokens_out": response.tokens_out,
            "refused": False,
            "guardrail_blocked": output_blocked,
            "provider": response.provider,
        }
    except Exception as exc:
        # Record the failure as a row so one bad sample does not abort the run.
        elapsed_ms = int((time.perf_counter() - started) * 1000)
        return {
            "reply": "",
            "latency_ms": elapsed_ms,
            "tokens_in": None,
            "tokens_out": None,
            "refused": False,
            "guardrail_blocked": False,
            "error": f"{type(exc).__name__}: {exc}",
        }


def run(models: Dict[str, object], datasets: list[str], use_guardrails: bool, limit: int | None) -> Path:
    """Evaluate every model on every sample and stream the rows to a JSONL file.

    The output file is ``results-guarded.jsonl`` or ``results-raw.jsonl``
    under ``RESULTS_DIR`` depending on *use_guardrails*; it is opened in
    write mode, so an existing file of that name is overwritten. Each dataset
    is read from ``DATASET_DIR/<name>.jsonl`` and, when *limit* is not None,
    cut down with ``[:limit]``. One row is written (and flushed) per
    (sample, model) pair, and a progress line is printed for each.

    Returns:
        Path of the JSONL file that was written.
    """
    if use_guardrails:
        tag = "guarded"
    else:
        tag = "raw"
    destination = RESULTS_DIR / f"results-{tag}.jsonl"

    written = 0
    with destination.open("w", encoding="utf-8") as sink:
        for dataset_name in datasets:
            records = _load(DATASET_DIR / f"{dataset_name}.jsonl")
            if limit is not None:
                records = records[:limit]
            for record in records:
                for model_name, assistant in models.items():
                    outcome = _one_shot(assistant, record["prompt"], use_guardrails)
                    # Sample metadata first, then the per-call fields on top.
                    row = {
                        "model": model_name,
                        "dataset": dataset_name,
                        "id": record["id"],
                        "category": record.get("category"),
                        "prompt": record["prompt"],
                        "reference": record.get("reference"),
                        "use_guardrails": use_guardrails,
                    }
                    row.update(outcome)
                    serialized = json.dumps(row, ensure_ascii=False)
                    sink.write(serialized + "\n")
                    # Flush per row so partial results are on disk immediately.
                    sink.flush()
                    written += 1
                    refusal_note = " (refused)" if outcome.get("refused") else ""
                    print(
                        f"  [{written}] {dataset_name}/{model_name}/{record['id']}: "
                        f"{outcome.get('latency_ms')} ms{refusal_note}"
                    )
    return destination


def main():
    """Command-line entry point: parse options, load assistants, run the benchmark.

    Options:
        --datasets  Dataset names; each is read from ``<name>.jsonl`` in the
                    dataset directory.
        --models    Which assistants to evaluate (``openai`` and/or ``llama``).
        --limit     Optional non-negative cap on samples per dataset.
        --mode      ``guarded``, ``raw`` or ``both`` (guarded first, then raw).

    Invalid ``--models`` or ``--limit`` values make argparse print a usage
    error and exit before any assistant is loaded or any result file is
    touched.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--datasets", nargs="+", default=["truthfulqa", "advbench", "bbq"])
    # Restrict to the backends handled below. Without `choices`, a misspelt
    # name was silently ignored, and with no valid model the run still
    # truncated the existing result files and reported success.
    ap.add_argument("--models", nargs="+", choices=["openai", "llama"], default=["openai", "llama"])
    ap.add_argument("--limit", type=int, default=None, help="max samples per dataset (debug)")
    ap.add_argument("--mode", choices=["both", "guarded", "raw"], default="both")
    args = ap.parse_args()

    # A negative limit would be applied as a slice (`samples[:limit]`) and
    # silently drop samples from the end instead of capping the count.
    if args.limit is not None and args.limit < 0:
        ap.error("--limit must be a non-negative integer")

    loaded = {}
    if "openai" in args.models:
        print("[init] OpenAI assistant (Groq fallback if configured)…")
        loaded["openai"] = OpenAIAssistant()
    if "llama" in args.models:
        print("[init] Llama assistant (downloading + loading model)…")
        loaded["llama"] = LlamaAssistant()

    if args.mode in ("both", "guarded"):
        print("=== run: guardrails ON ===")
        run(loaded, args.datasets, use_guardrails=True, limit=args.limit)
    if args.mode in ("both", "raw"):
        print("=== run: guardrails OFF ===")
        run(loaded, args.datasets, use_guardrails=False, limit=args.limit)

    print(f"\nDone. Results under {RESULTS_DIR}/")


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()