| """ |
| L2 batch evaluation — runs the golden dataset through the pipeline locally. |
| |
| Usage: |
| python eval/metrics.py # all pairs |
| python eval/metrics.py --domain retail # one domain |
| python eval/metrics.py --domain pharma # one domain |
| python eval/metrics.py --client novamart # one client |
| python eval/metrics.py --out results.json # also write JSON results (the HTML report is always written) |
| |
| Requires HF_TOKEN in environment. |
| """ |
|
|
| import argparse |
| import json |
| import logging |
| import os |
| import sys |
| from datetime import datetime |
| from pathlib import Path |
|
|
| import yaml |
|
|
| sys.path.insert(0, str(Path(__file__).parent.parent / "backend")) |
|
|
| from huggingface_hub import InferenceClient |
| from pipeline import run |
|
|
# Console logging: bare messages at INFO level, no timestamps or logger names.
logging.basicConfig(level=logging.INFO, format="%(message)s")
log = logging.getLogger(__name__)

# The golden dataset and the report directory sit next to this script.
DATASET_PATH = Path(__file__).with_name("golden-dataset.yaml")
REPORTS_DIR = Path(__file__).with_name("reports")

# Display labels for known metric keys; unknown keys fall back to the raw key
# at the call sites (``METRIC_LABELS.get(name, name)``).
METRIC_LABELS = dict(
    pii_leakage="PII Leakage",
    token_budget="Token Budget",
    answer_relevancy="Answer Relevancy",
    faithfulness="Faithfulness",
    chain_terminology="Chain Terminology",
)
|
|
|
|
def load_pairs(domain: str | None = None, client: str | None = None) -> list[dict]:
    """Load golden pairs from ``DATASET_PATH``, optionally filtered.

    Args:
        domain: When given, keep only pairs whose ``"domain"`` equals it.
        client: When given, keep only pairs whose ``"client"`` equals it.

    Returns:
        The list under the dataset's top-level ``"pairs"`` key after filtering.
        An empty YAML document or a missing/empty ``"pairs"`` key yields ``[]``.

    Raises:
        KeyError: If a filter is active and a pair lacks the filtered key.
    """
    # Read as UTF-8 explicitly so parsing does not depend on the platform's
    # locale encoding.
    raw_text = DATASET_PATH.read_text(encoding="utf-8")

    # ``safe_load`` returns None for an empty document; treat that as no pairs
    # instead of failing with a TypeError on subscripting None.
    data = yaml.safe_load(raw_text) or {}
    pairs = data.get("pairs") or []

    if domain:
        pairs = [p for p in pairs if p["domain"] == domain]
    if client:
        pairs = [p for p in pairs if p["client"] == client]
    return pairs
|
|
|
|
def _normalize_keyphrases(raw: object) -> list[str]:
    """Coerce an ``expected_contains`` value into a list of strings."""
    if raw is None:
        # ``expected_contains:`` with no value parses to None in YAML.
        return []
    if isinstance(raw, (list, tuple)):
        # Unquoted YAML scalars such as 30 arrive as int; stringify them so
        # ``.lower()`` and later ``", ".join(...)`` calls work.
        return [str(item) for item in raw]
    # A lone scalar is treated as a single keyphrase (a bare string would
    # otherwise be iterated character by character).
    return [str(raw)]


def score_pair(pair: dict, hf_client: InferenceClient) -> dict:
    """Run one golden pair through the pipeline and return the scored result.

    Args:
        pair: A golden-dataset entry. Reads ``id``, ``client``, ``domain`` and
            ``question``; ``expected_contains`` and ``notes`` are optional.
        hf_client: Inference client forwarded to ``pipeline.run``.

    Returns:
        A dict with the pair's identifying fields, the answer, keyphrase
        coverage (matched / expected, ``1.0`` when nothing is expected), the
        matched and missing keyphrases, and the ``metrics``, ``overall_pass``
        and source titles taken from the pipeline's response payload.
    """
    result = run(
        query=pair["question"],
        client=pair["client"],
        hf_client=hf_client,
    )
    payload = result.response_payload
    evaluation = payload["evaluation"]

    expected = _normalize_keyphrases(pair.get("expected_contains"))
    answer_lower = result.answer.lower()

    # Case-insensitive substring match; split into matched/missing in one pass.
    matched: list[str] = []
    missing: list[str] = []
    for phrase in expected:
        bucket = matched if phrase.lower() in answer_lower else missing
        bucket.append(phrase)

    keyphrase_coverage = len(matched) / len(expected) if expected else 1.0

    return {
        "id": pair["id"],
        "client": pair["client"],
        "domain": pair["domain"],
        "question": pair["question"],
        "answer": result.answer,
        "keyphrase_coverage": round(keyphrase_coverage, 3),
        "matched_keyphrases": matched,
        "missing_keyphrases": missing,
        "metrics": evaluation["metrics"],
        "overall_pass": evaluation["overall_pass"],
        "sources": [s["title"] for s in payload["sources"]],
        "notes": pair.get("notes", ""),
    }
|
|
|
|
def print_summary(results: list[dict]) -> None:
    """Log overall pass rate, per-metric stats, keyphrase coverage and failures.

    Metric names are taken from the first result; every result is expected to
    carry the same metric keys.
    """
    metric_names = list(results[0]["metrics"]) if results else []
    total = len(results)
    passed = len([entry for entry in results if entry["overall_pass"]])

    log.info("\n── Summary ─────────────────────────────────────")
    log.info("Pairs evaluated : %d", total)
    if total:
        pass_pct = 100 * passed / total
    else:
        pass_pct = 0
    log.info("Overall pass : %d / %d (%.0f%%)", passed, total, pass_pct)

    log.info("\n── Per-metric pass rate ────────────────────────")
    for name in metric_names:
        per_metric = [entry["metrics"][name] for entry in results]
        n_pass = sum(1 for m in per_metric if m["passed"])
        avg_score = sum(m["score"] for m in per_metric) / total if total else 0
        log.info(" %-22s %d/%d avg %.2f", name, n_pass, total, avg_score)

    log.info("\n── Keyphrase coverage ──────────────────────────")
    if total:
        avg_cov = sum(entry["keyphrase_coverage"] for entry in results) / total
    else:
        avg_cov = 0
    log.info(" Average coverage: %.0f%%", avg_cov * 100)

    failures = [entry for entry in results if not entry["overall_pass"]]
    if not failures:
        return

    log.info("\n── Failed pairs ────────────────────────────────")
    for entry in failures:
        failed_metrics = [
            metric for metric, verdict in entry["metrics"].items() if not verdict["passed"]
        ]
        log.info(" [%s] %s", entry["id"], ", ".join(failed_metrics))
|
|
|
|
| |
| |
| |
|
|
def _score_class(score: float, metric: str) -> str:
    """Map a metric score to the CSS class ``"pass"``, ``"warn"`` or ``"fail"``.

    ``pii_leakage`` is all-or-nothing: only a score of exactly 1.0 passes.
    Every other metric passes at >= 0.75, warns at >= 0.45 and fails below.
    """
    if metric == "pii_leakage":
        if score == 1.0:
            return "pass"
        return "fail"

    # Thresholds are checked from highest to lowest; first match wins.
    for floor, css_class in ((0.75, "pass"), (0.45, "warn")):
        if score >= floor:
            return css_class
    return "fail"
|
|
|
|
def _metric_cards(metrics: dict) -> str:
    """Render one ``metric-pill`` HTML snippet per metric and concatenate them.

    Each value in *metrics* must have a ``"score"`` entry; the score is shown
    as a rounded percentage and coloured via ``_score_class``.
    """
    pills = []
    for key in metrics:
        score = metrics[key]["score"]
        css_class = _score_class(score, key)
        pills.append(
            "\n"
            f'<div class="metric-pill {css_class}">\n'
            f'<span class="pill-name">{METRIC_LABELS.get(key, key)}</span>\n'
            f'<span class="pill-score">{round(score * 100)}%</span>\n'
            "</div>"
        )
    return "".join(pills)
|
|
|
|
def _pair_row(r: dict) -> str:
    """Render the HTML card for a single scored pair.

    Args:
        r: A result dict as produced by ``score_pair``. Reads ``id``,
            ``client``, ``question``, ``answer``, ``metrics``,
            ``overall_pass``, ``keyphrase_coverage``, ``missing_keyphrases``,
            ``sources`` and ``notes``.

    Returns:
        The ``pair-card`` HTML fragment. All free-text values are HTML-escaped.
    """
    # Function-scope import so the module's import block stays untouched.
    from html import escape

    def esc(value: object) -> str:
        # Model answers and dataset text may contain <, > or &; ids may be
        # non-string YAML scalars, hence the str() before escaping.
        return escape(str(value))

    passed = r["overall_pass"]
    verdict_cls = "pass" if passed else "fail"
    verdict_label = "PASS" if passed else "FAIL"

    cov_pct = round(r["keyphrase_coverage"] * 100)
    cov_cls = "pass" if cov_pct >= 75 else ("warn" if cov_pct >= 45 else "fail")

    missing = esc(", ".join(map(str, r["missing_keyphrases"]))) if r["missing_keyphrases"] else "—"
    sources = esc(", ".join(map(str, r["sources"]))) if r["sources"] else "none"
    notes_html = f'<div class="notes">{esc(r["notes"])}</div>' if r["notes"] else ""

    return f"""
<div class="pair-card">
<div class="pair-header">
<div class="pair-id">{esc(r['id'])}</div>
<div class="client-badge">{esc(r['client'])}</div>
<div class="verdict {verdict_cls}">{verdict_label}</div>
</div>
<div class="question">Q: {esc(r['question'])}</div>
<div class="answer">{esc(r['answer'])}</div>
<div class="metrics-row">{_metric_cards(r['metrics'])}</div>
<div class="pair-meta">
<span class="meta-item">Keyphrase coverage: <strong class="{cov_cls}-text">{cov_pct}%</strong></span>
<span class="meta-item">Missing: <em>{missing}</em></span>
<span class="meta-item">Sources: {sources}</span>
</div>
{notes_html}
</div>"""
|
|
|
|
| def generate_html(results: list[dict], domain: str | None) -> str: |
| total = len(results) |
| passed = sum(1 for r in results if r["overall_pass"]) |
| pass_rate = round(100 * passed / total) if total else 0 |
| metric_names = list(results[0]["metrics"].keys()) if results else [] |
| avg_cov = round(100 * sum(r["keyphrase_coverage"] for r in results) / total) if total else 0 |
| generated = datetime.now().strftime("%Y-%m-%d %H:%M") |
| title = f"Eval Report — {domain or 'all domains'}" |
|
|
| summary_pills = "" |
| for name in metric_names: |
| n_pass = sum(1 for r in results if r["metrics"][name]["passed"]) |
| avg = sum(r["metrics"][name]["score"] for r in results) / total if total else 0 |
| cls = "pass" if n_pass == total else ("warn" if n_pass >= total * 0.7 else "fail") |
| summary_pills += f""" |
| <div class="summary-metric {cls}"> |
| <div class="sm-name">{METRIC_LABELS.get(name, name)}</div> |
| <div class="sm-rate">{n_pass}/{total}</div> |
| <div class="sm-avg">avg {avg:.2f}</div> |
| </div>""" |
|
|
| pair_rows = "".join(_pair_row(r) for r in results) |
|
|
| return f"""<!DOCTYPE html> |
| <html lang="en"> |
| <head> |
| <meta charset="UTF-8"> |
| <meta name="viewport" content="width=device-width, initial-scale=1.0"> |
| <title>{title}</title> |
| <link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800;900&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet"> |
| <style> |
| * {{ margin:0; padding:0; box-sizing:border-box; }} |
| body {{ font-family:'Inter',sans-serif; background:#eef4fc; color:#1a1a1a; padding:32px; }} |
| |
| header {{ background:#fff; border-bottom:2px solid #1e3a5f; padding:20px 28px; margin-bottom:28px; border-radius:8px; }} |
| header h1 {{ font-size:24px; font-weight:900; color:#1a1a1a; letter-spacing:-0.5px; }} |
| header h1 span {{ color:#3a6ea8; }} |
| header .sub {{ font-size:11px; color:#8aabcc; margin-top:4px; }} |
| |
| .overall {{ display:flex; gap:16px; margin-bottom:24px; }} |
| .stat-card {{ background:#fff; border:1px solid #c8dff5; border-radius:8px; padding:16px 20px; flex:1; text-align:center; }} |
| .stat-card .val {{ font-size:28px; font-weight:900; color:#1e3a5f; }} |
| .stat-card .val.pass-text {{ color:#2e7d32; }} |
| .stat-card .val.fail-text {{ color:#c62828; }} |
| .stat-card .lbl {{ font-size:10px; font-weight:700; text-transform:uppercase; letter-spacing:1.5px; color:#8aabcc; margin-top:4px; }} |
| |
| .section-label {{ font-size:10px; font-weight:800; text-transform:uppercase; letter-spacing:2px; color:#8aabcc; margin:24px 0 12px; }} |
| |
| .summary-metrics {{ display:flex; gap:12px; flex-wrap:wrap; margin-bottom:28px; }} |
| .summary-metric {{ background:#fff; border:1px solid #c8dff5; border-left:3px solid #1e3a5f; border-radius:0 6px 6px 0; padding:12px 16px; flex:1; min-width:140px; }} |
| .summary-metric.pass {{ border-left-color:#4caf50; background:#f0faf3; }} |
| .summary-metric.warn {{ border-left-color:#f9a825; background:#fffdf0; }} |
| .summary-metric.fail {{ border-left-color:#c62828; background:#fdf5f5; }} |
| .sm-name {{ font-size:11px; font-weight:700; color:#1e3a5f; margin-bottom:4px; font-family:'JetBrains Mono',monospace; }} |
| .sm-rate {{ font-size:18px; font-weight:900; color:#1e3a5f; }} |
| .sm-avg {{ font-size:10px; color:#8aabcc; margin-top:2px; }} |
| .summary-metric.pass .sm-rate {{ color:#2e7d32; }} |
| .summary-metric.fail .sm-rate {{ color:#c62828; }} |
| |
| .pair-card {{ background:#fff; border:1px solid #c8dff5; border-radius:8px; padding:20px; margin-bottom:14px; }} |
| .pair-header {{ display:flex; align-items:center; gap:10px; margin-bottom:10px; }} |
| .pair-id {{ font-family:'JetBrains Mono',monospace; font-size:11px; font-weight:600; color:#3a6ea8; }} |
| .client-badge {{ font-size:10px; font-weight:700; background:#e8f2ff; color:#3a6ea8; border:1px solid #c0d8f0; padding:2px 8px; border-radius:3px; }} |
| .verdict {{ font-size:10px; font-weight:800; padding:2px 10px; border-radius:3px; margin-left:auto; }} |
| .verdict.pass {{ background:#f1f8f1; color:#2e7d32; border:1px solid #c8e6c9; }} |
| .verdict.fail {{ background:#fdf1f1; color:#c62828; border:1px solid #ffcdd2; }} |
| |
| .question {{ font-size:13px; font-weight:600; color:#1a1a1a; margin-bottom:8px; }} |
| .answer {{ font-size:12px; color:#4a6080; line-height:1.6; background:#f5f9ff; border:1px solid #e0eef8; border-radius:5px; padding:10px 12px; margin-bottom:12px; }} |
| |
| .metrics-row {{ display:flex; gap:8px; flex-wrap:wrap; margin-bottom:10px; }} |
| .metric-pill {{ display:flex; align-items:center; gap:6px; padding:4px 10px; border-radius:4px; border:1px solid; font-size:10px; font-weight:600; }} |
| .metric-pill.pass {{ background:#f1f8f1; color:#2e7d32; border-color:#c8e6c9; }} |
| .metric-pill.fail {{ background:#fdf1f1; color:#c62828; border-color:#ffcdd2; }} |
| .metric-pill.warn {{ background:#fffbf0; color:#a06000; border-color:#ffe082; }} |
| .pill-name {{ font-family:'JetBrains Mono',monospace; }} |
| .pill-score {{ font-weight:800; }} |
| |
| .pair-meta {{ display:flex; gap:16px; flex-wrap:wrap; font-size:11px; color:#6a8aaa; margin-bottom:6px; }} |
| .meta-item strong {{ font-weight:700; }} |
| .pass-text {{ color:#2e7d32; }} |
| .warn-text {{ color:#a06000; }} |
| .fail-text {{ color:#c62828; }} |
| |
| .notes {{ font-size:10.5px; color:#8aabcc; font-style:italic; border-top:1px solid #e8f2ff; padding-top:8px; margin-top:8px; }} |
| |
| footer {{ margin-top:32px; font-size:11px; color:#8aabcc; text-align:center; }} |
| </style> |
| </head> |
| <body> |
| <header> |
| <h1>AI Response <span>Validator</span> — Eval Report</h1> |
| <div class="sub">{title} · Generated {generated}</div> |
| </header> |
| |
| <div class="overall"> |
| <div class="stat-card"> |
| <div class="val">{total}</div> |
| <div class="lbl">Pairs evaluated</div> |
| </div> |
| <div class="stat-card"> |
| <div class="val {'pass-text' if pass_rate >= 75 else 'fail-text'}">{pass_rate}%</div> |
| <div class="lbl">Overall pass rate</div> |
| </div> |
| <div class="stat-card"> |
| <div class="val {'pass-text' if avg_cov >= 75 else 'warn-text'}">{avg_cov}%</div> |
| <div class="lbl">Keyphrase coverage</div> |
| </div> |
| </div> |
| |
| <div class="section-label">Per-metric summary</div> |
| <div class="summary-metrics">{summary_pills}</div> |
| |
| <div class="section-label">Pair results</div> |
| {pair_rows} |
| |
| <footer>AI Response Validator · eval/metrics.py</footer> |
| </body> |
| </html>""" |
|
|
|
|
| |
| |
| |
|
|
def main() -> None:
    """CLI entry point: evaluate the golden dataset and write the reports.

    Parses ``--domain``, ``--client`` and ``--out``; exits with a message when
    ``HF_TOKEN`` is unset or no pairs match the filters. Every matching pair
    is scored, a summary is logged, and an HTML report is always written to
    ``REPORTS_DIR``. JSON results are written only when ``--out`` is given.
    """
    parser = argparse.ArgumentParser(description="L2 batch evaluation against golden dataset")
    parser.add_argument("--domain", help="Filter by domain (retail|pharma)")
    parser.add_argument("--client", help="Filter by client id")
    parser.add_argument("--out", help="Write JSON results to file")
    args = parser.parse_args()

    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        sys.exit("HF_TOKEN not set")

    hf_client = InferenceClient(token=hf_token)
    pairs = load_pairs(domain=args.domain, client=args.client)

    if not pairs:
        sys.exit("No pairs matched the given filters")

    log.info("Evaluating %d pairs...", len(pairs))
    results = []
    for i, pair in enumerate(pairs, 1):
        log.info("[%d/%d] %s", i, len(pairs), pair["id"])
        results.append(score_pair(pair, hf_client))

    print_summary(results)

    REPORTS_DIR.mkdir(exist_ok=True)
    # The domain filter takes precedence over the client filter for naming.
    report_label = args.domain or args.client
    suffix = report_label or "all"
    html_path = REPORTS_DIR / f"report_{suffix}.html"
    # The document declares <meta charset="UTF-8"> and contains non-ASCII
    # characters, so write it as UTF-8 rather than the locale encoding.
    html_path.write_text(generate_html(results, report_label), encoding="utf-8")
    log.info("\nHTML report: %s", html_path)

    if args.out:
        Path(args.out).write_text(json.dumps(results, indent=2), encoding="utf-8")
        log.info("JSON results: %s", args.out)
|
|
|
|
# Run the batch evaluation only when executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|