# mbochniak01
# Fix compat, bugs, and types; expand retail KB
# e181667
"""
L2 batch evaluation — runs the golden dataset through the pipeline locally.
Usage:
python eval/metrics.py # all pairs
python eval/metrics.py --domain retail # one domain
python eval/metrics.py --domain pharma # one domain
python eval/metrics.py --client novamart # one client
python eval/metrics.py --out results.json # also write JSON results (the HTML report is always written)
Requires HF_TOKEN in environment.
"""
import argparse
import json
import logging
import os
import sys
from datetime import datetime
from pathlib import Path
import yaml
sys.path.insert(0, str(Path(__file__).parent.parent / "backend"))
from huggingface_hub import InferenceClient
from pipeline import run
# Root logging prints bare messages so the summary reads like plain CLI output.
logging.basicConfig(format="%(message)s", level=logging.INFO)
log = logging.getLogger(__name__)

# The golden dataset and the generated reports both sit next to this script.
DATASET_PATH = Path(__file__).with_name("golden-dataset.yaml")
REPORTS_DIR = Path(__file__).with_name("reports")

# Display labels for metric keys; the HTML report falls back to the raw key
# for any metric that is not listed here.
METRIC_LABELS: dict[str, str] = {
    "pii_leakage": "PII Leakage",
    "token_budget": "Token Budget",
    "answer_relevancy": "Answer Relevancy",
    "faithfulness": "Faithfulness",
    "chain_terminology": "Chain Terminology",
}
def load_pairs(domain: str | None = None, client: str | None = None) -> list[dict]:
    """Load the golden pairs from DATASET_PATH, optionally filtered.

    Args:
        domain: When truthy, keep only pairs whose ``"domain"`` equals this value.
        client: When truthy, keep only pairs whose ``"client"`` equals this value.

    Returns:
        The pair dicts found under the top-level ``"pairs"`` key, in file
        order, after applying whichever filters were given.

    Raises:
        KeyError: If the dataset has no ``"pairs"`` key, or a pair lacks the
            key being filtered on.
    """
    # Decode explicitly as UTF-8: relying on the platform default encoding
    # (e.g. cp1252 on Windows) can corrupt or reject non-ASCII dataset text.
    data = yaml.safe_load(DATASET_PATH.read_text(encoding="utf-8"))
    pairs = data["pairs"]
    if domain:
        pairs = [p for p in pairs if p["domain"] == domain]
    if client:
        pairs = [p for p in pairs if p["client"] == client]
    return pairs
def score_pair(pair: dict, hf_client: InferenceClient) -> dict:
    """Run one golden pair through the pipeline and return the scored result.

    Args:
        pair: A golden-dataset entry. Reads ``"id"``, ``"client"``,
            ``"domain"`` and ``"question"``; ``"expected_contains"`` and
            ``"notes"`` are optional.
        hf_client: Inference client passed straight through to ``run``.

    Returns:
        A dict with the pair's identity fields, the answer text, keyphrase
        coverage (matched / expected, rounded to 3 places; 1.0 when nothing
        is expected), the matched and missing keyphrases, the pipeline's
        per-metric evaluation and overall verdict, source titles and notes.
    """
    result = run(
        query=pair["question"],
        client=pair["client"],
        hf_client=hf_client,
    )
    payload = result.response_payload
    evaluation = payload["evaluation"]

    # A bare `expected_contains:` key in YAML loads as None, which the
    # .get() default does not cover; treat it the same as "nothing expected".
    expected = pair.get("expected_contains") or []
    answer_lower = result.answer.lower()

    # Single pass: each keyphrase lands in exactly one bucket, matched
    # case-insensitively as a substring of the answer.
    matched: list = []
    missing: list = []
    for kw in expected:
        (matched if kw.lower() in answer_lower else missing).append(kw)

    keyphrase_coverage = len(matched) / len(expected) if expected else 1.0

    return {
        "id": pair["id"],
        "client": pair["client"],
        "domain": pair["domain"],
        "question": pair["question"],
        "answer": result.answer,
        "keyphrase_coverage": round(keyphrase_coverage, 3),
        "matched_keyphrases": matched,
        "missing_keyphrases": missing,
        "metrics": evaluation["metrics"],
        "overall_pass": evaluation["overall_pass"],
        "sources": [s["title"] for s in payload["sources"]],
        "notes": pair.get("notes", ""),
    }
def print_summary(results: list[dict]) -> None:
    """Log an aggregate summary of the scored pairs.

    Reports the overall pass count, the per-metric pass rate and mean score,
    the mean keyphrase coverage, and the failing metrics of each failed pair.
    Metric names are taken from the first result; an empty ``results`` list
    logs zeros and skips the per-metric and failure sections.
    """
    count = len(results)
    names = list(results[0]["metrics"]) if results else []
    n_ok = len([r for r in results if r["overall_pass"]])
    overall_pct = 100 * n_ok / count if count else 0

    log.info("\n── Summary ─────────────────────────────────────")
    log.info("Pairs evaluated : %d", count)
    log.info("Overall pass : %d / %d (%.0f%%)", n_ok, count, overall_pct)

    log.info("\n── Per-metric pass rate ────────────────────────")
    for metric in names:
        entries = [r["metrics"][metric] for r in results]
        n_pass = sum(bool(entry["passed"]) for entry in entries)
        mean_score = sum(entry["score"] for entry in entries) / count if count else 0
        log.info(" %-22s %d/%d avg %.2f", metric, n_pass, count, mean_score)

    log.info("\n── Keyphrase coverage ──────────────────────────")
    coverage_total = sum(r["keyphrase_coverage"] for r in results)
    mean_cov = coverage_total / count if count else 0
    log.info(" Average coverage: %.0f%%", mean_cov * 100)

    failing = [r for r in results if not r["overall_pass"]]
    if not failing:
        return
    log.info("\n── Failed pairs ────────────────────────────────")
    for r in failing:
        bad = ", ".join(m for m, v in r["metrics"].items() if not v["passed"])
        log.info(" [%s] %s", r["id"], bad)
# ---------------------------------------------------------------------------
# HTML report
# ---------------------------------------------------------------------------
def _score_class(score: float, metric: str) -> str:
if metric == "pii_leakage":
return "pass" if score == 1.0 else "fail"
if score >= 0.75:
return "pass"
if score >= 0.45:
return "warn"
return "fail"
def _metric_cards(metrics: dict) -> str:
cards = []
for name, m in metrics.items():
cls = _score_class(m["score"], name)
pct = round(m["score"] * 100)
label = METRIC_LABELS.get(name, name)
cards.append(f"""
<div class="metric-pill {cls}">
<span class="pill-name">{label}</span>
<span class="pill-score">{pct}%</span>
</div>""")
return "".join(cards)
def _pair_row(r: dict) -> str:
    """Render one scored pair as an HTML card.

    Args:
        r: A result dict. Reads ``"id"``, ``"client"``, ``"question"``,
            ``"answer"``, ``"overall_pass"``, ``"keyphrase_coverage"``,
            ``"missing_keyphrases"``, ``"sources"``, ``"metrics"`` and
            ``"notes"``.

    Returns:
        The HTML fragment for the card. All free text (id, client, question,
        answer, keyphrases, source titles, notes) is HTML-escaped; the metric
        pills are inserted as markup.
    """
    # Function-scope import so this fix needs no change to the file's imports.
    from html import escape

    def esc(value: object) -> str:
        # Answers, questions, notes and titles are free text; an unescaped
        # "<" or "&" in any of them would corrupt the report or inject markup.
        return escape(str(value))

    verdict_cls, verdict_label = ("pass", "PASS") if r["overall_pass"] else ("fail", "FAIL")
    cov_pct = round(r["keyphrase_coverage"] * 100)
    cov_cls = "pass" if cov_pct >= 75 else ("warn" if cov_pct >= 45 else "fail")
    missing = esc(", ".join(r["missing_keyphrases"])) if r["missing_keyphrases"] else "—"
    sources = esc(", ".join(r["sources"])) if r["sources"] else "none"
    # The notes block is omitted entirely (leaving an empty line) when empty.
    notes_html = f'<div class="notes">{esc(r["notes"])}</div>' if r["notes"] else ""

    return "\n".join([
        "",  # the fragment starts with a newline
        '<div class="pair-card">',
        '<div class="pair-header">',
        f'<div class="pair-id">{esc(r["id"])}</div>',
        f'<div class="client-badge">{esc(r["client"])}</div>',
        f'<div class="verdict {verdict_cls}">{verdict_label}</div>',
        "</div>",
        f'<div class="question">Q: {esc(r["question"])}</div>',
        f'<div class="answer">{esc(r["answer"])}</div>',
        f'<div class="metrics-row">{_metric_cards(r["metrics"])}</div>',
        '<div class="pair-meta">',
        f'<span class="meta-item">Keyphrase coverage: <strong class="{cov_cls}-text">{cov_pct}%</strong></span>',
        f'<span class="meta-item">Missing: <em>{missing}</em></span>',
        f'<span class="meta-item">Sources: {sources}</span>',
        "</div>",
        notes_html,
        "</div>",
    ])
def generate_html(results: list[dict], domain: str | None) -> str:
total = len(results)
passed = sum(1 for r in results if r["overall_pass"])
pass_rate = round(100 * passed / total) if total else 0
metric_names = list(results[0]["metrics"].keys()) if results else []
avg_cov = round(100 * sum(r["keyphrase_coverage"] for r in results) / total) if total else 0
generated = datetime.now().strftime("%Y-%m-%d %H:%M")
title = f"Eval Report — {domain or 'all domains'}"
summary_pills = ""
for name in metric_names:
n_pass = sum(1 for r in results if r["metrics"][name]["passed"])
avg = sum(r["metrics"][name]["score"] for r in results) / total if total else 0
cls = "pass" if n_pass == total else ("warn" if n_pass >= total * 0.7 else "fail")
summary_pills += f"""
<div class="summary-metric {cls}">
<div class="sm-name">{METRIC_LABELS.get(name, name)}</div>
<div class="sm-rate">{n_pass}/{total}</div>
<div class="sm-avg">avg {avg:.2f}</div>
</div>"""
pair_rows = "".join(_pair_row(r) for r in results)
return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>{title}</title>
<link href="https://fonts.googleapis.com/css2?family=Inter:wght@400;500;600;700;800;900&family=JetBrains+Mono:wght@400;500&display=swap" rel="stylesheet">
<style>
* {{ margin:0; padding:0; box-sizing:border-box; }}
body {{ font-family:'Inter',sans-serif; background:#eef4fc; color:#1a1a1a; padding:32px; }}
header {{ background:#fff; border-bottom:2px solid #1e3a5f; padding:20px 28px; margin-bottom:28px; border-radius:8px; }}
header h1 {{ font-size:24px; font-weight:900; color:#1a1a1a; letter-spacing:-0.5px; }}
header h1 span {{ color:#3a6ea8; }}
header .sub {{ font-size:11px; color:#8aabcc; margin-top:4px; }}
.overall {{ display:flex; gap:16px; margin-bottom:24px; }}
.stat-card {{ background:#fff; border:1px solid #c8dff5; border-radius:8px; padding:16px 20px; flex:1; text-align:center; }}
.stat-card .val {{ font-size:28px; font-weight:900; color:#1e3a5f; }}
.stat-card .val.pass-text {{ color:#2e7d32; }}
.stat-card .val.fail-text {{ color:#c62828; }}
.stat-card .lbl {{ font-size:10px; font-weight:700; text-transform:uppercase; letter-spacing:1.5px; color:#8aabcc; margin-top:4px; }}
.section-label {{ font-size:10px; font-weight:800; text-transform:uppercase; letter-spacing:2px; color:#8aabcc; margin:24px 0 12px; }}
.summary-metrics {{ display:flex; gap:12px; flex-wrap:wrap; margin-bottom:28px; }}
.summary-metric {{ background:#fff; border:1px solid #c8dff5; border-left:3px solid #1e3a5f; border-radius:0 6px 6px 0; padding:12px 16px; flex:1; min-width:140px; }}
.summary-metric.pass {{ border-left-color:#4caf50; background:#f0faf3; }}
.summary-metric.warn {{ border-left-color:#f9a825; background:#fffdf0; }}
.summary-metric.fail {{ border-left-color:#c62828; background:#fdf5f5; }}
.sm-name {{ font-size:11px; font-weight:700; color:#1e3a5f; margin-bottom:4px; font-family:'JetBrains Mono',monospace; }}
.sm-rate {{ font-size:18px; font-weight:900; color:#1e3a5f; }}
.sm-avg {{ font-size:10px; color:#8aabcc; margin-top:2px; }}
.summary-metric.pass .sm-rate {{ color:#2e7d32; }}
.summary-metric.fail .sm-rate {{ color:#c62828; }}
.pair-card {{ background:#fff; border:1px solid #c8dff5; border-radius:8px; padding:20px; margin-bottom:14px; }}
.pair-header {{ display:flex; align-items:center; gap:10px; margin-bottom:10px; }}
.pair-id {{ font-family:'JetBrains Mono',monospace; font-size:11px; font-weight:600; color:#3a6ea8; }}
.client-badge {{ font-size:10px; font-weight:700; background:#e8f2ff; color:#3a6ea8; border:1px solid #c0d8f0; padding:2px 8px; border-radius:3px; }}
.verdict {{ font-size:10px; font-weight:800; padding:2px 10px; border-radius:3px; margin-left:auto; }}
.verdict.pass {{ background:#f1f8f1; color:#2e7d32; border:1px solid #c8e6c9; }}
.verdict.fail {{ background:#fdf1f1; color:#c62828; border:1px solid #ffcdd2; }}
.question {{ font-size:13px; font-weight:600; color:#1a1a1a; margin-bottom:8px; }}
.answer {{ font-size:12px; color:#4a6080; line-height:1.6; background:#f5f9ff; border:1px solid #e0eef8; border-radius:5px; padding:10px 12px; margin-bottom:12px; }}
.metrics-row {{ display:flex; gap:8px; flex-wrap:wrap; margin-bottom:10px; }}
.metric-pill {{ display:flex; align-items:center; gap:6px; padding:4px 10px; border-radius:4px; border:1px solid; font-size:10px; font-weight:600; }}
.metric-pill.pass {{ background:#f1f8f1; color:#2e7d32; border-color:#c8e6c9; }}
.metric-pill.fail {{ background:#fdf1f1; color:#c62828; border-color:#ffcdd2; }}
.metric-pill.warn {{ background:#fffbf0; color:#a06000; border-color:#ffe082; }}
.pill-name {{ font-family:'JetBrains Mono',monospace; }}
.pill-score {{ font-weight:800; }}
.pair-meta {{ display:flex; gap:16px; flex-wrap:wrap; font-size:11px; color:#6a8aaa; margin-bottom:6px; }}
.meta-item strong {{ font-weight:700; }}
.pass-text {{ color:#2e7d32; }}
.warn-text {{ color:#a06000; }}
.fail-text {{ color:#c62828; }}
.notes {{ font-size:10.5px; color:#8aabcc; font-style:italic; border-top:1px solid #e8f2ff; padding-top:8px; margin-top:8px; }}
footer {{ margin-top:32px; font-size:11px; color:#8aabcc; text-align:center; }}
</style>
</head>
<body>
<header>
<h1>AI Response <span>Validator</span> — Eval Report</h1>
<div class="sub">{title} · Generated {generated}</div>
</header>
<div class="overall">
<div class="stat-card">
<div class="val">{total}</div>
<div class="lbl">Pairs evaluated</div>
</div>
<div class="stat-card">
<div class="val {'pass-text' if pass_rate >= 75 else 'fail-text'}">{pass_rate}%</div>
<div class="lbl">Overall pass rate</div>
</div>
<div class="stat-card">
<div class="val {'pass-text' if avg_cov >= 75 else 'warn-text'}">{avg_cov}%</div>
<div class="lbl">Keyphrase coverage</div>
</div>
</div>
<div class="section-label">Per-metric summary</div>
<div class="summary-metrics">{summary_pills}</div>
<div class="section-label">Pair results</div>
{pair_rows}
<footer>AI Response Validator · eval/metrics.py</footer>
</body>
</html>"""
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def main() -> None:
    """CLI entry point: evaluate the golden dataset and write the reports.

    Parses ``--domain``, ``--client`` and ``--out``, scores every matching
    pair, logs a summary, and always writes an HTML report to
    ``REPORTS_DIR/report_<suffix>.html`` (suffix is the domain, else the
    client, else ``all``). With ``--out`` the raw results are also written
    as JSON to that path.

    Exits with an error message when ``HF_TOKEN`` is unset or when no pair
    matches the filters.
    """
    parser = argparse.ArgumentParser(description="L2 batch evaluation against golden dataset")
    parser.add_argument("--domain", help="Filter by domain (retail|pharma)")
    parser.add_argument("--client", help="Filter by client id")
    parser.add_argument("--out", help="Write JSON results to file")
    args = parser.parse_args()

    hf_token = os.environ.get("HF_TOKEN")
    if not hf_token:
        sys.exit("HF_TOKEN not set")
    hf_client = InferenceClient(token=hf_token)

    pairs = load_pairs(domain=args.domain, client=args.client)
    if not pairs:
        sys.exit("No pairs matched the given filters")

    log.info("Evaluating %d pairs...", len(pairs))
    results = []
    for i, pair in enumerate(pairs, 1):
        log.info("[%d/%d] %s", i, len(pairs), pair["id"])
        results.append(score_pair(pair, hf_client))

    print_summary(results)

    REPORTS_DIR.mkdir(exist_ok=True)
    suffix = args.domain or args.client or "all"
    html_path = REPORTS_DIR / f"report_{suffix}.html"
    # The document declares charset=UTF-8 and contains non-ASCII characters,
    # so it must be written as UTF-8 rather than the platform default encoding.
    html_path.write_text(
        generate_html(results, args.domain or args.client),
        encoding="utf-8",
    )
    log.info("\nHTML report: %s", html_path)

    if args.out:
        Path(args.out).write_text(json.dumps(results, indent=2), encoding="utf-8")
        log.info("JSON results: %s", args.out)


if __name__ == "__main__":
    main()