""" Pipeline results summary. Reads the latest evaluation result files and prints a scannable pass/fail summary. Designed to run at the end of `make all`. Graceful degradation: missing result files produce "(not available)" sections rather than errors. This script never exits non-zero so it cannot fail the pipeline. Usage: python scripts/summary.py """ import json import sys from pathlib import Path from sage.config import ( EVAL_DIMENSIONS, FAITHFULNESS_TARGET, HELPFULNESS_TARGET, RESULTS_DIR, ) WIDTH = 60 SEP = "=" * WIDTH def load_json(path: Path) -> dict | None: """Load a JSON file, returning None if missing or malformed.""" try: with open(path, encoding="utf-8") as f: return json.load(f) except (FileNotFoundError, json.JSONDecodeError): return None def fmt(value: float | None, decimals: int = 4) -> str: if value is None: return " ---" return f"{value:.{decimals}f}" def print_section(title: str): print(f"\n{title}") def main(): print(f"\n{SEP}") print("SAGE PIPELINE RESULTS") print(SEP) # -- Recommendation Quality (Natural Queries) ----------------------------- nat = load_json(RESULTS_DIR / "eval_natural_queries_latest.json") print_section("Recommendation Quality (Natural Queries):") if nat and "primary_metrics" in nat: m = nat["primary_metrics"] print(f" NDCG@10: {fmt(m.get('ndcg_at_10'))}") print(f" Hit@10: {fmt(m.get('hit_at_10'))}") print(f" MRR: {fmt(m.get('mrr'))}") else: print(" (not available)") # -- Explanation Faithfulness --------------------------------------------- faith = load_json(RESULTS_DIR / "faithfulness_latest.json") print_section("Explanation Faithfulness:") if faith and "hhem" in faith: n_samples = faith.get("n_samples", 0) # Multi-metric (primary): claim-level HHEM + quote verification mm = faith.get("multi_metric", {}) claim_pass = mm.get("claim_level_pass_rate") claim_avg = mm.get("claim_level_avg_score") quote_rate = mm.get("quote_verification_rate") quotes_found = mm.get("quotes_found", 0) quotes_total = mm.get("quotes_total", 0) if claim_pass is not None: print( f" Claim HHEM: {fmt(claim_avg, 3)} ({claim_pass * 100:.0f}% pass)" ) print( f" Quote Verif: {fmt(quote_rate, 3)} ({quotes_found}/{quotes_total})" ) # Full-explanation HHEM (reference) h = faith["hhem"] n_grounded = n_samples - h.get("n_hallucinated", 0) full_avg = h.get("mean_score") print( f" Full HHEM: {fmt(full_avg, 3)} ({n_grounded}/{n_samples} grounded, reference)" ) # RAGAS if available ragas = faith.get("ragas", {}) ragas_faith = ragas.get("faithfulness_mean") if ragas_faith is not None: print(f" RAGAS Faith: {fmt(ragas_faith, 3)}") # Pass/fail: use claim-level as primary, fall back to RAGAS, then full HHEM effective = ( claim_avg if claim_avg is not None else (ragas_faith if ragas_faith is not None else full_avg) ) if effective is not None: status = "PASS" if effective >= FAITHFULNESS_TARGET else "FAIL" print(f" Target: {FAITHFULNESS_TARGET:.3f} [{status}]") else: print(" (not available)") # -- Human Evaluation ------------------------------------------------------ human = load_json(RESULTS_DIR / "human_eval_latest.json") print_section("Human Evaluation:") if human and "dimensions" in human: n = human.get("n_samples", 0) dims = human["dimensions"] overall = human.get("overall_helpfulness") target = human.get("target", HELPFULNESS_TARGET) print(f" Samples: {n}") for dim_key in EVAL_DIMENSIONS: d = dims.get(dim_key, {}) m = d.get("mean") label = dim_key.title() print(f" {label + ':':<15s} {fmt(m, 2) if m is not None else ' ---'}") if overall is not None: status = "PASS" if human.get("pass", False) else "FAIL" print( f" Helpfulness: {fmt(overall, 2)} (target: {target:.1f}) [{status}]" ) corr = human.get("hhem_trust_correlation", {}) r = corr.get("spearman_r") if r is not None: print(f" HHEM-Trust r: {fmt(r, 3)} (p={corr.get('p_value', '?')})") else: print(" (not available)") # -- Grounding Delta ------------------------------------------------------- delta = load_json(RESULTS_DIR / "grounding_delta_latest.json") print_section("Grounding Delta (RAG Impact):") if delta: with_ev = delta.get("with_evidence_mean") without_ev = delta.get("without_evidence_mean") d = delta.get("delta") n = delta.get("n_samples", 0) print(f" With evidence: {fmt(with_ev, 3)}") print(f" Without: {fmt(without_ev, 3)}") print(f" Delta: {fmt(d, 3)} (+{d * 100:.0f}pp, n={n})") else: print(" (not available)") # -- Refusal Rate ---------------------------------------------------------- adj = load_json(RESULTS_DIR / "adjusted_faithfulness_latest.json") print_section("Quality Gate (Refusals):") if adj: n_total = adj.get("n_total", 0) n_refusals = adj.get("n_refusals", 0) rate = n_refusals / n_total if n_total > 0 else 0 adj_pass = adj.get("adjusted_pass_rate") print(f" Refusals: {n_refusals}/{n_total} ({rate * 100:.0f}%)") print(f" Adj Pass Rate: {fmt(adj_pass, 3)}") else: print(" (not available)") # -- Load Test ------------------------------------------------------------- load = load_json(RESULTS_DIR / "load_test_latest.json") print_section("Production Latency:") if load: p50 = load.get("p50_ms") p95 = load.get("p95_ms") p99 = load.get("p99_ms") n = load.get("total_requests", 0) hits = load.get("cache_hits", 0) hit_rate = hits / n if n > 0 else 0 print(f" P50: {p50:.0f}ms") print(f" P95: {p95:.0f}ms") print(f" P99: {p99:.0f}ms (n={n})") print(f" Cache hits: {hits}/{n} ({hit_rate * 100:.0f}%)") else: print(" (not available)") # -- Footer --------------------------------------------------------------- print(f"\nResults: {RESULTS_DIR}/") print(SEP) if __name__ == "__main__": try: main() except Exception as exc: # Never fail the pipeline print(f"\nSummary error: {exc}", file=sys.stderr)