|
|
""" |
|
|
Pipeline results summary. |
|
|
|
|
|
Reads the latest evaluation result files and prints a scannable |
|
|
pass/fail summary. Designed to run at the end of `make all`. |
|
|
|
|
|
Graceful degradation: missing result files produce "(not available)" |
|
|
sections rather than errors. This script never exits non-zero so it |
|
|
cannot fail the pipeline. |
|
|
|
|
|
Usage: |
|
|
python scripts/summary.py |
|
|
""" |
|
|
|
|
|
import json |
|
|
import sys |
|
|
from pathlib import Path |
|
|
|
|
|
from sage.config import ( |
|
|
EVAL_DIMENSIONS, |
|
|
FAITHFULNESS_TARGET, |
|
|
HELPFULNESS_TARGET, |
|
|
RESULTS_DIR, |
|
|
) |
|
|
|
|
|
# Character width of the banner/separator rule used throughout the report.
WIDTH = 60

# Horizontal rule printed above and below the report title and at the end.
SEP = "=" * WIDTH
|
|
|
|
|
|
|
|
def load_json(path: Path) -> dict | None: |
|
|
"""Load a JSON file, returning None if missing or malformed.""" |
|
|
try: |
|
|
with open(path, encoding="utf-8") as f: |
|
|
return json.load(f) |
|
|
except (FileNotFoundError, json.JSONDecodeError): |
|
|
return None |
|
|
|
|
|
|
|
|
def fmt(value: float | None, decimals: int = 4) -> str: |
|
|
if value is None: |
|
|
return " ---" |
|
|
return f"{value:.{decimals}f}" |
|
|
|
|
|
|
|
|
def print_section(title: str):
    """Emit a blank spacer line followed by *title* as a section header."""
    print("\n" + title)
|
|
|
|
|
|
|
|
def _print_natural_queries() -> None:
    """Section: ranking metrics (NDCG/Hit/MRR) from the natural-query eval."""
    nat = load_json(RESULTS_DIR / "eval_natural_queries_latest.json")
    print_section("Recommendation Quality (Natural Queries):")
    if nat and "primary_metrics" in nat:
        m = nat["primary_metrics"]
        print(f" NDCG@10: {fmt(m.get('ndcg_at_10'))}")
        print(f" Hit@10: {fmt(m.get('hit_at_10'))}")
        print(f" MRR: {fmt(m.get('mrr'))}")
    else:
        print(" (not available)")


def _print_faithfulness() -> None:
    """Section: explanation faithfulness.

    Prints claim-level HHEM + quote verification (when available),
    full-response HHEM, optional RAGAS faithfulness, and a PASS/FAIL line
    against FAITHFULNESS_TARGET using the best available metric.
    """
    faith = load_json(RESULTS_DIR / "faithfulness_latest.json")
    print_section("Explanation Faithfulness:")
    if not (faith and "hhem" in faith):
        print(" (not available)")
        return

    n_samples = faith.get("n_samples", 0)

    mm = faith.get("multi_metric", {})
    claim_pass = mm.get("claim_level_pass_rate")
    claim_avg = mm.get("claim_level_avg_score")
    quote_rate = mm.get("quote_verification_rate")
    quotes_found = mm.get("quotes_found", 0)
    quotes_total = mm.get("quotes_total", 0)

    if claim_pass is not None:
        print(
            f" Claim HHEM: {fmt(claim_avg, 3)} ({claim_pass * 100:.0f}% pass)"
        )
        print(
            f" Quote Verif: {fmt(quote_rate, 3)} ({quotes_found}/{quotes_total})"
        )

    h = faith["hhem"]
    n_grounded = n_samples - h.get("n_hallucinated", 0)
    full_avg = h.get("mean_score")
    print(
        f" Full HHEM: {fmt(full_avg, 3)} ({n_grounded}/{n_samples} grounded, reference)"
    )

    ragas_faith = faith.get("ragas", {}).get("faithfulness_mean")
    if ragas_faith is not None:
        print(f" RAGAS Faith: {fmt(ragas_faith, 3)}")

    # Metric preference order: claim-level HHEM > RAGAS > full-response HHEM.
    effective = (
        claim_avg
        if claim_avg is not None
        else (ragas_faith if ragas_faith is not None else full_avg)
    )
    if effective is not None:
        status = "PASS" if effective >= FAITHFULNESS_TARGET else "FAIL"
        print(f" Target: {FAITHFULNESS_TARGET:.3f} [{status}]")


def _print_human_eval() -> None:
    """Section: human evaluation scores per dimension plus overall helpfulness
    and the HHEM-trust correlation (when computed)."""
    human = load_json(RESULTS_DIR / "human_eval_latest.json")
    print_section("Human Evaluation:")
    if not (human and "dimensions" in human):
        print(" (not available)")
        return

    n = human.get("n_samples", 0)
    dims = human["dimensions"]
    overall = human.get("overall_helpfulness")
    target = human.get("target", HELPFULNESS_TARGET)
    print(f" Samples: {n}")
    for dim_key in EVAL_DIMENSIONS:
        mean = dims.get(dim_key, {}).get("mean")
        label = dim_key.title()
        # fmt() already renders None as " ---", so no separate fallback needed.
        print(f" {label + ':':<15s} {fmt(mean, 2)}")
    if overall is not None:
        status = "PASS" if human.get("pass", False) else "FAIL"
        print(
            f" Helpfulness: {fmt(overall, 2)} (target: {target:.1f}) [{status}]"
        )
    corr = human.get("hhem_trust_correlation", {})
    r = corr.get("spearman_r")
    if r is not None:
        print(f" HHEM-Trust r: {fmt(r, 3)} (p={corr.get('p_value', '?')})")


def _print_grounding_delta() -> None:
    """Section: faithfulness with vs. without retrieved evidence."""
    delta = load_json(RESULTS_DIR / "grounding_delta_latest.json")
    print_section("Grounding Delta (RAG Impact):")
    if not delta:
        print(" (not available)")
        return

    with_ev = delta.get("with_evidence_mean")
    without_ev = delta.get("without_evidence_mean")
    d = delta.get("delta")
    n = delta.get("n_samples", 0)
    print(f" With evidence: {fmt(with_ev, 3)}")
    print(f" Without: {fmt(without_ev, 3)}")
    if d is not None:
        # ":+.0f" carries the sign, so negative deltas no longer print "+-Npp"
        # (the old hard-coded "+" did); also avoids TypeError when d is None.
        print(f" Delta: {fmt(d, 3)} ({d * 100:+.0f}pp, n={n})")
    else:
        print(f" Delta: {fmt(d, 3)} (n={n})")


def _print_quality_gate() -> None:
    """Section: refusal counts and refusal-adjusted pass rate."""
    adj = load_json(RESULTS_DIR / "adjusted_faithfulness_latest.json")
    print_section("Quality Gate (Refusals):")
    if not adj:
        print(" (not available)")
        return

    n_total = adj.get("n_total", 0)
    n_refusals = adj.get("n_refusals", 0)
    rate = n_refusals / n_total if n_total > 0 else 0
    print(f" Refusals: {n_refusals}/{n_total} ({rate * 100:.0f}%)")
    print(f" Adj Pass Rate: {fmt(adj.get('adjusted_pass_rate'), 3)}")


def _print_latency() -> None:
    """Section: load-test latency percentiles and cache hit rate."""
    load = load_json(RESULTS_DIR / "load_test_latest.json")
    print_section("Production Latency:")
    if not load:
        print(" (not available)")
        return

    def ms(v: float | None) -> str:
        # Guard missing percentiles: f"{None:.0f}" would raise TypeError
        # and break the script's never-fail contract.
        return f"{v:.0f}ms" if v is not None else "---"

    n = load.get("total_requests", 0)
    hits = load.get("cache_hits", 0)
    hit_rate = hits / n if n > 0 else 0
    print(f" P50: {ms(load.get('p50_ms'))}")
    print(f" P95: {ms(load.get('p95_ms'))}")
    print(f" P99: {ms(load.get('p99_ms'))} (n={n})")
    print(f" Cache hits: {hits}/{n} ({hit_rate * 100:.0f}%)")


def main():
    """Print the full pipeline results summary.

    Each section reads its own "*_latest.json" file from RESULTS_DIR and
    degrades to "(not available)" when the file is missing or malformed.
    """
    print(f"\n{SEP}")
    print("SAGE PIPELINE RESULTS")
    print(SEP)

    _print_natural_queries()
    _print_faithfulness()
    _print_human_eval()
    _print_grounding_delta()
    _print_quality_gate()
    _print_latency()

    print(f"\nResults: {RESULTS_DIR}/")
    print(SEP)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        # Deliberately broad catch: per the module docstring, this summary
        # must never fail the pipeline. Any error is reported on stderr and
        # swallowed, so the process still exits with status 0.
        print(f"\nSummary error: {exc}", file=sys.stderr)
|
|
|