Spaces:

vxa8502
/

Sage

Running

File size: 6,891 Bytes

"""
Pipeline results summary.

Reads the latest evaluation result files and prints a scannable
pass/fail summary. Designed to run at the end of `make all`.

Graceful degradation: missing result files produce "(not available)"
sections rather than errors. This script never exits non-zero so it
cannot fail the pipeline.

Usage:
    python scripts/summary.py
"""

import json
import sys
from pathlib import Path

from sage.config import (
    EVAL_DIMENSIONS,
    FAITHFULNESS_TARGET,
    HELPFULNESS_TARGET,
    RESULTS_DIR,
)

WIDTH = 60
SEP = "=" * WIDTH


def load_json(path: Path) -> dict | None:
    """Load a JSON file, returning None if missing or malformed."""
    try:
        with open(path, encoding="utf-8") as f:
            return json.load(f)
    except (FileNotFoundError, json.JSONDecodeError):
        return None


def fmt(value: float | None, decimals: int = 4) -> str:
    if value is None:
        return "   ---"
    return f"{value:.{decimals}f}"


def print_section(title: str):
    print(f"\n{title}")


def main():
    print(f"\n{SEP}")
    print("SAGE PIPELINE RESULTS")
    print(SEP)

    # -- Recommendation Quality (Natural Queries) -----------------------------
    nat = load_json(RESULTS_DIR / "eval_natural_queries_latest.json")
    print_section("Recommendation Quality (Natural Queries):")
    if nat and "primary_metrics" in nat:
        m = nat["primary_metrics"]
        print(f"  NDCG@10:    {fmt(m.get('ndcg_at_10'))}")
        print(f"  Hit@10:     {fmt(m.get('hit_at_10'))}")
        print(f"  MRR:        {fmt(m.get('mrr'))}")
    else:
        print("  (not available)")

    # -- Explanation Faithfulness ---------------------------------------------
    faith = load_json(RESULTS_DIR / "faithfulness_latest.json")
    print_section("Explanation Faithfulness:")
    if faith and "hhem" in faith:
        n_samples = faith.get("n_samples", 0)

        # Multi-metric (primary): claim-level HHEM + quote verification
        mm = faith.get("multi_metric", {})
        claim_pass = mm.get("claim_level_pass_rate")
        claim_avg = mm.get("claim_level_avg_score")
        quote_rate = mm.get("quote_verification_rate")
        quotes_found = mm.get("quotes_found", 0)
        quotes_total = mm.get("quotes_total", 0)

        if claim_pass is not None:
            print(
                f"  Claim HHEM:     {fmt(claim_avg, 3)}  ({claim_pass * 100:.0f}% pass)"
            )
            print(
                f"  Quote Verif:    {fmt(quote_rate, 3)}  ({quotes_found}/{quotes_total})"
            )

        # Full-explanation HHEM (reference)
        h = faith["hhem"]
        n_grounded = n_samples - h.get("n_hallucinated", 0)
        full_avg = h.get("mean_score")
        print(
            f"  Full HHEM:      {fmt(full_avg, 3)}  ({n_grounded}/{n_samples} grounded, reference)"
        )

        # RAGAS if available
        ragas = faith.get("ragas", {})
        ragas_faith = ragas.get("faithfulness_mean")
        if ragas_faith is not None:
            print(f"  RAGAS Faith:    {fmt(ragas_faith, 3)}")

        # Pass/fail: use claim-level as primary, fall back to RAGAS, then full HHEM
        effective = (
            claim_avg
            if claim_avg is not None
            else (ragas_faith if ragas_faith is not None else full_avg)
        )
        if effective is not None:
            status = "PASS" if effective >= FAITHFULNESS_TARGET else "FAIL"
            print(f"  Target:         {FAITHFULNESS_TARGET:.3f}  [{status}]")
    else:
        print("  (not available)")

    # -- Human Evaluation ------------------------------------------------------
    human = load_json(RESULTS_DIR / "human_eval_latest.json")
    print_section("Human Evaluation:")
    if human and "dimensions" in human:
        n = human.get("n_samples", 0)
        dims = human["dimensions"]
        overall = human.get("overall_helpfulness")
        target = human.get("target", HELPFULNESS_TARGET)
        print(f"  Samples:        {n}")
        for dim_key in EVAL_DIMENSIONS:
            d = dims.get(dim_key, {})
            m = d.get("mean")
            label = dim_key.title()
            print(f"  {label + ':':<15s} {fmt(m, 2) if m is not None else '   ---'}")
        if overall is not None:
            status = "PASS" if human.get("pass", False) else "FAIL"
            print(
                f"  Helpfulness:    {fmt(overall, 2)}  (target: {target:.1f})  [{status}]"
            )
        corr = human.get("hhem_trust_correlation", {})
        r = corr.get("spearman_r")
        if r is not None:
            print(f"  HHEM-Trust r:   {fmt(r, 3)}  (p={corr.get('p_value', '?')})")
    else:
        print("  (not available)")

    # -- Grounding Delta -------------------------------------------------------
    delta = load_json(RESULTS_DIR / "grounding_delta_latest.json")
    print_section("Grounding Delta (RAG Impact):")
    if delta:
        with_ev = delta.get("with_evidence_mean")
        without_ev = delta.get("without_evidence_mean")
        d = delta.get("delta")
        n = delta.get("n_samples", 0)
        print(f"  With evidence:  {fmt(with_ev, 3)}")
        print(f"  Without:        {fmt(without_ev, 3)}")
        print(f"  Delta:          {fmt(d, 3)}  (+{d * 100:.0f}pp, n={n})")
    else:
        print("  (not available)")

    # -- Refusal Rate ----------------------------------------------------------
    adj = load_json(RESULTS_DIR / "adjusted_faithfulness_latest.json")
    print_section("Quality Gate (Refusals):")
    if adj:
        n_total = adj.get("n_total", 0)
        n_refusals = adj.get("n_refusals", 0)
        rate = n_refusals / n_total if n_total > 0 else 0
        adj_pass = adj.get("adjusted_pass_rate")
        print(f"  Refusals:       {n_refusals}/{n_total}  ({rate * 100:.0f}%)")
        print(f"  Adj Pass Rate:  {fmt(adj_pass, 3)}")
    else:
        print("  (not available)")

    # -- Load Test -------------------------------------------------------------
    load = load_json(RESULTS_DIR / "load_test_latest.json")
    print_section("Production Latency:")
    if load:
        p50 = load.get("p50_ms")
        p95 = load.get("p95_ms")
        p99 = load.get("p99_ms")
        n = load.get("total_requests", 0)
        hits = load.get("cache_hits", 0)
        hit_rate = hits / n if n > 0 else 0
        print(f"  P50:            {p50:.0f}ms")
        print(f"  P95:            {p95:.0f}ms")
        print(f"  P99:            {p99:.0f}ms  (n={n})")
        print(f"  Cache hits:     {hits}/{n}  ({hit_rate * 100:.0f}%)")
    else:
        print("  (not available)")

    # -- Footer ---------------------------------------------------------------
    print(f"\nResults: {RESULTS_DIR}/")
    print(SEP)


if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        # Never fail the pipeline
        print(f"\nSummary error: {exc}", file=sys.stderr)