|
|
""" |
|
|
Pipeline results summary. |
|
|
|
|
|
Reads the latest evaluation result files and prints a scannable |
|
|
pass/fail summary. Designed to run at the end of `make all`. |
|
|
|
|
|
Graceful degradation: missing result files produce "(not available)" |
|
|
sections rather than errors. This script never exits non-zero so it |
|
|
cannot fail the pipeline. |
|
|
|
|
|
Usage: |
|
|
python scripts/summary.py |
|
|
""" |
|
|
|
|
|
import json |
|
|
import sys |
|
|
from pathlib import Path |
|
|
|
|
|
from sage.config import ( |
|
|
EVAL_DIMENSIONS, |
|
|
FAITHFULNESS_TARGET, |
|
|
HELPFULNESS_TARGET, |
|
|
RESULTS_DIR, |
|
|
) |
|
|
|
|
|
# Character width of the banner/separator rule used throughout the report.
WIDTH = 60

# Horizontal rule printed above and below the report title and at the end.
SEP = "=" * WIDTH
|
|
|
|
|
|
|
|
def load_json(path: Path) -> dict | None: |
|
|
"""Load a JSON file, returning None if missing or malformed.""" |
|
|
try: |
|
|
with open(path, encoding="utf-8") as f: |
|
|
return json.load(f) |
|
|
except (FileNotFoundError, json.JSONDecodeError): |
|
|
return None |
|
|
|
|
|
|
|
|
def fmt(value: float | None, decimals: int = 4) -> str: |
|
|
if value is None: |
|
|
return " ---" |
|
|
return f"{value:.{decimals}f}" |
|
|
|
|
|
|
|
|
def print_section(title: str):
    """Emit a blank spacer line followed by *title* as a section header."""
    print("\n" + title)
|
|
|
|
|
|
|
|
def _print_natural_queries() -> None:
    """Section: ranking metrics (NDCG/Hit/MRR) from the natural-query eval."""
    nat = load_json(RESULTS_DIR / "eval_natural_queries_latest.json")
    print_section("Recommendation Quality (Natural Queries):")
    if nat and "primary_metrics" in nat:
        m = nat["primary_metrics"]
        print(f" NDCG@10: {fmt(m.get('ndcg_at_10'))}")
        print(f" Hit@10: {fmt(m.get('hit_at_10'))}")
        print(f" MRR: {fmt(m.get('mrr'))}")
    else:
        print(" (not available)")


def _print_faithfulness() -> None:
    """Section: explanation faithfulness.

    Prints claim-level HHEM + quote verification (when available),
    full-response HHEM, optional RAGAS faithfulness, and a PASS/FAIL line
    against FAITHFULNESS_TARGET using the best available metric.
    """
    faith = load_json(RESULTS_DIR / "faithfulness_latest.json")
    print_section("Explanation Faithfulness:")
    if not (faith and "hhem" in faith):
        print(" (not available)")
        return

    n_samples = faith.get("n_samples", 0)

    mm = faith.get("multi_metric", {})
    claim_pass = mm.get("claim_level_pass_rate")
    claim_avg = mm.get("claim_level_avg_score")
    quote_rate = mm.get("quote_verification_rate")
    quotes_found = mm.get("quotes_found", 0)
    quotes_total = mm.get("quotes_total", 0)

    if claim_pass is not None:
        print(
            f" Claim HHEM: {fmt(claim_avg, 3)} ({claim_pass * 100:.0f}% pass)"
        )
        print(
            f" Quote Verif: {fmt(quote_rate, 3)} ({quotes_found}/{quotes_total})"
        )

    h = faith["hhem"]
    n_grounded = n_samples - h.get("n_hallucinated", 0)
    full_avg = h.get("mean_score")
    print(
        f" Full HHEM: {fmt(full_avg, 3)} ({n_grounded}/{n_samples} grounded, reference)"
    )

    ragas_faith = faith.get("ragas", {}).get("faithfulness_mean")
    if ragas_faith is not None:
        print(f" RAGAS Faith: {fmt(ragas_faith, 3)}")

    # Metric preference order: claim-level HHEM > RAGAS > full-response HHEM.
    effective = (
        claim_avg
        if claim_avg is not None
        else (ragas_faith if ragas_faith is not None else full_avg)
    )
    if effective is not None:
        status = "PASS" if effective >= FAITHFULNESS_TARGET else "FAIL"
        print(f" Target: {FAITHFULNESS_TARGET:.3f} [{status}]")


def _print_human_eval() -> None:
    """Section: human evaluation scores per dimension plus overall helpfulness
    and the HHEM-trust correlation (when computed)."""
    human = load_json(RESULTS_DIR / "human_eval_latest.json")
    print_section("Human Evaluation:")
    if not (human and "dimensions" in human):
        print(" (not available)")
        return

    n = human.get("n_samples", 0)
    dims = human["dimensions"]
    overall = human.get("overall_helpfulness")
    target = human.get("target", HELPFULNESS_TARGET)
    print(f" Samples: {n}")
    for dim_key in EVAL_DIMENSIONS:
        mean = dims.get(dim_key, {}).get("mean")
        label = dim_key.title()
        # fmt() already renders None as " ---", so no separate fallback needed.
        print(f" {label + ':':<15s} {fmt(mean, 2)}")
    if overall is not None:
        status = "PASS" if human.get("pass", False) else "FAIL"
        print(
            f" Helpfulness: {fmt(overall, 2)} (target: {target:.1f}) [{status}]"
        )
    corr = human.get("hhem_trust_correlation", {})
    r = corr.get("spearman_r")
    if r is not None:
        print(f" HHEM-Trust r: {fmt(r, 3)} (p={corr.get('p_value', '?')})")


def _print_grounding_delta() -> None:
    """Section: faithfulness with vs. without retrieved evidence."""
    delta = load_json(RESULTS_DIR / "grounding_delta_latest.json")
    print_section("Grounding Delta (RAG Impact):")
    if not delta:
        print(" (not available)")
        return

    with_ev = delta.get("with_evidence_mean")
    without_ev = delta.get("without_evidence_mean")
    d = delta.get("delta")
    n = delta.get("n_samples", 0)
    print(f" With evidence: {fmt(with_ev, 3)}")
    print(f" Without: {fmt(without_ev, 3)}")
    if d is not None:
        # ":+.0f" carries the sign, so negative deltas no longer print "+-Npp"
        # (the old hard-coded "+" did); also avoids TypeError when d is None.
        print(f" Delta: {fmt(d, 3)} ({d * 100:+.0f}pp, n={n})")
    else:
        print(f" Delta: {fmt(d, 3)} (n={n})")


def _print_quality_gate() -> None:
    """Section: refusal counts and refusal-adjusted pass rate."""
    adj = load_json(RESULTS_DIR / "adjusted_faithfulness_latest.json")
    print_section("Quality Gate (Refusals):")
    if not adj:
        print(" (not available)")
        return

    n_total = adj.get("n_total", 0)
    n_refusals = adj.get("n_refusals", 0)
    rate = n_refusals / n_total if n_total > 0 else 0
    print(f" Refusals: {n_refusals}/{n_total} ({rate * 100:.0f}%)")
    print(f" Adj Pass Rate: {fmt(adj.get('adjusted_pass_rate'), 3)}")


def _print_latency() -> None:
    """Section: load-test latency percentiles and cache hit rate."""
    load = load_json(RESULTS_DIR / "load_test_latest.json")
    print_section("Production Latency:")
    if not load:
        print(" (not available)")
        return

    def ms(v: float | None) -> str:
        # Guard missing percentiles: f"{None:.0f}" would raise TypeError
        # and break the script's never-fail contract.
        return f"{v:.0f}ms" if v is not None else "---"

    n = load.get("total_requests", 0)
    hits = load.get("cache_hits", 0)
    hit_rate = hits / n if n > 0 else 0
    print(f" P50: {ms(load.get('p50_ms'))}")
    print(f" P95: {ms(load.get('p95_ms'))}")
    print(f" P99: {ms(load.get('p99_ms'))} (n={n})")
    print(f" Cache hits: {hits}/{n} ({hit_rate * 100:.0f}%)")


def main():
    """Print the full pipeline results summary.

    Each section reads its own "*_latest.json" file from RESULTS_DIR and
    degrades to "(not available)" when the file is missing or malformed.
    """
    print(f"\n{SEP}")
    print("SAGE PIPELINE RESULTS")
    print(SEP)

    _print_natural_queries()
    _print_faithfulness()
    _print_human_eval()
    _print_grounding_delta()
    _print_quality_gate()
    _print_latency()

    print(f"\nResults: {RESULTS_DIR}/")
    print(SEP)
|
|
|
|
|
|
|
|
if __name__ == "__main__":
    try:
        main()
    except Exception as exc:
        # Deliberately broad catch: per the module docstring, this summary
        # must never fail the pipeline. Any error is reported on stderr and
        # swallowed, so the process still exits with status 0.
        print(f"\nSummary error: {exc}", file=sys.stderr)
|
|
|