Spaces:

Meta-HF-hackathon
/

updated-policy

Sleeping

App Files Files Community

updated-policy / training /report.py

srinjoyd

init

19f7f7b about 1 month ago

raw

history blame contribute delete

10.9 kB

	"""
	Report generator — turns the ablation results JSON (`ablation_results.json`
	by default) into the four paper tables and four behavioral metric plots.

	Tables (markdown — copy/pasteable into a paper or report):

	Table 1: Orchestrator vs fixed thresholds (Claim 1)
	Table 2: r_cross on/off (Claim 2)
	Table 3: Stage 2+3 only vs full Stage 4 (Claim 3)
	Table 4: Pool-D held-out generalization (Claim 4)

	Plots (matplotlib, optional — module is usable without matplotlib too):

	Plot 1: Stopping-distribution histogram per condition (Claim 1)
	Plot 2: P2 steps to correct patch — bar plot (Claim 2)
	Plot 3: Cumulative running mean (convergence proxy) (Claim 3)
	Plot 4: Confidence calibration curve, trained vs PE baseline (Claim 4)

	CLI:
	python -m incident_env.training.report \
	--input ablation_results.json --out report/

	Outputs:
	report/tables.md
	report/stopping_distribution.png
	report/p2_steps_to_correct.png
	report/convergence.png
	report/calibration.png
	"""

	from __future__ import annotations

	import argparse
	import json
	from pathlib import Path
	from typing import Any, Dict, List, Optional


	# ──────────────────────────────────────────────────────────────────────
	# Tables
	# ──────────────────────────────────────────────────────────────────────


	def _md_row(cells: List[Any]) -> str:
	return "\| " + " \| ".join(str(c) for c in cells) + " \|"


	def table_claim1(claim1: Dict[str, Any]) -> str:
	    """Render Table 1 (Claim 1) as Markdown.

	    Reads ``claim1["aggregate"]`` (required) for the per-condition rows and
	    the optional ``claim1["behavioral"]`` mapping for two extras: the median
	    position under ``"action_position[check_dependencies]"`` and the
	    ``"kl_pairwise"`` entries under ``"stopping_distribution"``.

	    Returns:
	        A heading, the table, and — only when pairwise KL values are
	        present — a bullet list of them, joined with newlines.
	    """
	    aggregate = claim1["aggregate"]
	    behavioral = claim1.get("behavioral", {})
	    stopping = behavioral.get("stopping_distribution", {})
	    kl_by_pair = stopping.get("kl_pairwise", {})
	    check_deps_pos = behavioral.get("action_position[check_dependencies]", {})

	    stat_keys = ("n", "mean_final", "mean_p1_steps", "mean_p2_steps")
	    columns = ["condition", *stat_keys, "check_deps_position (median)"]
	    lines = [
	        "## Table 1 — Orchestrator vs fixed thresholds (Claim 1)",
	        "",
	        _md_row(columns),
	        _md_row(["---"] * 6),
	    ]
	    lines.extend(
	        _md_row([
	            cond,
	            *(stats[key] for key in stat_keys),
	            # Conditions without a recorded position get a dash placeholder.
	            check_deps_pos.get(cond, {}).get("median_position", "—"),
	        ])
	        for cond, stats in aggregate.items()
	    )

	    if not kl_by_pair:
	        return "\n".join(lines)

	    lines += ["", "Pairwise KL between stopping-length distributions:", ""]
	    lines += [f"- `{pair}` → {kl}" for pair, kl in kl_by_pair.items()]
	    return "\n".join(lines)


	def table_claim2(claim2: Dict[str, Any]) -> str:
	    """Render Table 2 (Claim 2, r_cross ablation) as Markdown.

	    Reads ``claim2["aggregate"]`` (required). Every condition must provide
	    ``n``, ``mean_final``, ``mean_r_cross`` and ``mean_p2_steps``;
	    ``p2_steps_to_correct_patch`` is optional and shown as a dash if absent.

	    Returns:
	        The heading and table joined with newlines.
	    """
	    aggregate = claim2["aggregate"]
	    required = ("n", "mean_final", "mean_r_cross", "mean_p2_steps")
	    optional = "p2_steps_to_correct_patch"

	    lines = [
	        "## Table 2 — r_cross ablation (Claim 2)",
	        "",
	        _md_row(["condition", *required, optional]),
	        _md_row(["---"] * 6),
	    ]
	    lines.extend(
	        _md_row([
	            cond,
	            *(stats[key] for key in required),
	            stats.get(optional, "—"),
	        ])
	        for cond, stats in aggregate.items()
	    )
	    return "\n".join(lines)


	def table_claim3(claim3: Dict[str, Any]) -> str:
	    """Render Table 3 (Claim 3) as Markdown.

	    Reads ``claim3["aggregate"]`` (required) for the table rows and the
	    optional ``claim3["convergence_curve"]`` mapping, whose values are
	    printed verbatim in a bullet list below the table.

	    Returns:
	        The heading, the table and (when curves exist) the bullet list,
	        joined with newlines.
	    """
	    aggregate = claim3["aggregate"]
	    running_means = claim3.get("convergence_curve", {})
	    stat_keys = ("n", "mean_final", "stdev_final", "mean_p1_steps")

	    lines = [
	        "## Table 3 — Stage 2+3 only vs Full Stage 4 (Claim 3)",
	        "",
	        _md_row(["condition", *stat_keys]),
	        _md_row(["---"] * 5),
	    ]
	    lines.extend(
	        _md_row([cond, *(stats[key] for key in stat_keys)])
	        for cond, stats in aggregate.items()
	    )

	    if not running_means:
	        return "\n".join(lines)

	    lines += [
	        "",
	        "Cumulative running-mean curves (early-vs-late convergence proxy):",
	        "",
	    ]
	    lines += [f"- `{cond}` → {curve}" for cond, curve in running_means.items()]
	    return "\n".join(lines)


	def table_claim4(claim4: Dict[str, Any]) -> str:
	    """Render Table 4 (Claim 4, Pool-D held-out generalization) as Markdown.

	    Reads ``claim4["aggregate"]`` (required) for the per-condition rows and
	    looks up each condition's ``ece`` in the optional
	    ``claim4["behavioral"]["confidence_calibration"]`` mapping, falling back
	    to a dash when no value is recorded.

	    Returns:
	        The heading and table joined with newlines.
	    """
	    aggregate = claim4["aggregate"]
	    calibration = claim4.get("behavioral", {}).get("confidence_calibration", {})
	    stat_keys = ("n", "mean_final", "stdev_final")

	    lines = [
	        "## Table 4 — Pool-D held-out generalization (Claim 4)",
	        "",
	        _md_row(["condition", *stat_keys, "ECE"]),
	        _md_row(["---"] * 5),
	    ]
	    for cond, stats in aggregate.items():
	        # Calibration lookup happens before the aggregate stats are read.
	        ece_value = calibration.get(cond, {}).get("ece", "—")
	        lines.append(_md_row([cond, *(stats[key] for key in stat_keys), ece_value]))
	    return "\n".join(lines)


	def render_tables(report: Dict[str, Any]) -> str:
	    """Render every claim table present in ``report`` into one Markdown string.

	    Only the keys ``claim1`` .. ``claim4`` are consulted, in that order;
	    missing claims are skipped. Sections are separated by a blank line and
	    the result always ends with a single newline (so an empty report yields
	    just ``"\\n"``).
	    """

	    def _sections():
	        # Yield tables in claim order; absent claims contribute nothing.
	        if "claim1" in report:
	            yield table_claim1(report["claim1"])
	        if "claim2" in report:
	            yield table_claim2(report["claim2"])
	        if "claim3" in report:
	            yield table_claim3(report["claim3"])
	        if "claim4" in report:
	            yield table_claim4(report["claim4"])

	    return "\n\n".join(_sections()) + "\n"


	# ──────────────────────────────────────────────────────────────────────
	# Plots (optional — matplotlib import gated)
	# ──────────────────────────────────────────────────────────────────────


	def _try_matplotlib():
	try:
	import matplotlib
	matplotlib.use("Agg")
	import matplotlib.pyplot as plt # noqa
	return plt
	except ImportError:
	return None


	def plot_stopping_distribution(claim1: Dict[str, Any], out: Path) -> Optional[Path]:
	    """Save a grouped bar chart of the stopping distribution per condition.

	    Reads ``claim1["behavioral"]["stopping_distribution"]``; every key except
	    ``"kl_pairwise"`` is treated as a condition mapping bucket -> value.

	    Args:
	        claim1: The Claim 1 section of the report.
	        out: Destination image path passed to ``fig.savefig``.

	    Returns:
	        ``out`` once the figure has been saved, or ``None`` when matplotlib
	        is unavailable or there are no per-condition distributions.
	    """
	    plt = _try_matplotlib()
	    if plt is None:
	        return None
	    sd = claim1.get("behavioral", {}).get("stopping_distribution", {})
	    cond_dists = {k: v for k, v in sd.items() if k != "kl_pairwise"}
	    if not cond_dists:
	        return None

	    # Use the union of buckets across all conditions (first-seen order) so a
	    # bucket that only occurs in a later condition is not silently dropped.
	    seen: Dict[Any, None] = {}
	    for dist in cond_dists.values():
	        seen.update(dict.fromkeys(dist))
	    buckets = list(seen)

	    n_conds = len(cond_dists)  # >= 1 because of the guard above
	    width = 0.8 / n_conds
	    positions = range(len(buckets))

	    fig, ax = plt.subplots(figsize=(8, 4))
	    for i, (name, dist) in enumerate(cond_dists.items()):
	        ys = [dist.get(b, 0.0) for b in buckets]
	        xs = [j + i * width for j in positions]
	        ax.bar(xs, ys, width=width, label=name)

	    # Bars are centre-aligned on their x value, so group j spans the bar
	    # centres j .. j + (n_conds - 1) * width; the tick goes in the middle.
	    group_center = (n_conds - 1) * width / 2
	    ax.set_xticks([j + group_center for j in positions])
	    ax.set_xticklabels(buckets, rotation=30, ha="right")
	    ax.set_ylabel("Probability")
	    ax.set_title("Phase-1 length distribution per condition (Claim 1)")
	    ax.legend()
	    fig.tight_layout()
	    fig.savefig(out)
	    plt.close(fig)
	    return out


	def plot_p2_steps_bar(claim2: Dict[str, Any], out: Path) -> Optional[Path]:
	    """Save a bar chart of ``p2_steps_to_correct_patch`` per condition.

	    Reads ``claim2["aggregate"]`` (required); a condition without the metric
	    is plotted as ``0.0``. Each bar is annotated with its value to one
	    decimal place.

	    Returns:
	        ``out`` once the figure has been saved, or ``None`` when matplotlib
	        is unavailable.
	    """
	    pyplot = _try_matplotlib()
	    if pyplot is None:
	        return None

	    aggregate = claim2["aggregate"]
	    conditions = list(aggregate)
	    steps = [
	        stats.get("p2_steps_to_correct_patch", 0.0)
	        for stats in aggregate.values()
	    ]

	    figure, axes = pyplot.subplots(figsize=(5, 4))
	    axes.bar(conditions, steps, color=["#3a7", "#a73"])
	    axes.set_ylabel("Mean P2 steps to correct patch")
	    axes.set_title("Claim 2 — r_cross reduces P2 effort")
	    # Write each value slightly above the top of its bar.
	    for x_pos, height in enumerate(steps):
	        axes.text(x_pos, height + 0.05, f"{height:.1f}", ha="center")

	    figure.tight_layout()
	    figure.savefig(out)
	    pyplot.close(figure)
	    return out


	def plot_convergence(claim3: Dict[str, Any], out: Path) -> Optional[Path]:
	    """Save a line plot with one curve per entry of ``convergence_curve``.

	    Reads the optional ``claim3["convergence_curve"]`` mapping of
	    name -> sequence of values; each sequence is plotted against its index.

	    Returns:
	        ``out`` once the figure has been saved, or ``None`` when matplotlib
	        is unavailable or there are no curves.
	    """
	    pyplot = _try_matplotlib()
	    if pyplot is None:
	        return None

	    curves_by_name = claim3.get("convergence_curve", {})
	    if not curves_by_name:
	        return None

	    figure, axes = pyplot.subplots(figsize=(6, 4))
	    for label, series in curves_by_name.items():
	        block_index = range(len(series))
	        axes.plot(block_index, series, marker="o", label=label)

	    axes.set_title("Claim 3 — convergence curves")
	    axes.set_xlabel("rollout block (4 episodes each)")
	    axes.set_ylabel("running mean(final score)")
	    axes.legend()

	    figure.tight_layout()
	    figure.savefig(out)
	    pyplot.close(figure)
	    return out


	def plot_calibration(claim4: Dict[str, Any], out: Path) -> Optional[Path]:
	    """Save calibration curves (``mean_conf`` vs ``accuracy``) per condition.

	    Reads ``claim4["behavioral"]["confidence_calibration"]``. For every
	    condition only buckets with a truthy ``n`` are plotted; a condition with
	    no such buckets is skipped. The legend label carries the condition's
	    ``ece`` (``0`` if missing) and a dashed diagonal marks the ideal curve.

	    Returns:
	        ``out`` once the figure has been saved, or ``None`` when matplotlib
	        is unavailable or there is no calibration data.
	    """
	    pyplot = _try_matplotlib()
	    if pyplot is None:
	        return None

	    calibration = claim4.get("behavioral", {}).get("confidence_calibration", {})
	    if not calibration:
	        return None

	    figure, axes = pyplot.subplots(figsize=(5, 5))
	    for cond, stats in calibration.items():
	        # Empty buckets carry no accuracy estimate, so leave them out.
	        populated = [bucket for bucket in stats.get("buckets", []) if bucket.get("n")]
	        if not populated:
	            continue
	        confidences = [bucket["mean_conf"] for bucket in populated]
	        accuracies = [bucket["accuracy"] for bucket in populated]
	        curve_label = f"{cond} (ECE={stats.get('ece', 0):.3f})"
	        axes.plot(confidences, accuracies, marker="o", label=curve_label)

	    axes.plot([0, 1], [0, 1], color="gray", linestyle="--", label="ideal")
	    axes.set_xlabel("Declared confidence")
	    axes.set_ylabel("Empirical accuracy")
	    axes.set_title("Claim 4 — calibration on held-out (Pool D)")
	    axes.set_xlim(0, 1)
	    axes.set_ylim(0, 1)
	    axes.legend()

	    figure.tight_layout()
	    figure.savefig(out)
	    pyplot.close(figure)
	    return out


	# ──────────────────────────────────────────────────────────────────────
	# Top-level
	# ──────────────────────────────────────────────────────────────────────


	def render(report: Dict[str, Any], outdir: Path) -> Dict[str, Any]:
	    """Render the tables and the available plots into ``outdir``.

	    ``outdir`` is created if needed. ``tables.md`` is always written; a plot
	    is attempted only when its claim key is present in ``report`` and is
	    recorded only when the plot function returns a truthy result (the plot
	    functions return ``None`` when they have nothing to draw or matplotlib
	    is missing). Exceptions raised by a plot function are not caught.

	    Args:
	        report: Parsed report with optional ``claim1`` .. ``claim4`` keys.
	        outdir: Directory that receives ``tables.md`` and the PNG files.

	    Returns:
	        A manifest ``{"outdir": str, "files": {...}}`` where ``files`` maps
	        ``"tables"`` and each produced plot file name to its path.
	    """
	    outdir.mkdir(parents=True, exist_ok=True)
	    manifest: Dict[str, Any] = {"outdir": str(outdir), "files": {}}

	    # Tables. The Markdown contains non-ASCII characters (em dash, arrow),
	    # so write UTF-8 explicitly instead of relying on the locale encoding,
	    # which cannot represent them on some platforms.
	    md = render_tables(report)
	    table_path = outdir / "tables.md"
	    table_path.write_text(md, encoding="utf-8")
	    manifest["files"]["tables"] = str(table_path)

	    # Plots: (output file name, report key, plotting function).
	    plot_jobs = [
	        ("stopping_distribution.png", "claim1", plot_stopping_distribution),
	        ("p2_steps_to_correct.png", "claim2", plot_p2_steps_bar),
	        ("convergence.png", "claim3", plot_convergence),
	        ("calibration.png", "claim4", plot_calibration),
	    ]
	    for fname, claim_key, fn in plot_jobs:
	        if claim_key not in report:
	            continue
	        out = outdir / fname
	        result = fn(report[claim_key], out)
	        if result:
	            manifest["files"][fname] = str(out)

	    return manifest


	def main(argv: Optional[List[str]] = None) -> None:
	    """Command-line entry point: read the report JSON and render it.

	    Args:
	        argv: Argument list to parse; ``None`` (the default) makes argparse
	            read ``sys.argv[1:]``, so calling ``main()`` behaves as before.

	    Raises:
	        SystemExit: If the ``--input`` path is not an existing file (or on
	            argparse usage errors).
	    """
	    parser = argparse.ArgumentParser(
	        description=__doc__,
	        # Keep the module docstring's line breaks (tables, CLI example)
	        # instead of letting argparse re-wrap it into one paragraph.
	        formatter_class=argparse.RawDescriptionHelpFormatter,
	    )
	    parser.add_argument("--input", type=Path, default=Path("ablation_results.json"))
	    parser.add_argument("--out", type=Path, default=Path("report"))
	    args = parser.parse_args(argv)

	    # is_file() also rejects directories, which would otherwise fail later
	    # inside read_text() with a less helpful error.
	    if not args.input.is_file():
	        raise SystemExit(f"Input file not found: {args.input}")
	    # JSON is read as UTF-8 rather than with the platform's locale encoding.
	    report = json.loads(args.input.read_text(encoding="utf-8"))
	    manifest = render(report, args.out)
	    print(json.dumps(manifest, indent=2))


	if __name__ == "__main__":
	    main()