#!/usr/bin/env python3
"""Aggregate per-run JSONL result files into a single JSON summary.

Each input file is expected to hold one JSON object per line. Files are
grouped under a ``"<model>|<tier>"`` key, where the tier is inferred from
the file name and the model from the first row of the file.
"""
from __future__ import annotations

import argparse
import json
import math
from pathlib import Path
from typing import Any, Dict, Iterable, List, Optional, Tuple

# Tier names recognised in a file name, checked in this order; the first
# substring match wins.
_TIERS: Tuple[str, ...] = ("trivial", "easy", "standard")


def _load_jsonl(path: Path) -> List[Dict[str, Any]]:
    """Return the JSON objects stored one per line in *path*.

    A missing file yields an empty list. Blank (whitespace-only) lines are
    skipped. A malformed line raises ``json.JSONDecodeError``.
    """
    if not path.exists():
        return []
    text = path.read_text(encoding="utf-8")
    return [json.loads(line) for line in text.splitlines() if line.strip()]


def _mean_std(values: List[float]) -> Tuple[float, float]:
    """Return ``(mean, sample standard deviation)`` of *values*.

    An empty list gives ``(0.0, 0.0)``; a single value gives a standard
    deviation of ``0.0``. The variance uses the ``n - 1`` denominator.
    """
    if not values:
        return 0.0, 0.0
    mean = sum(values) / len(values)
    if len(values) < 2:
        return mean, 0.0
    var = sum((v - mean) ** 2 for v in values) / (len(values) - 1)
    return mean, math.sqrt(var)


def _tier_from_name(name: str) -> str:
    """Return the first tier in ``_TIERS`` contained in *name*, else "unknown"."""
    return next((tier for tier in _TIERS if tier in name), "unknown")


def _true_rate(flags: Iterable[Any], total: int) -> float:
    """Return the fraction of truthy entries in *flags* out of *total*."""
    return sum(1 for flag in flags if flag) / total


def summarize(paths: List[Path]) -> Dict[str, Any]:
    """Build per-file statistics keyed by ``"<model>|<tier>"``.

    Args:
        paths: JSONL files to summarize. Files that are missing or contain
            no rows are skipped.

    Returns:
        A mapping from ``"<model>|<tier>"`` to a dict of aggregate
        statistics for that file. If two files resolve to the same key,
        the later one in *paths* replaces the earlier one.
    """
    summary: Dict[str, Any] = {}
    for path in paths:
        rows = _load_jsonl(path)
        if not rows:
            continue

        # The model label is read from the first row only.
        model = rows[0].get("model", "unknown")
        tier = _tier_from_name(path.name)
        run_count = len(rows)

        rewards = [r.get("reward", 0.0) for r in rows]
        steps = [r.get("step_count", 0) for r in rows]
        submitted = [r.get("submitted_report", False) for r in rows]
        reward_mean, reward_std = _mean_std(rewards)
        step_mean, step_std = _mean_std([float(s) for s in steps])

        # ``or {}`` also covers rows whose "diagnostics" value is an
        # explicit null, which ``.get(key, {})`` would return as None.
        diag = [r.get("diagnostics") or {} for r in rows]
        evidence_seen = [d.get("evidence_seen_count", 0) for d in diag]
        evidence_content = [d.get("evidence_content_count", 0) for d in diag]
        containment_attempted = [
            d.get("containment_attempted", False) for d in diag
        ]

        summary[f"{model}|{tier}"] = {
            "model": model,
            "tier": tier,
            "runs": run_count,
            "reward_mean": reward_mean,
            "reward_std": reward_std,
            "reward_min": min(rewards),
            "reward_max": max(rewards),
            "step_mean": step_mean,
            "step_std": step_std,
            "step_min": min(steps),
            "step_max": max(steps),
            "report_submitted_rate": _true_rate(submitted, run_count),
            "evidence_seen_mean": sum(evidence_seen) / run_count,
            "evidence_content_mean": sum(evidence_content) / run_count,
            "containment_attempted_rate": _true_rate(
                containment_attempted, run_count
            ),
            "source_file": str(path),
        }
    return summary


def main(argv: Optional[List[str]] = None) -> int:
    """Summarize the files matching ``--glob`` and write ``--output``.

    Args:
        argv: Command-line arguments; ``None`` makes argparse read
            ``sys.argv[1:]``.

    Returns:
        The process exit code (always 0 on success).
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--glob", default="outputs/grid_*.jsonl")
    parser.add_argument("--output", default="outputs/baseline_grid_summary.json")
    args = parser.parse_args(argv)

    # Sorted so files are processed in a deterministic order.
    paths = sorted(Path(".").glob(args.glob))
    summary = summarize(paths)

    out_path = Path(args.output)
    # Create the destination directory so a fresh checkout does not fail.
    out_path.parent.mkdir(parents=True, exist_ok=True)
    out_path.write_text(json.dumps(summary, indent=2), encoding="utf-8")
    print(f"OK: wrote {out_path}")
    return 0


if __name__ == "__main__":
    raise SystemExit(main())