| |
| """Compare two experiment runs across multiple problems. |
| |
| Usage: |
| # New layout: two run directories, each containing p0/, p1/, ... |
| python tasks/frontier_cs_entry/compare_experiments.py \ |
| results/frontier_cs_algorithmic/batch_g50_20260327_120000 \ |
| results/frontier_cs_algorithmic/agent_g50_20260327_130000 |
| |
| # Legacy layout: shared results root + variant B name (variant A via --variant-a, default "batch") |
| python tasks/frontier_cs_entry/compare_experiments.py \ |
| results/frontier_cs_algorithmic agent --legacy --variant-a batch |
| |
| # Options work with both layouts |
| python tasks/frontier_cs_entry/compare_experiments.py dir_a dir_b --problems p0 p1 p2 |
| python tasks/frontier_cs_entry/compare_experiments.py dir_a dir_b --csv results/comparison.csv |
| """ |
|
|
| import argparse |
| import json |
| import os |
| import re |
| import sqlite3 |
| import sys |
| from collections import defaultdict |
| from pathlib import Path |
|
|
|
|
def find_experiment_dirs_new(run_dir: str) -> dict[str, Path]:
    """Collect the per-problem sub-directories of a new-layout run directory.

    In the new layout a run directory contains one sub-directory per problem,
    named ``p0``, ``p1``, ... Entries that are not directories, or whose name
    does not have that form, are ignored.

    Args:
        run_dir: Path of the run directory to scan.

    Returns:
        Mapping of problem id (e.g. ``"p3"``) to that problem's directory,
        inserted in sorted-path order.

    Prints an error to stderr and exits with status 1 if ``run_dir`` does not
    exist.
    """
    root = Path(run_dir)
    if not root.exists():
        print(f"Error: {run_dir} does not exist", file=sys.stderr)
        sys.exit(1)

    name_pattern = re.compile(r"(p\d+)$")
    # Pair every sub-directory with its (possibly failed) name match, then keep
    # only the ones whose name looks like a problem id.
    candidates = (
        (name_pattern.match(entry.name), entry)
        for entry in sorted(root.iterdir())
        if entry.is_dir()
    )
    return {hit.group(1): entry for hit, entry in candidates if hit}
|
|
|
|
def find_experiment_dirs_legacy(results_dir: str, variant: str) -> dict[str, Path]:
    """Collect one experiment directory per problem from the legacy flat layout.

    Legacy experiment directories all live directly under ``results_dir`` and
    are named ``p{id}_{variant}_g{gens}_{timestamp}``. The variant name is
    matched literally (regex metacharacters in it are escaped).

    Args:
        results_dir: Shared results root to scan.
        variant: Variant name that must appear in the directory name.

    Returns:
        Mapping of problem id (e.g. ``"p3"``) to a matching directory. When
        several directories match the same problem, the one that sorts last
        by path is kept.

    Prints an error to stderr and exits with status 1 if ``results_dir`` does
    not exist.
    """
    root = Path(results_dir)
    if not root.exists():
        print(f"Error: {results_dir} does not exist", file=sys.stderr)
        sys.exit(1)

    name_pattern = re.compile(r"(p\d+)_(" + re.escape(variant) + r")_g\d+_\d+")
    found = {}
    for entry in sorted(root.iterdir()):
        hit = name_pattern.match(entry.name) if entry.is_dir() else None
        if hit is None:
            continue
        # Entries are visited in sorted order, so a later match for the same
        # problem id replaces the earlier one.
        found[hit.group(1)] = entry
    return found
|
|
|
|
def _resolve_score(combined_score, public_metrics_json):
    """Return the bounded score from ``public_metrics`` if usable, else ``combined_score``.

    ``public_metrics_json`` is expected to be a JSON object; its numeric
    ``score_bounded`` entry is preferred. If the JSON is missing, unparseable,
    not an object, or ``score_bounded`` is absent/null/non-numeric, the
    function falls back to ``combined_score`` (with NULL treated as 0.0).
    """
    if public_metrics_json:
        try:
            metrics = json.loads(public_metrics_json)
        except (ValueError, TypeError):
            # ValueError covers json.JSONDecodeError and undecodable bytes.
            metrics = None
        if isinstance(metrics, dict):
            bounded = metrics.get("score_bounded")
            if isinstance(bounded, (int, float)):
                return bounded
    return combined_score or 0.0


def get_scores_from_db(db_path: Path) -> "dict | None":
    """Extract score statistics from an evolution_db.sqlite.

    Uses score_bounded from public_metrics when available (to avoid
    unbounded scores >100 inflating comparisons). Falls back to
    combined_score if public_metrics is missing, unparseable, or does not
    carry a numeric score_bounded.

    Args:
        db_path: Path of the SQLite database; it must contain a ``programs``
            table with ``generation``, ``combined_score`` and
            ``public_metrics`` columns.

    Returns:
        ``None`` if ``db_path`` does not exist. ``{"error": message}`` if the
        database cannot be read or processed. Otherwise a dict with:

        - ``best_score``: highest score over all programs (floor of 0.0).
        - ``best_gen``: earliest generation reaching ``best_score``, or
          ``None`` when no program scored above 0.
        - ``max_gen``: highest generation present (0 for an empty table).
        - ``total_programs``: number of rows in ``programs``.
        - ``milestones``: cumulative best at generations 10/20/30/40/50, only
          for those generations that actually have programs.
        - ``trajectory``: list of ``(generation, cumulative_best)`` pairs in
          generation order.
    """
    if not db_path.exists():
        return None

    try:
        conn = sqlite3.connect(str(db_path))
        try:
            all_rows = conn.execute(
                "SELECT generation, combined_score, public_metrics FROM programs "
                "ORDER BY generation"
            ).fetchall()
        finally:
            # Always release the handle, including when the query fails.
            conn.close()

        # One pass: overall best (first row to reach it wins, rows are ordered
        # by generation) and the best score inside each generation.
        best_score = 0.0
        best_gen = None
        gen_best = {}
        for gen, combined, metrics_json in all_rows:
            score = _resolve_score(combined, metrics_json)
            if score > best_score:
                best_score, best_gen = score, gen
            if gen not in gen_best or score > gen_best[gen]:
                gen_best[gen] = score

        max_gen = max(gen_best, default=0)

        # Running maximum of the per-generation bests.
        cum_best = []
        running_max = float("-inf")
        for gen, score in sorted(gen_best.items()):
            if score > running_max:
                running_max = score
            cum_best.append((gen, running_max))

        milestone_gens = {10, 20, 30, 40, 50}
        milestones = {gen: best for gen, best in cum_best if gen in milestone_gens}

        return {
            "best_score": best_score or 0.0,
            "best_gen": best_gen,
            "max_gen": max_gen,
            "total_programs": len(all_rows),
            "milestones": milestones,
            "trajectory": cum_best,
        }
    except Exception as e:
        # Deliberate best-effort boundary: callers treat an "error" entry as
        # "no usable score" rather than aborting the whole comparison.
        return {"error": str(e)}
|
|
|
|
def format_score(score):
    """Format a score as a fixed-width string: 6 characters wide, 2 decimals.

    ``None`` (no score available) is rendered the same as a zero score, so
    every result has the same minimum width and columns stay aligned.

    Args:
        score: Numeric score, or ``None``.

    Returns:
        The formatted score, e.g. ``"  3.50"`` or ``"  0.00"``.
    """
    # `or 0.0` folds None (and any zero, including -0.0) into a plain 0.0.
    return f"{score or 0.0:6.2f}"
|
|
|
|
def _winner_label(diff, name_a, name_b):
    """Return the winner label for a score difference (B minus A); |diff| < 0.01 is a tie."""
    if abs(diff) < 0.01:
        return "tie"
    return name_b if diff > 0 else name_a


def _best_of(stats):
    """Return ``(best_score, best_gen)`` from a stats dict, or ``(0.0, None)`` if it has no score."""
    if stats and "best_score" in stats:
        return stats["best_score"], stats.get("best_gen")
    return 0.0, None


def _write_csv(csv_path, rows, name_a, name_b):
    """Write the comparison rows to ``csv_path``, creating parent directories as needed."""
    import csv  # local import: only needed when --csv is requested

    csv_path.parent.mkdir(parents=True, exist_ok=True)
    # newline="" plus an explicit "\n" terminator keeps plain LF line endings
    # while letting the csv module quote names that contain commas or quotes.
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(
            [
                "problem",
                f"{name_a}_score",
                f"{name_a}_best_gen",
                f"{name_b}_score",
                f"{name_b}_best_gen",
                "diff",
                "winner",
            ]
        )
        for r in rows:
            writer.writerow(
                [
                    r["pid"],
                    f"{r['score_a']:.4f}",
                    "" if r["gen_a"] is None else r["gen_a"],
                    f"{r['score_b']:.4f}",
                    "" if r["gen_b"] is None else r["gen_b"],
                    f"{r['diff']:.4f}",
                    r["winner"],
                ]
            )


def main():
    """Command-line entry point: compare the best scores of two runs per problem.

    Parses the command line, locates the per-problem directories of both runs
    (new layout: two run directories; legacy layout: one results root and two
    variant names), reads each problem's ``evolution_db.sqlite``, prints a
    comparison table with averages/totals and win counts to stdout, and
    optionally exports the rows to CSV.

    A problem whose database is missing or unreadable is scored 0.0 for that
    run; unreadable databases are additionally reported on stderr. Exits with
    status 1 when no problem matches.
    """
    parser = argparse.ArgumentParser(
        description="Compare two experiment runs across problems"
    )
    parser.add_argument("dir_a", help="First run directory (or legacy results_dir)")
    parser.add_argument("dir_b", help="Second run directory (or legacy variant_b name)")
    parser.add_argument(
        "--legacy", action="store_true",
        help="Legacy mode: dir_a is shared results root, dir_b is variant_b name. "
             "Use --variant-a to name variant_a (default: batch).",
    )
    parser.add_argument("--variant-a", type=str, default=None, help="Legacy: variant_a name")
    parser.add_argument(
        "--problems", nargs="*", help="Only compare these problem IDs (e.g., p0 p1)"
    )
    parser.add_argument("--csv", help="Export results to CSV file")
    parser.add_argument(
        "--sort",
        choices=["problem", "diff", "score_a", "score_b"],
        default="problem",
        help="Sort order for output table",
    )
    args = parser.parse_args()

    # Resolve the display names and the per-problem directories once.
    if args.legacy:
        va = args.variant_a or "batch"
        vb = args.dir_b
        dirs_a = find_experiment_dirs_legacy(args.dir_a, va)
        dirs_b = find_experiment_dirs_legacy(args.dir_a, vb)
    else:
        va = Path(args.dir_a).name
        vb = Path(args.dir_b).name
        dirs_a = find_experiment_dirs_new(args.dir_a)
        dirs_b = find_experiment_dirs_new(args.dir_b)

    # Union of problems from both runs, ordered by numeric id (p2 before p10).
    all_problems = sorted(set(dirs_a) | set(dirs_b), key=lambda pid: int(pid[1:]))
    if args.problems:
        wanted = set(args.problems)
        all_problems = [p for p in all_problems if p in wanted]

    if not all_problems:
        print("No matching problems found.", file=sys.stderr)
        sys.exit(1)

    rows = []
    for pid in all_problems:
        stats_a = (
            get_scores_from_db(dirs_a[pid] / "evolution_db.sqlite")
            if pid in dirs_a else None
        )
        stats_b = (
            get_scores_from_db(dirs_b[pid] / "evolution_db.sqlite")
            if pid in dirs_b else None
        )

        # An unreadable database still counts as score 0, but say so instead
        # of silently presenting it as a genuine zero.
        for label, stats in ((va, stats_a), (vb, stats_b)):
            if stats and "error" in stats:
                print(
                    f"Warning: {pid} ({label}): could not read scores: {stats['error']}",
                    file=sys.stderr,
                )

        score_a, gen_a = _best_of(stats_a)
        score_b, gen_b = _best_of(stats_b)
        diff = score_b - score_a
        rows.append(
            {
                "pid": pid,
                "score_a": score_a,
                "score_b": score_b,
                "diff": diff,
                "gen_a": gen_a,
                "gen_b": gen_b,
                "stats_a": stats_a,
                "stats_b": stats_b,
                "winner": _winner_label(diff, va, vb),
            }
        )

    # "problem" order is already in place; the other orders are descending.
    if args.sort in ("diff", "score_a", "score_b"):
        rows.sort(key=lambda r: r[args.sort], reverse=True)

    # Score columns grow with the run names so header and rows stay aligned.
    width_a = max(10, len(va))
    width_b = max(10, len(vb))
    header = (
        f"{'Problem':>8} {va:>{width_a}} {'gen':>4} "
        f"{vb:>{width_b}} {'gen':>4} {'diff':>8} {'winner':>8}"
    )
    sep = "-" * len(header)
    print(f"\n Comparison: {va} vs {vb}")
    print(f" Dir A: {args.dir_a}")
    print(f" Dir B: {args.dir_b}")
    print(f" Problems: {len(rows)}\n")
    print(header)
    print(sep)

    wins_a, wins_b, ties = 0, 0, 0
    sum_a, sum_b, sum_diff = 0.0, 0.0, 0.0
    count_scored = 0

    for r in rows:
        gen_a_str = f"{r['gen_a']:4d}" if r["gen_a"] is not None else f"{'-':>4}"
        gen_b_str = f"{r['gen_b']:4d}" if r["gen_b"] is not None else f"{'-':>4}"

        if r["winner"] == "tie" and abs(r["diff"]) < 0.01:
            ties += 1
        elif r["diff"] > 0:
            wins_b += 1
        else:
            wins_a += 1

        diff_str = f"{r['diff']:+8.2f}"
        print(
            f"{r['pid']:>8} {format_score(r['score_a']):>{width_a}} {gen_a_str} "
            f"{format_score(r['score_b']):>{width_b}} {gen_b_str} {diff_str} {r['winner']:>8}"
        )

        # Problems where neither run scored anything are left out of the
        # averages and totals.
        if r["score_a"] > 0 or r["score_b"] > 0:
            sum_a += r["score_a"]
            sum_b += r["score_b"]
            sum_diff += r["diff"]
            count_scored += 1

    print(sep)

    if count_scored > 0:
        avg_a = sum_a / count_scored
        avg_b = sum_b / count_scored
        avg_diff = sum_diff / count_scored
        blank_gen = " " * 4  # keeps summary values under their score columns
        print(
            f"{'avg':>8} {avg_a:{width_a}.2f} {blank_gen} "
            f"{avg_b:{width_b}.2f} {blank_gen} {avg_diff:+8.2f}"
        )
        print(
            f"{'total':>8} {sum_a:{width_a}.2f} {blank_gen} "
            f"{sum_b:{width_b}.2f} {blank_gen} {sum_diff:+8.2f}"
        )

    print(f"\n Wins: {va}={wins_a}, {vb}={wins_b}, ties={ties}")

    if count_scored > 0:
        print(f" Average score: {va}={sum_a/count_scored:.2f}, {vb}={sum_b/count_scored:.2f} (diff={sum_diff/count_scored:+.2f})")

    if args.csv:
        csv_path = Path(args.csv)
        _write_csv(csv_path, rows, va, vb)
        print(f"\n CSV exported to {csv_path}")

    print()
|
|
|
|
# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
|