File size: 10,905 Bytes

2facf1f

#!/usr/bin/env python3
"""Compare two experiment runs across multiple problems.

Usage:
    # New layout: two run directories, each containing p0/, p1/, ...
    python tasks/frontier_cs_entry/compare_experiments.py \
        results/frontier_cs_algorithmic/batch_g50_20260327_120000 \
        results/frontier_cs_algorithmic/agent_g50_20260327_130000

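    # Legacy layout: shared results root + variant names
    # (positional args are the root and variant B; variant A comes from
    # --variant-a and defaults to "batch")
    python tasks/frontier_cs_entry/compare_experiments.py \
        results/frontier_cs_algorithmic agent --legacy --variant-a batch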

    # Options work with both layouts
    python tasks/frontier_cs_entry/compare_experiments.py dir_a dir_b --problems p0 p1 p2
    python tasks/frontier_cs_entry/compare_experiments.py dir_a dir_b --csv results/comparison.csv
"""

import argparse
import json
import os
import re
import sqlite3
import sys
from collections import defaultdict
from pathlib import Path


def find_experiment_dirs_new(run_dir: str) -> dict[str, Path]:
    """Find problem directories in a run directory (new layout: run_dir/p0/, p1/, ...).

    Args:
        run_dir: Path to a run directory whose immediate children are
            per-problem directories named ``p<digits>``.

    Returns:
        Dict mapping problem_id (e.g. ``"p0"``) -> directory path, inserted in
        sorted path order. Children that are not directories, or whose names
        are not exactly ``p<digits>``, are ignored.

    Exits the process with status 1 (after printing to stderr) when
    ``run_dir`` does not exist or is not a directory.
    """
    run_path = Path(run_dir)
    if not run_path.exists():
        print(f"Error: {run_dir} does not exist", file=sys.stderr)
        sys.exit(1)
    if not run_path.is_dir():
        # iterdir() on a regular file raises NotADirectoryError; report it the
        # same way as a missing path instead of dumping a traceback.
        print(f"Error: {run_dir} is not a directory", file=sys.stderr)
        sys.exit(1)

    return {
        d.name: d
        for d in sorted(run_path.iterdir())
        if d.is_dir() and re.fullmatch(r"p\d+", d.name)
    }


def find_experiment_dirs_legacy(results_dir: str, variant: str) -> dict[str, Path]:
    """Find experiment directories in legacy flat layout.

    Pattern: results_dir/p{id}_{variant}_g{gens}_{timestamp}/
    where the timestamp is one or more underscore-separated digit groups
    (e.g. ``20260327_120000``).

    Args:
        results_dir: Shared results root containing all runs side by side.
        variant: Variant name to select; matched literally.

    Returns:
        Dict mapping problem_id -> directory path. When several runs exist for
        the same problem and variant, the one with the greatest timestamp is
        returned.

    Exits the process with status 1 (after printing to stderr) when
    ``results_dir`` does not exist or is not a directory.
    """
    results_path = Path(results_dir)
    if not results_path.exists():
        print(f"Error: {results_dir} does not exist", file=sys.stderr)
        sys.exit(1)
    if not results_path.is_dir():
        # iterdir() on a regular file raises NotADirectoryError; report it the
        # same way as a missing path instead of dumping a traceback.
        print(f"Error: {results_dir} is not a directory", file=sys.stderr)
        sys.exit(1)

    pattern = re.compile(
        r"(p\d+)_" + re.escape(variant) + r"_g\d+_(\d+(?:_\d+)*)"
    )

    # pid -> (timestamp key, directory). Name order alone is not a reliable
    # proxy for recency: "..._g5_<ts>" sorts after "..._g50_<ts>" regardless of
    # the timestamps, so the timestamp is compared explicitly.
    latest = {}
    for d in sorted(results_path.iterdir()):
        if not d.is_dir():
            continue
        match = pattern.match(d.name)
        if not match:
            continue
        pid = match.group(1)
        stamp = tuple(int(part) for part in match.group(2).split("_"))
        # ">=" keeps the last-sorted directory when timestamps are equal.
        if pid not in latest or stamp >= latest[pid][0]:
            latest[pid] = (stamp, d)

    return {pid: path for pid, (_, path) in latest.items()}


def get_scores_from_db(db_path: Path) -> "dict | None":
    """Extract score statistics from an evolution_db.sqlite.

    Uses score_bounded from public_metrics when available (to avoid
    unbounded scores >100 inflating comparisons). Falls back to
    combined_score if public_metrics is missing, unparseable, not a JSON
    object, or carries a non-numeric score_bounded (e.g. null).

    Args:
        db_path: Path to the SQLite file. It must contain a ``programs`` table
            with ``generation``, ``combined_score`` and ``public_metrics``
            columns.

    Returns:
        ``None`` if ``db_path`` does not exist. On any failure while reading or
        processing the database, ``{"error": <message>}``. Otherwise a dict
        with:

        - ``best_score``: highest positive score seen (0.0 if none).
        - ``best_gen``: generation of that score, or ``None`` if no score
          exceeded 0.
        - ``max_gen``: largest generation number (0 for an empty table).
        - ``total_programs``: number of rows read.
        - ``milestones``: cumulative best at generations 10/20/30/40/50, only
          for milestones that have at least one program in that generation.
        - ``trajectory``: list of ``(generation, cumulative_best)`` tuples.
    """
    if not db_path.exists():
        return None

    milestone_gens = (10, 20, 30, 40, 50)

    def _extract_score(combined_score, public_metrics_json):
        """Return bounded score if available and numeric, else combined_score."""
        if public_metrics_json:
            try:
                pm = json.loads(public_metrics_json)
            except (ValueError, TypeError):
                pm = None
            if isinstance(pm, dict):
                bounded = pm.get("score_bounded")
                # A null / non-numeric value would break the comparisons
                # below, so treat it as "not available".
                if isinstance(bounded, (int, float)) and not isinstance(bounded, bool):
                    return bounded
        return combined_score or 0.0

    try:
        conn = sqlite3.connect(str(db_path))
        try:
            all_rows = conn.execute(
                "SELECT generation, combined_score, public_metrics FROM programs "
                "ORDER BY generation"
            ).fetchall()
        finally:
            # Release the file handle even when the query fails.
            conn.close()

        scored = [(gen, _extract_score(cs, pm)) for gen, cs, pm in all_rows]

        # Best score overall (only strictly positive scores count) and the
        # best score within each individual generation.
        best_score = 0.0
        best_gen = None
        gen_best = {}
        for gen, s in scored:
            if s > best_score:
                best_score = s
                best_gen = gen
            if gen not in gen_best or s > gen_best[gen]:
                gen_best[gen] = s

        max_gen = max((gen for gen, _ in scored), default=0)

        # Cumulative best trajectory across generations.
        cum_best = []
        running_max = float("-inf")
        for gen, score in sorted(gen_best.items()):
            if score > running_max:
                running_max = score
            cum_best.append((gen, running_max))

        # Score at generation milestones (exact generation match only).
        milestones = {gen: cb for gen, cb in cum_best if gen in milestone_gens}

        return {
            "best_score": best_score,
            "best_gen": best_gen,
            "max_gen": max_gen,
            "total_programs": len(all_rows),
            "milestones": milestones,
            "trajectory": cum_best,
        }
    except Exception as e:
        # Best-effort reader: callers expect an error dict rather than an
        # exception for unreadable or malformed databases.
        return {"error": str(e)}


def format_score(score):
    """Render a score right-aligned in 6 columns with 2 decimals.

    ``None`` and zero both render as ``"  0.00"``.
    """
    if score is not None and score != 0:
        return f"{score:6.2f}"
    return "  0.00"


def _outcome(diff):
    """Classify a score difference (B minus A) as "tie", "a", or "b".

    Differences smaller than 0.01 in magnitude count as a tie.
    """
    if abs(diff) < 0.01:
        return "tie"
    return "b" if diff > 0 else "a"


def _export_csv(csv_path, rows, labels):
    """Write one CSV line per compared problem to ``csv_path``.

    ``labels`` maps the outcome keys "a", "b" and "tie" to display names.
    Parent directories are created as needed.
    """
    import csv  # local import: only needed when --csv is given

    va, vb = labels["a"], labels["b"]
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    # csv.writer quotes fields when required, so run names containing commas
    # or quotes cannot corrupt the file. newline="" plus an explicit "\n"
    # terminator keeps line endings identical on every platform.
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(
            [
                "problem",
                f"{va}_score",
                f"{va}_best_gen",
                f"{vb}_score",
                f"{vb}_best_gen",
                "diff",
                "winner",
            ]
        )
        for r in rows:
            writer.writerow(
                [
                    r["pid"],
                    f"{r['score_a']:.4f}",
                    "" if r["gen_a"] is None else r["gen_a"],
                    f"{r['score_b']:.4f}",
                    "" if r["gen_b"] is None else r["gen_b"],
                    f"{r['diff']:.4f}",
                    labels[r["outcome"]],
                ]
            )


def main():
    """Command-line entry point: compare two runs and print a per-problem table.

    Reads arguments from ``sys.argv``, locates the problem directories of both
    runs (new or legacy layout), loads best scores from each problem's
    ``evolution_db.sqlite``, prints a comparison table plus summary to stdout,
    and optionally writes the rows to a CSV file.

    Databases that exist but cannot be read are reported on stderr and scored
    as 0.0. Exits with status 1 when no problems match.
    """
    parser = argparse.ArgumentParser(
        description="Compare two experiment runs across problems"
    )
    parser.add_argument("dir_a", help="First run directory (or legacy results_dir)")
    parser.add_argument("dir_b", help="Second run directory (or legacy variant_b name)")
    parser.add_argument(
        "--legacy", action="store_true",
        help="Legacy mode: dir_a is shared results root, dir_b is variant_b name. "
             "Use --variant-a to name variant A (default: batch).",
    )
    parser.add_argument("--variant-a", type=str, default=None, help="Legacy: variant_a name")
    parser.add_argument(
        "--problems", nargs="*", help="Only compare these problem IDs (e.g., p0 p1)"
    )
    parser.add_argument("--csv", help="Export results to CSV file")
    parser.add_argument(
        "--sort",
        choices=["problem", "diff", "score_a", "score_b"],
        default="problem",
        help="Sort order for output table",
    )
    args = parser.parse_args()

    # Resolve display names and find experiment directories.
    if args.legacy:
        va = args.variant_a or "batch"
        vb = args.dir_b
        dirs_a = find_experiment_dirs_legacy(args.dir_a, va)
        dirs_b = find_experiment_dirs_legacy(args.dir_a, vb)
    else:
        va = Path(args.dir_a).name
        vb = Path(args.dir_b).name
        dirs_a = find_experiment_dirs_new(args.dir_a)
        dirs_b = find_experiment_dirs_new(args.dir_b)
    labels = {"a": va, "b": vb, "tie": "tie"}

    all_problems = sorted(
        set(dirs_a) | set(dirs_b),
        key=lambda x: int(x[1:]),  # Sort by numeric ID
    )

    if args.problems:
        wanted = set(args.problems)
        all_problems = [p for p in all_problems if p in wanted]

    if not all_problems:
        print("No matching problems found.", file=sys.stderr)
        sys.exit(1)

    # Collect results
    rows = []
    for pid in all_problems:
        stats_a = (
            get_scores_from_db(dirs_a[pid] / "evolution_db.sqlite")
            if pid in dirs_a
            else None
        )
        stats_b = (
            get_scores_from_db(dirs_b[pid] / "evolution_db.sqlite")
            if pid in dirs_b
            else None
        )

        # Surface unreadable databases instead of silently scoring them 0.
        for label, stats in ((va, stats_a), (vb, stats_b)):
            if stats and "error" in stats:
                print(
                    f"Warning: {pid} [{label}]: could not read scores: {stats['error']}",
                    file=sys.stderr,
                )

        score_a = (stats_a or {}).get("best_score", 0.0)
        score_b = (stats_b or {}).get("best_score", 0.0)
        diff = score_b - score_a

        rows.append(
            {
                "pid": pid,
                "score_a": score_a,
                "score_b": score_b,
                "diff": diff,
                "gen_a": stats_a.get("best_gen") if stats_a else None,
                "gen_b": stats_b.get("best_gen") if stats_b else None,
                "stats_a": stats_a,
                "stats_b": stats_b,
                "outcome": _outcome(diff),
            }
        )

    # Sort (the default "problem" order is already numeric by ID).
    if args.sort != "problem":
        rows.sort(key=lambda r: r[args.sort], reverse=True)

    # Print table. Column widths grow with the run names so that rows stay
    # aligned under long headers such as "batch_g50_20260327_120000".
    w_a = max(10, len(va))
    w_b = max(10, len(vb))
    w_win = max(8, len(va), len(vb))
    header = (
        f"{'Problem':>8}  {va:>{w_a}}  {'gen':>4}  {vb:>{w_b}}  {'gen':>4}  "
        f"{'diff':>8}  {'winner':>{w_win}}"
    )
    sep = "-" * len(header)
    print(f"\n  Comparison: {va} vs {vb}")
    print(f"  Dir A: {args.dir_a}")
    print(f"  Dir B: {args.dir_b}")
    print(f"  Problems: {len(rows)}\n")
    print(header)
    print(sep)

    tally = {"a": 0, "b": 0, "tie": 0}
    sum_a, sum_b, sum_diff = 0.0, 0.0, 0.0
    counted = 0  # problems where at least one side has a positive score

    for r in rows:
        gen_a_str = f"{r['gen_a']:4d}" if r["gen_a"] is not None else "   -"
        gen_b_str = f"{r['gen_b']:4d}" if r["gen_b"] is not None else "   -"
        tally[r["outcome"]] += 1

        print(
            f"{r['pid']:>8}  {format_score(r['score_a']):>{w_a}}  {gen_a_str}  "
            f"{format_score(r['score_b']):>{w_b}}  {gen_b_str}  "
            f"{r['diff']:+8.2f}  {labels[r['outcome']]:>{w_win}}"
        )

        if r["score_a"] > 0 or r["score_b"] > 0:
            sum_a += r["score_a"]
            sum_b += r["score_b"]
            sum_diff += r["diff"]
            counted += 1

    print(sep)

    # Summary
    if counted > 0:
        avg_a = sum_a / counted
        avg_b = sum_b / counted
        avg_diff = sum_diff / counted
        # The 8-space gaps skip over the "gen" columns of the table.
        print(
            f"{'avg':>8}  {avg_a:{w_a}.2f}        {avg_b:{w_b}.2f}        {avg_diff:+8.2f}"
        )
        print(
            f"{'total':>8}  {sum_a:{w_a}.2f}        {sum_b:{w_b}.2f}        {sum_diff:+8.2f}"
        )

    print(f"\n  Wins: {va}={tally['a']}, {vb}={tally['b']}, ties={tally['tie']}")

    if counted > 0:
        print(
            f"  Average score: {va}={avg_a:.2f}, {vb}={avg_b:.2f} (diff={avg_diff:+.2f})"
        )

    # CSV export
    if args.csv:
        csv_path = Path(args.csv)
        _export_csv(csv_path, rows, labels)
        print(f"\n  CSV exported to {csv_path}")

    print()


# Run the comparison only when executed as a script, not when imported.
if __name__ == "__main__":
    main()