File size: 10,905 Bytes
2facf1f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
#!/usr/bin/env python3
"""Compare two experiment runs across multiple problems.

Usage:
    # New layout: two run directories, each containing p0/, p1/, ...
    python tasks/frontier_cs_entry/compare_experiments.py \
        results/frontier_cs_algorithmic/batch_g50_20260327_120000 \
        results/frontier_cs_algorithmic/agent_g50_20260327_130000

    # Legacy layout: shared results root, variant B name, variant A via --variant-a
    python tasks/frontier_cs_entry/compare_experiments.py \
        results/frontier_cs_algorithmic agent --legacy --variant-a batch

    # Options work with both layouts
    python tasks/frontier_cs_entry/compare_experiments.py dir_a dir_b --problems p0 p1 p2
    python tasks/frontier_cs_entry/compare_experiments.py dir_a dir_b --csv results/comparison.csv
"""

import argparse
import json
import os
import re
import sqlite3
import sys
from collections import defaultdict
from pathlib import Path


def find_experiment_dirs_new(run_dir: str) -> dict[str, Path]:
    """Find problem directories in a run directory (new layout: run_dir/p0/, p1/, ...).

    Only immediate subdirectories whose entire name is ``p`` followed by
    digits are returned; regular files and other directories are ignored.

    Args:
        run_dir: Path to the run directory to scan.

    Returns:
        Dict mapping problem_id (e.g. ``"p0"``) -> directory path, inserted
        in sorted path order.

    Exits:
        Prints an error to stderr and calls ``sys.exit(1)`` when ``run_dir``
        does not exist or is not a directory.
    """
    run_path = Path(run_dir)
    if not run_path.exists():
        print(f"Error: {run_dir} does not exist", file=sys.stderr)
        sys.exit(1)
    if not run_path.is_dir():
        # iterdir() on a regular file raises NotADirectoryError; report it
        # the same way as a missing path instead of dumping a traceback.
        print(f"Error: {run_dir} is not a directory", file=sys.stderr)
        sys.exit(1)

    # fullmatch (rather than match + "$") also rejects names with a trailing
    # newline, so the key always equals the directory name.
    return {
        entry.name: entry
        for entry in sorted(run_path.iterdir())
        if entry.is_dir() and re.fullmatch(r"p\d+", entry.name)
    }


def find_experiment_dirs_legacy(results_dir: str, variant: str) -> dict[str, Path]:
    """Find experiment directories in legacy flat layout.

    Pattern: results_dir/p{id}_{variant}_g{gens}_{timestamp}/
    The pattern is anchored at the start of the directory name only, so any
    suffix after the first timestamp digits is accepted.

    Args:
        results_dir: Shared results root containing all experiment directories.
        variant: Variant name to select; matched literally (regex-escaped).

    Returns:
        Dict mapping problem_id -> directory path. When several directories
        match the same problem and variant, the one that sorts last by path
        wins.

    Exits:
        Prints an error to stderr and calls ``sys.exit(1)`` when
        ``results_dir`` does not exist or is not a directory.
    """
    results_path = Path(results_dir)
    if not results_path.exists():
        print(f"Error: {results_dir} does not exist", file=sys.stderr)
        sys.exit(1)
    if not results_path.is_dir():
        # iterdir() on a regular file raises NotADirectoryError; report it
        # the same way as a missing path instead of dumping a traceback.
        print(f"Error: {results_dir} is not a directory", file=sys.stderr)
        sys.exit(1)

    # Compile once; the pattern does not change between directory entries.
    pattern = re.compile(r"(p\d+)_(" + re.escape(variant) + r")_g\d+_\d+")

    problem_dirs = {}
    for entry in sorted(results_path.iterdir()):
        if not entry.is_dir():
            continue
        match = pattern.match(entry.name)
        if match:
            # Later entries in sorted order overwrite earlier ones, so the
            # last-sorted run for a problem+variant is the one kept.
            problem_dirs[match.group(1)] = entry
    return problem_dirs


def get_scores_from_db(db_path: Path) -> "dict | None":
    """Extract score statistics from an evolution_db.sqlite.

    Uses score_bounded from public_metrics when available (to avoid
    unbounded scores >100 inflating comparisons). Falls back to
    combined_score if public_metrics is missing, unparseable, or does not
    hold a numeric score_bounded.

    Args:
        db_path: Path to the SQLite file; its ``programs`` table is read for
            the ``generation``, ``combined_score`` and ``public_metrics``
            columns.

    Returns:
        ``None`` if ``db_path`` does not exist. Otherwise a dict with:
          - ``best_score``: highest score seen (0.0 if no score exceeds 0)
          - ``best_gen``: generation of the first row reaching ``best_score``,
            or ``None`` if no score exceeds 0
          - ``max_gen``: largest generation number (0 for an empty table)
          - ``total_programs``: number of rows read
          - ``milestones``: cumulative best at generations 10/20/30/40/50,
            only for those generations that actually have rows
          - ``trajectory``: list of ``(generation, cumulative_best)`` pairs
        If reading or processing fails, ``{"error": <message>}`` is returned
        instead of raising.
    """
    if not db_path.exists():
        # sqlite3.connect() would otherwise create an empty database file.
        return None

    def _extract_score(combined_score, public_metrics_json):
        """Return bounded score if available and numeric, else combined_score."""
        if public_metrics_json:
            try:
                pm = json.loads(public_metrics_json)
            except (ValueError, TypeError):
                pm = None
            if isinstance(pm, dict):
                bounded = pm.get("score_bounded")
                # A null / non-numeric score_bounded must not leak out: it
                # would break the ">" comparisons below for the whole DB.
                if isinstance(bounded, (int, float)) and not isinstance(bounded, bool):
                    return bounded
        return combined_score or 0.0

    try:
        conn = sqlite3.connect(str(db_path))
        try:
            all_rows = conn.execute(
                "SELECT generation, combined_score, public_metrics FROM programs "
                "ORDER BY generation"
            ).fetchall()
        finally:
            # Close even when the query fails (e.g. missing table).
            conn.close()

        # Single pass: overall best and best score within each generation.
        best_score = 0.0
        best_gen = None
        gen_best = {}
        for gen, cs, pm in all_rows:
            s = _extract_score(cs, pm)
            if s > best_score:
                best_score = s
                best_gen = gen
            if gen not in gen_best or s > gen_best[gen]:
                gen_best[gen] = s

        max_gen = max(gen_best, default=0)
        total_programs = len(all_rows)

        # Cumulative best trajectory over generations in ascending order.
        cum_best = []
        running_max = float("-inf")
        for gen, score in sorted(gen_best.items()):
            if score > running_max:
                running_max = score
            cum_best.append((gen, running_max))

        # Score at generation milestones (only generations present in the DB).
        milestone_gens = (10, 20, 30, 40, 50)
        milestones = {gen: cb for gen, cb in cum_best if gen in milestone_gens}

        return {
            "best_score": best_score or 0.0,
            "best_gen": best_gen,
            "max_gen": max_gen,
            "total_programs": total_programs,
            "milestones": milestones,
            "trajectory": cum_best,
        }
    except Exception as e:
        # Deliberate boundary: callers detect failure by the absence of the
        # "best_score" key rather than by catching exceptions.
        return {"error": str(e)}


def format_score(score):
    """Render a score as a 6-wide, 2-decimal string; None and zero become "  0.00"."""
    if score is not None and score != 0:
        return f"{score:6.2f}"
    return "  0.00"


def _load_stats(dirs, pid, label):
    """Load score stats for one problem, warning on stderr if the DB is unreadable."""
    if pid not in dirs:
        return None
    stats = get_scores_from_db(dirs[pid] / "evolution_db.sqlite")
    if stats and "error" in stats:
        # The score falls back to 0.0 below; say so instead of hiding it.
        print(
            f"Warning: could not read scores for {pid} ({label}): "
            f"{stats['error']}; treating score as 0",
            file=sys.stderr,
        )
    return stats


def _outcome(diff):
    """Classify a score difference (B minus A) as "a", "b" or "tie"."""
    if abs(diff) < 0.01:
        return "tie"
    return "b" if diff > 0 else "a"


def _write_csv(csv_path, rows, va, vb):
    """Write the per-problem comparison rows to csv_path, creating parent dirs."""
    import csv  # local import: only needed when --csv is given

    labels = {"a": va, "b": vb, "tie": "tie"}
    csv_path.parent.mkdir(parents=True, exist_ok=True)
    # csv.writer quotes labels containing commas/quotes; "\n" keeps the
    # same LF line endings the hand-written output used.
    with open(csv_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f, lineterminator="\n")
        writer.writerow(
            [
                "problem",
                f"{va}_score",
                f"{va}_best_gen",
                f"{vb}_score",
                f"{vb}_best_gen",
                "diff",
                "winner",
            ]
        )
        for r in rows:
            writer.writerow(
                [
                    r["pid"],
                    f"{r['score_a']:.4f}",
                    r["gen_a"] if r["gen_a"] is not None else "",
                    f"{r['score_b']:.4f}",
                    r["gen_b"] if r["gen_b"] is not None else "",
                    f"{r['diff']:.4f}",
                    labels[r["outcome"]],
                ]
            )


def main():
    """Command-line entry point: compare two runs and print a per-problem table.

    Parses arguments, locates the problem directories for both sides (new or
    legacy layout), reads each problem's ``evolution_db.sqlite``, prints a
    comparison table with averages/totals and win counts, and optionally
    exports the rows to CSV. Exits with status 1 when no problems match.
    """
    parser = argparse.ArgumentParser(
        description="Compare two experiment runs across problems"
    )
    parser.add_argument("dir_a", help="First run directory (or legacy results_dir)")
    parser.add_argument("dir_b", help="Second run directory (or legacy variant_b name)")
    parser.add_argument(
        "--legacy", action="store_true",
        help="Legacy mode: dir_a is shared results root, dir_b is variant_b name. "
             "Variant A is set with --variant-a (default: batch).",
    )
    parser.add_argument("--variant-a", type=str, default=None, help="Legacy: variant_a name")
    parser.add_argument(
        "--problems", nargs="*", help="Only compare these problem IDs (e.g., p0 p1)"
    )
    parser.add_argument("--csv", help="Export results to CSV file")
    parser.add_argument(
        "--sort",
        choices=["problem", "diff", "score_a", "score_b"],
        default="problem",
        help="Sort order for output table",
    )
    args = parser.parse_args()

    # Resolve display labels and experiment directories once for both layouts.
    if args.legacy:
        va = args.variant_a or "batch"
        vb = args.dir_b
        dirs_a = find_experiment_dirs_legacy(args.dir_a, va)
        dirs_b = find_experiment_dirs_legacy(args.dir_a, vb)
    else:
        va = Path(args.dir_a).name
        vb = Path(args.dir_b).name
        dirs_a = find_experiment_dirs_new(args.dir_a)
        dirs_b = find_experiment_dirs_new(args.dir_b)

    all_problems = sorted(
        set(dirs_a) | set(dirs_b),
        key=lambda pid: int(pid[1:]),  # Sort by numeric ID
    )

    if args.problems:
        wanted = set(args.problems)
        all_problems = [p for p in all_problems if p in wanted]

    if not all_problems:
        print("No matching problems found.", file=sys.stderr)
        sys.exit(1)

    # Collect results; a missing or unreadable DB counts as a score of 0.0.
    rows = []
    for pid in all_problems:
        stats_a = _load_stats(dirs_a, pid, va)
        stats_b = _load_stats(dirs_b, pid, vb)

        score_a = stats_a["best_score"] if stats_a and "best_score" in stats_a else 0.0
        score_b = stats_b["best_score"] if stats_b and "best_score" in stats_b else 0.0
        diff = score_b - score_a

        rows.append(
            {
                "pid": pid,
                "score_a": score_a,
                "score_b": score_b,
                "diff": diff,
                "gen_a": stats_a.get("best_gen") if stats_a else None,
                "gen_b": stats_b.get("best_gen") if stats_b else None,
                "stats_a": stats_a,
                "stats_b": stats_b,
                "outcome": _outcome(diff),
            }
        )

    # Sort; the non-default --sort choices are row keys, ordered descending.
    if args.sort != "problem":
        rows.sort(key=lambda r: r[args.sort], reverse=True)

    # Print table
    header = f"{'Problem':>8}  {va:>10}  {'gen':>4}  {vb:>10}  {'gen':>4}  {'diff':>8}  {'winner':>8}"
    sep = "-" * len(header)
    print(f"\n  Comparison: {va} vs {vb}")
    print(f"  Dir A: {args.dir_a}")
    print(f"  Dir B: {args.dir_b}")
    print(f"  Problems: {len(rows)}\n")
    print(header)
    print(sep)

    labels = {"a": va, "b": vb, "tie": "tie"}
    tally = {"a": 0, "b": 0, "tie": 0}
    sum_a, sum_b, sum_diff = 0.0, 0.0, 0.0
    count_scored = 0  # problems where at least one side has a positive score

    for r in rows:
        gen_a_str = f"{r['gen_a']:4d}" if r["gen_a"] is not None else "   -"
        gen_b_str = f"{r['gen_b']:4d}" if r["gen_b"] is not None else "   -"

        tally[r["outcome"]] += 1
        winner = labels[r["outcome"]]

        diff_str = f"{r['diff']:+8.2f}"
        print(
            f"{r['pid']:>8}  {format_score(r['score_a']):>10}  {gen_a_str}  "
            f"{format_score(r['score_b']):>10}  {gen_b_str}  {diff_str}  {winner:>8}"
        )

        if r["score_a"] > 0 or r["score_b"] > 0:
            sum_a += r["score_a"]
            sum_b += r["score_b"]
            sum_diff += r["diff"]
            count_scored += 1

    print(sep)

    # Summary (averages exclude problems where both sides scored 0)
    if count_scored > 0:
        avg_a = sum_a / count_scored
        avg_b = sum_b / count_scored
        avg_diff = sum_diff / count_scored
        print(
            f"{'avg':>8}  {avg_a:10.2f}        {avg_b:10.2f}        {avg_diff:+8.2f}"
        )
        print(
            f"{'total':>8}  {sum_a:10.2f}        {sum_b:10.2f}        {sum_diff:+8.2f}"
        )

    print(f"\n  Wins: {va}={tally['a']}, {vb}={tally['b']}, ties={tally['tie']}")

    if count_scored > 0:
        print(f"  Average score: {va}={sum_a/count_scored:.2f}, {vb}={sum_b/count_scored:.2f} (diff={sum_diff/count_scored:+.2f})")

    # CSV export
    if args.csv:
        csv_path = Path(args.csv)
        _write_csv(csv_path, rows, va, vb)
        print(f"\n  CSV exported to {csv_path}")

    print()


# Run the comparison CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()