Cross-stage experiments: code, paper figures (solve/exact/precision/recall)

- _experiments/cross_stage/: per-cell prediction sweep across (method, train_stage, prompt_stage)
with analyze scripts that compute containment, difficulty-stratified solve
rate, failure-mode taxonomy, and a 3x3 cross-prompt grid.
- _runs/_paper_figures/: updated solve/exact and new precision/recall plots
with No-CoT-No-Curriculum horizontal baseline.

Co-authored-by: Cursor <cursoragent@cursor.com>

Files changed (21) hide show

.gitignore +11 -0
_experiments/cross_stage/_peek.py +49 -0
_experiments/cross_stage/analyze.py +236 -0
_experiments/cross_stage/analyze_cross_prompt.py +303 -0
_experiments/cross_stage/analyze_v2.py +425 -0
_experiments/cross_stage/overnight_pipeline.sh +58 -0
_experiments/cross_stage/predict_one.py +211 -0
_experiments/cross_stage/run_all.sh +58 -0
_experiments/cross_stage/run_cross_prompt.sh +66 -0
_experiments/cross_stage/run_cross_prompt_phase2.sh +93 -0
_experiments/cross_stage/run_nocurr_cot.sh +79 -0
_experiments/cross_stage/watcher_launch_more.sh +32 -0
_runs/_paper_figures/plot_stage_progression.py +80 -20
_runs/_paper_figures/stage_progression_exact.pdf +2 -2
_runs/_paper_figures/stage_progression_exact.png +2 -2
_runs/_paper_figures/stage_progression_precision.pdf +3 -0
_runs/_paper_figures/stage_progression_precision.png +3 -0
_runs/_paper_figures/stage_progression_recall.pdf +3 -0
_runs/_paper_figures/stage_progression_recall.png +3 -0
_runs/_paper_figures/stage_progression_solve.pdf +2 -2
_runs/_paper_figures/stage_progression_solve.png +2 -2

.gitignore CHANGED Viewed

@@ -30,4 +30,15 @@ _pushlogs/
 _runs/strawman_warm_*/
 _runs/adaptive_k_resume_*/
 _runs/launch_finish_repos23_*_pids.txt
 curriculum_cot/

 _runs/strawman_warm_*/
 _runs/adaptive_k_resume_*/
 _runs/launch_finish_repos23_*_pids.txt
+_runs/nocurr_cot_*/
 curriculum_cot/
+# Cross-stage prediction dumps (kept on the workspace, not in code repo)
+_experiments/cross_stage/preds/
+_experiments/cross_stage/preds_xprompt/
+_experiments/cross_stage/logs/
+_experiments/cross_stage/logs_xprompt/
+_experiments/cross_stage/figs/
+_experiments/cross_stage/figs_xprompt/
+_experiments/cross_stage/*.log

_experiments/cross_stage/_peek.py ADDED Viewed

	@@ -0,0 +1,49 @@

+"""Quick peek at completed cross-prompt outputs."""
+import json
+from pathlib import Path
+# Prediction dumps live next to this script (preds/ and preds_xprompt/).
+# Resolve them relative to the script instead of a hard-coded home directory
+# so the peek works from any checkout location.
+_HERE = Path(__file__).resolve().parent
+DIAG = _HERE / "preds"
+XP   = _HERE / "preds_xprompt"
+def load(p):
+    """Parse the JSONL file at ``p``; blank lines are ignored.
+    Returns the decoded records as a list, in file order.
+    """
+    with open(p) as fh:
+        stripped = (raw.strip() for raw in fh)
+        return [json.loads(text) for text in stripped if text]
+def summarize(tag, recs, target_key):
+    """Print a one-line summary of ``recs`` scored against ``target_key``.
+    Only records with a truthy ``parse_ok`` are counted. Reports the number of
+    counted records, the exact-set-match rate, the rate of non-empty
+    predictions contained in a non-empty target, and the mean prediction size.
+    Prints "<tag>: no data" when ``recs`` is empty.
+    """
+    if not recs:
+        print(f"{tag}: no data")
+        return
+    pairs = [
+        (tuple(sorted(r["predicted_values"])), tuple(sorted(r.get(target_key, []))))
+        for r in recs
+        if r.get("parse_ok")
+    ]
+    n = len(pairs)
+    em = sum(1 for pred, targ in pairs if pred == targ)
+    subset = sum(1 for pred, targ in pairs if pred and targ and set(pred).issubset(set(targ)))
+    size_sum = sum(len(pred) for pred, _ in pairs)
+    print(f"{tag:32s}  n={n:4d}  exact={em/max(1,n):.3f}  subset={subset/max(1,n):.3f}  avg|p|={size_sum/max(1,n):.2f}")
+# Diagonal cells: each model evaluated on the stage it was trained for.
+# Missing files are skipped without a message.
+print("=== Diagonal (already had) ===")
+for method in ("atc", "dc"):
+    for stage in (1, 2, 3):
+        diag_tag = f"{method}_s{stage}"
+        diag_path = DIAG / f"{diag_tag}.jsonl"
+        if diag_path.exists():
+            summarize(diag_tag, load(diag_path), f"target_S{stage}")
+print()
+# Off-diagonal cells: model trained on one stage, prompted with another.
+print("=== Off-diagonal cross-prompt ===")
+for xp_tag in ("atc_train3_prompt1", "atc_train3_prompt2", "atc_train2_prompt3",
+               "dc_train3_prompt1", "dc_train3_prompt2"):
+    xp_path = XP / f"{xp_tag}.jsonl"
+    if xp_path.exists() and xp_path.stat().st_size:
+        # the prompt stage is the integer after "prompt" in the tag
+        prompt_stage = int(xp_tag.split("prompt")[1])
+        summarize(xp_tag + f" [eval vs S{prompt_stage}]", load(xp_path), f"target_S{prompt_stage}")
+    else:
+        print(f"{xp_tag}: (missing)")

_experiments/cross_stage/analyze.py ADDED Viewed

	@@ -0,0 +1,236 @@

+"""Aggregate per-cell predictions across (method, stage) and produce
+containment / catastrophic-rewrite plots + per-cell trajectory visualizations.
+Expects six JSONL files in --preds_dir:  atc_s{1,2,3}.jsonl, dc_s{1,2,3}.jsonl
+(each produced by predict_one.py).
+Produces:
+  - containment_summary.json (numeric report)
+  - fig_containment.{pdf,png}   (3 grouped bars per method)
+  - fig_sankey_example.{pdf,png} (per-cell S1/S2/S3 predictions for 1 puzzle, one panel per method)
+"""
+from __future__ import annotations
+import argparse
+import json
+from collections import defaultdict
+from pathlib import Path
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+METHODS = ["atc", "dc"]
+STAGES = [1, 2, 3]
+def load_preds(preds_dir: Path):
+    """Load every ``{method}_s{stage}.jsonl`` file found under ``preds_dir``.
+    Returns dict[(method, stage)] -> dict[(puzzle_id, target_cell)] -> record.
+    A missing file yields an empty inner dict and a WARN line on stdout.
+    """
+    table = {}
+    for method, stage in [(m, s) for m in METHODS for s in STAGES]:
+        tag = f"{method}_s{stage}"
+        path = preds_dir / f"{tag}.jsonl"
+        cells = {}
+        table[(method, stage)] = cells
+        if not path.exists():
+            print(f"WARN missing {path}")
+            continue
+        with open(path) as fh:
+            for raw in fh:
+                raw = raw.strip()
+                if raw:
+                    rec = json.loads(raw)
+                    cells[(int(rec["puzzle_id"]), tuple(rec["target_cell"]))] = rec
+        print(f"loaded {tag}: {len(cells)} cells")
+    return table
+def cells_common(preds):
+    """Return the sorted list of cell keys shared by all non-empty prediction files.
+    ``preds`` maps (method, stage) to a dict keyed by (puzzle_id, target_cell).
+    Entries whose dict is empty (e.g. a missing file) are left out of the
+    intersection. Always returns a list, empty when nothing was loaded, so
+    callers get the same type on every path.
+    """
+    key_sets = [set(preds[(m, s)]) for m in METHODS for s in STAGES if preds[(m, s)]]
+    if not key_sets:
+        return []
+    return sorted(set.intersection(*key_sets))
+def containment(pred_set, ref_set):
+    """Return 1 when ``pred_set`` is a non-empty subset of a non-empty ``ref_set``.
+    Any empty argument gives 0.
+    """
+    if pred_set and ref_set:
+        return int(set(pred_set) <= set(ref_set))
+    return 0
+def disjoint(a, b):
+    """Return 1 when ``a`` and ``b`` are both non-empty and share no element, else 0."""
+    if not a or not b:
+        return 0
+    return int(set(a).isdisjoint(set(b)))
+def compute_metrics(preds, common_cells):
+    """Aggregate per-cell containment statistics for each method.
+    ``preds`` maps (method, stage) to a dict keyed by cell; ``common_cells`` is
+    the iterable of cell keys to score. A cell is counted only when it has a
+    record at all three stages for the method and every record has a truthy
+    ``parse_ok``. Returns dict[method] -> dict of fractions over counted cells
+    (all 0 when nothing was counted) plus the count ``n``.
+    """
+    out = {}
+    for m in METHODS:
+        stage_recs = [preds.get((m, s), {}) for s in (1, 2, 3)]
+        n = 0
+        c13 = c23 = c12 = 0
+        rew_3_disjoint_1 = 0
+        rew_3_disjoint_2 = 0
+        size_s1 = size_s2 = size_s3 = 0
+        for key in common_cells:
+            recs = [d.get(key) for d in stage_recs]
+            # cells_common() leaves empty (missing) files out of the
+            # intersection, so a key may be absent for this method; skip it
+            # instead of raising KeyError.
+            if any(r is None for r in recs):
+                continue
+            # Check parse_ok before touching predicted_values so an
+            # unparsed record is never read for its prediction.
+            if not all(r.get("parse_ok") for r in recs):
+                continue
+            p1, p2, p3 = (r["predicted_values"] for r in recs)
+            n += 1
+            c13 += containment(p3, p1)
+            c23 += containment(p3, p2)
+            c12 += containment(p2, p1)
+            rew_3_disjoint_1 += disjoint(p3, p1)
+            rew_3_disjoint_2 += disjoint(p3, p2)
+            size_s1 += len(p1)
+            size_s2 += len(p2)
+            size_s3 += len(p3)
+        denom = max(1, n)  # avoid division by zero when no cell was counted
+        out[m] = {
+            "n": n,
+            "containment_S3_in_S1": c13 / denom,
+            "containment_S3_in_S2": c23 / denom,
+            "containment_S2_in_S1": c12 / denom,
+            "catastrophic_S3_disjoint_S1": rew_3_disjoint_1 / denom,
+            "catastrophic_S3_disjoint_S2": rew_3_disjoint_2 / denom,
+            "avg_predicted_size_S1": size_s1 / denom,
+            "avg_predicted_size_S2": size_s2 / denom,
+            "avg_predicted_size_S3": size_s3 / denom,
+        }
+    return out
+# ---------- plotting ----------------------------------------------------
+# Global matplotlib style, applied once at import time to every figure
+# this module draws.
+mpl.rcParams.update({
+    "font.family": "serif",
+    "font.serif": ["DejaVu Serif", "Times New Roman", "Times", "Liberation Serif"],
+    "font.size": 12,
+    "axes.labelsize": 12,
+    "xtick.labelsize": 11,
+    "ytick.labelsize": 11,
+    "legend.fontsize": 11,
+    "axes.spines.top": False,
+    "axes.spines.right": False,
+    "axes.linewidth": 1.0,
+    "lines.linewidth": 2.0,
+    # fonttype 42 embeds fonts as TrueType (Type 42) in PDF/PS output
+    "pdf.fonttype": 42,
+    "ps.fonttype": 42,
+})
+# Per-method colours used for bars and their value labels.
+ATC_COLOR = "#1f4f8b"
+DC_COLOR = "#b21e2f"
+def plot_containment(metrics, out_path):
+    """Draw the grouped containment bar chart and save it as PDF and PNG.
+    ``metrics`` must hold "atc" and "dc" entries with the keys listed in
+    ``groups`` below. ``out_path`` is a Path; its suffix is replaced by
+    ".pdf" and ".png" for the two output files.
+    """
+    fig, ax = plt.subplots(figsize=(5.2, 3.6), constrained_layout=True)
+    # (x tick label, metrics key) for each bar group
+    groups = [
+        ("$\\hat S_3 \\subseteq \\hat S_1$", "containment_S3_in_S1"),
+        ("$\\hat S_3 \\subseteq \\hat S_2$", "containment_S3_in_S2"),
+        ("$\\hat S_3 \\cap \\hat S_1 = \\varnothing$", "catastrophic_S3_disjoint_S1"),
+    ]
+    x = list(range(len(groups)))
+    w = 0.36
+    atc_vals = [metrics["atc"][k] for _, k in groups]
+    dc_vals = [metrics["dc"][k] for _, k in groups]
+    # ATC bar sits left of each tick, Data Curriculum bar right of it
+    ax.bar([xi - w/2 for xi in x], atc_vals, w, color=ATC_COLOR, label="ATC", edgecolor="none")
+    ax.bar([xi + w/2 for xi in x], dc_vals,  w, color=DC_COLOR,  label="Data Curriculum", edgecolor="none")
+    # numeric value just above each bar
+    for xi, v in zip(x, atc_vals):
+        ax.text(xi - w/2, v + 0.015, f"{v:.2f}", ha="center", va="bottom", fontsize=10, color=ATC_COLOR)
+    for xi, v in zip(x, dc_vals):
+        ax.text(xi + w/2, v + 0.015, f"{v:.2f}", ha="center", va="bottom", fontsize=10, color=DC_COLOR)
+    ax.set_xticks(x, [lbl for lbl, _ in groups])
+    ax.set_ylim(0, 1.05)
+    ax.set_ylabel("Fraction of cells")
+    ax.legend(frameon=False, loc="upper right")
+    fig.savefig(out_path.with_suffix(".pdf"), bbox_inches="tight")
+    fig.savefig(out_path.with_suffix(".png"), dpi=300, bbox_inches="tight")
+    plt.close(fig)
+    print(f"saved {out_path}.pdf / .png")
+def plot_sankey_grid(preds, out_path, puzzle_id=0):
+    """Render the per-cell predicted value sets across stages for one puzzle.
+    Draws one panel per method (ATC, Data Curriculum). Every stage-3 record
+    whose puzzle id equals ``puzzle_id`` becomes one row, labelled with its
+    cell coordinates plus one; the three columns show the predicted values at
+    S1 / S2 / S3 ("—" when the prediction is empty or absent). A row is shaded
+    grey when the S3 prediction is a non-empty subset of a non-empty S1
+    prediction and light red otherwise. Saves ``out_path`` as PDF and PNG.
+    """
+    fig, axes = plt.subplots(1, 2, figsize=(9, 4.5), constrained_layout=True)
+    for ax, method, title in zip(axes, ["atc", "dc"], ["ATC (latent + curriculum)", "Data Curriculum (no CoT)"]):
+        # Collect (cell, S1 pred, S2 pred, S3 pred, target_solution) per row;
+        # the stage-3 file decides which cells appear.
+        cells = []
+        for key, r3 in sorted(preds[(method, 3)].items()):
+            if key[0] != puzzle_id:
+                continue
+            p1 = preds[(method, 1)].get(key, {}).get("predicted_values") or []
+            p2 = preds[(method, 2)].get(key, {}).get("predicted_values") or []
+            p3 = r3.get("predicted_values") or []
+            cells.append((key[1], p1, p2, p3, r3.get("target_solution")))
+        n = len(cells)
+        if n == 0:
+            ax.text(0.5, 0.5, "(no data)", transform=ax.transAxes, ha="center")
+            ax.set_title(title)
+            continue
+        ax.set_xlim(0, 3)
+        ax.set_ylim(-0.5, n - 0.5)
+        # gt (target_solution) is unpacked but not drawn
+        for i, (cell_rc, p1, p2, p3, gt) in enumerate(cells):
+            r, c = cell_rc
+            # rows run top-to-bottom: row i is drawn at y = n - 1 - i
+            # label shows r+1, c+1 — presumably target_cell is 0-based; TODO confirm
+            ax.text(-0.4, n - 1 - i, f"({r+1},{c+1})", va="center", ha="right", fontsize=8, color="0.4")
+            for j, vals, x_center in [(0, p1, 0.5), (1, p2, 1.5), (2, p3, 2.5)]:
+                txt = ",".join(str(v) for v in vals) if vals else "—"
+                ax.text(x_center, n - 1 - i, txt, va="center", ha="center", fontsize=9)
+            # grey band when S3 stayed inside S1, light red otherwise
+            in_p1 = bool(p3 and set(p3).issubset(set(p1))) if p1 else False
+            color = "0.85" if in_p1 else "#f5b7b1"
+            ax.axhspan(n - 1 - i - 0.5, n - 1 - i + 0.5, facecolor=color, alpha=0.4, zorder=0)
+        ax.set_xticks([0.5, 1.5, 2.5], ["S1", "S2", "S3"])
+        ax.set_yticks([])
+        ax.set_title(title, fontsize=11)
+        ax.spines["left"].set_visible(False)
+    fig.savefig(out_path.with_suffix(".pdf"), bbox_inches="tight")
+    fig.savefig(out_path.with_suffix(".png"), dpi=300, bbox_inches="tight")
+    plt.close(fig)
+    print(f"saved {out_path}.pdf / .png")
+def main():
+    """CLI entry point: load predictions, write the JSON summary, draw both figures."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--preds_dir", required=True)
+    parser.add_argument("--out_dir", required=True)
+    parser.add_argument("--example_puzzle", type=int, default=0)
+    args = parser.parse_args()
+    out_dir = Path(args.out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    preds = load_preds(Path(args.preds_dir))
+    common = cells_common(preds)
+    print(f"common cells across all 6 files: {len(common)}")
+    metrics = compute_metrics(preds, common)
+    summary_path = out_dir / "containment_summary.json"
+    with open(summary_path, "w") as fh:
+        json.dump({"n_common_cells": len(common), "metrics": metrics}, fh, indent=2)
+    print(json.dumps(metrics, indent=2))
+    plot_containment(metrics, out_dir / "fig_containment")
+    plot_sankey_grid(preds, out_dir / "fig_sankey_example", puzzle_id=args.example_puzzle)
+if __name__ == "__main__":
+    main()

_experiments/cross_stage/analyze_cross_prompt.py ADDED Viewed

	@@ -0,0 +1,303 @@

+"""Analyse cross-prompt evaluations.
+For each (method, train_stage, prompt_stage) pair we have a JSONL of per-cell
+records produced by predict_one.py. The diagonals (train_stage == prompt_stage)
+live in ../preds/ ; the off-diagonals (this experiment) live in ../preds_xprompt/.
+Headline question: does each model still do the OFF-DIAGONAL task correctly?
+We measure:
+  - exact_set_match_vs_target  : predicted_values == target_S{prompt_stage}
+  - subset_of_target           : predicted_values ⊆ target_S{prompt_stage}
+  - avg |predicted set|
+  - "drift" : exact_set_match vs predicted of the model's ORIGINAL training
+                stage (i.e. does ATC_S3 prompted with S1 still produce its
+                own S3 answer? -> indicates that the prompt was ignored)
+Plots:
+  fig_xprompt_solve_grid.{pdf,png}  - 2-method × 3 prompt_stage × 3 train_stage
+                                      heatmap of exact-set-match accuracy
+  fig_xprompt_setsize.{pdf,png}     - avg |pred| for each (train, prompt) cell
+                                      grouped by method
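+  fig_xprompt_forward_compat.{pdf,png} - "forward compat": for the S3 adapter
+                                      prompted with stage_i in {1,2,3}, plot
+                                      exact-match accuracy.  Both methods.
+  fig_xprompt_prompt_response.{pdf,png} - for each off-diagonal (train, prompt)
+                                      pair, fraction of cells whose prediction
+                                      equals the same model's train-stage
+                                      prediction (the "drift" metric above).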
+"""
+from __future__ import annotations
+import argparse
+import json
+from collections import defaultdict
+from pathlib import Path
+import numpy as np
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+# Methods and curriculum stages covered by the train × prompt grid.
+METHODS = ["atc", "dc"]
+STAGES = [1, 2, 3]
+# Per-method plot colours and display names.
+ATC_COLOR = "#1f4f8b"; DC_COLOR = "#b21e2f"
+COLOR = {"atc": ATC_COLOR, "dc": DC_COLOR}
+PRETTY = {"atc": "ATC", "dc": "Data Curriculum"}
+# Global matplotlib style, applied once at import time.
+mpl.rcParams.update({
+    "font.family": "serif",
+    "font.serif": ["DejaVu Serif", "Times New Roman", "Times", "Liberation Serif"],
+    "font.size": 12,
+    "axes.labelsize": 12,
+    "xtick.labelsize": 11,
+    "ytick.labelsize": 11,
+    "legend.fontsize": 10,
+    "axes.spines.top": False,
+    "axes.spines.right": False,
+    "axes.linewidth": 1.0,
+    "lines.linewidth": 2.0,
+    "lines.markersize": 7,
+    # fonttype 42 embeds fonts as TrueType (Type 42) in PDF/PS output
+    "pdf.fonttype": 42,
+    "ps.fonttype": 42,
+})
+def parse_tag(tag: str):
+    """
+    Split a prediction-file stem into (method, train_stage, prompt_stage).
+    Diagonal:   atc_s3                       -> ("atc", 3, 3)
+    Off-diag:   atc_train3_prompt1           -> ("atc", 3, 1)
+    Raises ValueError / IndexError when the tag fits neither pattern.
+    """
+    pieces = tag.split("_")
+    method = pieces[0]
+    is_diagonal = len(pieces) == 2 and pieces[1].startswith("s")
+    if is_diagonal:
+        stage = int(pieces[1][1:])
+        return method, stage, stage
+    # otherwise the tag must carry a trainK and a promptM token
+    train_tokens = [tok for tok in pieces if tok.startswith("train")]
+    train = int(train_tokens[0][5:])
+    prompt_tokens = [tok for tok in pieces if tok.startswith("prompt")]
+    prompt = int(prompt_tokens[0][6:])
+    return method, train, prompt
+def load_dir(p: Path):
+    """Load every ``*.jsonl`` file in ``p`` whose stem parse_tag() understands.
+    Returns dict[(method, train, prompt)] -> dict[(puzzle_id, target_cell)] -> record.
+    Files with an unrecognised stem are skipped with a message on stdout
+    rather than dropped silently.
+    """
+    by_key = {}  # (method, train, prompt) -> { (puzzle, cell) -> record }
+    for path in sorted(p.glob("*.jsonl")):
+        tag = path.stem
+        try:
+            m, t, q = parse_tag(tag)
+        except (ValueError, IndexError):
+            # parse_tag raises ValueError on a non-numeric stage and
+            # IndexError when the train/prompt token is missing.
+            print(f"skipped {path.name}: unrecognised tag")
+            continue
+        d = {}
+        with open(path) as f:
+            for line in f:
+                line = line.strip()
+                if not line:
+                    continue
+                r = json.loads(line)
+                d[(int(r["puzzle_id"]), tuple(r["target_cell"]))] = r
+        by_key[(m, t, q)] = d
+        print(f"loaded {tag}: {len(d)} cells -> (method={m}, train={t}, prompt={q})")
+    return by_key
+def target_field(rec, q):
+    """Return the record's ``target_S{q}`` values as a tuple (empty if absent)."""
+    field_name = f"target_S{q}"
+    values = rec.get(field_name, [])
+    return tuple(values)
+def aggregate(by_key):
+    """Summarise each (method, train, prompt) cell against its prompt-stage target.
+    Only records with a truthy ``parse_ok`` are counted. Returns one row dict
+    per cell, in the iteration order of ``by_key``.
+    """
+    rows = []
+    for (m, t, q), d in by_key.items():
+        pairs = [
+            (tuple(sorted(rec["predicted_values"])), tuple(sorted(target_field(rec, q))))
+            for rec in d.values()
+            if rec.get("parse_ok")
+        ]
+        n = len(pairs)
+        em = sum(1 for pred, targ in pairs if pred == targ)
+        subset = sum(1 for pred, targ in pairs if pred and targ and set(pred).issubset(set(targ)))
+        size_sum = sum(len(pred) for pred, _ in pairs)
+        denom = max(1, n)
+        rows.append({
+            "method": m, "train": t, "prompt": q, "n": n,
+            "exact_match_vs_prompt_target": em / denom,
+            "subset_of_prompt_target": subset / denom,
+            "avg_pred_size": size_sum / denom,
+        })
+    return rows
+def drift_from_diagonal(by_key):
+    """
+    Compare every off-diagonal cell (train=t, prompt=q, q != t) with the
+    diagonal cell of the same model (train=t, prompt=t): the reported value is
+    the fraction of shared, parsed cells where both predictions are equal.
+    High -> the model ignored the prompt change (anchored to its training stage).
+    Low  -> the model changed behaviour with the prompt.
+    Off-diagonal cells without a non-empty diagonal counterpart are omitted.
+    """
+    out = []
+    for (m, t, q), off_diag in by_key.items():
+        diag = by_key.get((m, t, t)) if q != t else None
+        if not diag:
+            continue
+        compared = 0
+        unchanged = 0
+        for key, rec_q in off_diag.items():
+            rec_t = diag.get(key)
+            if rec_t and rec_q.get("parse_ok") and rec_t.get("parse_ok"):
+                compared += 1
+                if sorted(rec_q["predicted_values"]) == sorted(rec_t["predicted_values"]):
+                    unchanged += 1
+        out.append({"method": m, "train": t, "prompt": q,
+                    "n": compared, "frac_ignored_prompt": unchanged / max(1, compared)})
+    return out
+# ----------------------- PLOTS ----------------------------
+def plot_solve_grid(rows, out_path):
+    """3x3 train×prompt heatmap of exact_match for each method, side by side.
+    ``rows`` is a list of dicts with "method", "train", "prompt" and
+    "exact_match_vs_prompt_target"; cells with no row stay NaN and get no
+    text label. ``out_path`` is a Path saved with ".pdf" and ".png" suffixes.
+    """
+    fig, axes = plt.subplots(1, 2, figsize=(8.2, 3.6), constrained_layout=True)
+    for ax, m in zip(axes, METHODS):
+        # grid[train-1, prompt-1]; NaN marks a cell without data
+        grid = np.full((3, 3), np.nan)
+        for r in rows:
+            if r["method"] != m:
+                continue
+            grid[r["train"] - 1, r["prompt"] - 1] = r["exact_match_vs_prompt_target"]
+        im = ax.imshow(grid, vmin=0.0, vmax=1.0, cmap="Blues")
+        ax.set_xticks([0, 1, 2], ["S1", "S2", "S3"])
+        ax.set_yticks([0, 1, 2], ["S1", "S2", "S3"])
+        ax.set_xlabel("Prompt stage_i")
+        ax.set_ylabel("Trained stage")
+        ax.set_title(PRETTY[m], fontsize=11)
+        for i in range(3):
+            for j in range(3):
+                v = grid[i, j]
+                if not np.isnan(v):
+                    # white text on dark cells, black on light ones
+                    ax.text(j, i, f"{v:.2f}", ha="center", va="center",
+                            color="white" if v > 0.5 else "black", fontsize=10)
+    # One shared colorbar, built from the last panel's image; both panels use
+    # the same vmin/vmax and colormap.
+    cb = fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.85, fraction=0.05)
+    cb.set_label("Exact set match vs prompt target")
+    fig.savefig(out_path.with_suffix(".pdf"), bbox_inches="tight")
+    fig.savefig(out_path.with_suffix(".png"), dpi=300, bbox_inches="tight")
+    plt.close(fig)
+    print(f"saved {out_path}.pdf/.png")
+def plot_forward_compat(rows, out_path):
+    """For each method, S3-adapter prompted with stage_i=1/2/3: exact-match.
+    Uses only rows with train == 3; a (method, prompt) pair without a row is
+    plotted as NaN and gets no value label. Saves PDF and PNG at ``out_path``.
+    """
+    fig, ax = plt.subplots(figsize=(5.0, 3.6), constrained_layout=True)
+    x = [1, 2, 3]
+    for m, marker, ls in [("atc", "s", "-"), ("dc", "o", "--")]:
+        y = []
+        for q in [1, 2, 3]:
+            for r in rows:
+                if r["method"] == m and r["train"] == 3 and r["prompt"] == q:
+                    y.append(r["exact_match_vs_prompt_target"])
+                    break
+            else:
+                # for/else: no matching row was found for this prompt stage
+                y.append(np.nan)
+        ax.plot(x, y, color=COLOR[m], marker=marker, linestyle=ls,
+                label=PRETTY[m])
+        for xi, v in zip(x, y):
+            if not np.isnan(v):
+                ax.text(xi, v + 0.02, f"{v:.2f}", ha="center", va="bottom",
+                        fontsize=9, color=COLOR[m])
+    ax.set_xticks(x, ["Ask S1", "Ask S2", "Ask S3"])
+    ax.set_xlabel("Prompt task")
+    ax.set_ylim(0.0, 1.05)
+    ax.set_ylabel("Exact set-match on prompted task")
+    ax.legend(frameon=False, loc="lower left")
+    fig.savefig(out_path.with_suffix(".pdf"), bbox_inches="tight")
+    fig.savefig(out_path.with_suffix(".png"), dpi=300, bbox_inches="tight")
+    plt.close(fig)
+    print(f"saved {out_path}.pdf/.png")
+def plot_pred_size_grid(rows, out_path):
+    """3x3 train×prompt heatmap of the average predicted set size per method.
+    Reads "avg_pred_size" from ``rows``; cells with no row stay NaN and get no
+    text label. The colour scale is fixed to [1.0, 2.5]. Saves PDF and PNG.
+    """
+    fig, axes = plt.subplots(1, 2, figsize=(8.2, 3.6), constrained_layout=True)
+    for ax, m in zip(axes, METHODS):
+        # grid[train-1, prompt-1]; NaN marks a cell without data
+        grid = np.full((3, 3), np.nan)
+        for r in rows:
+            if r["method"] != m:
+                continue
+            grid[r["train"] - 1, r["prompt"] - 1] = r["avg_pred_size"]
+        im = ax.imshow(grid, vmin=1.0, vmax=2.5, cmap="Oranges")
+        ax.set_xticks([0, 1, 2], ["S1", "S2", "S3"])
+        ax.set_yticks([0, 1, 2], ["S1", "S2", "S3"])
+        ax.set_xlabel("Prompt stage_i")
+        ax.set_ylabel("Trained stage")
+        ax.set_title(PRETTY[m], fontsize=11)
+        for i in range(3):
+            for j in range(3):
+                v = grid[i, j]
+                if not np.isnan(v):
+                    # white text on dark cells, black on light ones
+                    ax.text(j, i, f"{v:.2f}", ha="center", va="center",
+                            color="white" if v > 1.8 else "black", fontsize=10)
+    # One shared colorbar, built from the last panel's image.
+    cb = fig.colorbar(im, ax=axes.ravel().tolist(), shrink=0.85, fraction=0.05)
+    cb.set_label("Avg |predicted candidate set|")
+    fig.savefig(out_path.with_suffix(".pdf"), bbox_inches="tight")
+    fig.savefig(out_path.with_suffix(".png"), dpi=300, bbox_inches="tight")
+    plt.close(fig)
+    print(f"saved {out_path}.pdf/.png")
+def plot_prompt_responsiveness(drift_rows, out_path):
+    """For each off-diagonal cell, plot frac_ignored_prompt (low = good).
+    One bar group per distinct (train, prompt) pair in ``drift_rows``, sorted;
+    a method with no row for a pair contributes NaN and gets no value label.
+    Saves PDF and PNG at ``out_path``.
+    """
+    fig, ax = plt.subplots(figsize=(5.8, 3.6), constrained_layout=True)
+    labels = []
+    atc_vals = []; dc_vals = []
+    pairs = sorted({(r["train"], r["prompt"]) for r in drift_rows})
+    for (t, q) in pairs:
+        labels.append(f"S{t}→S{q}")
+        # first matching row for each method, NaN when there is none
+        atc_vals.append(next((r["frac_ignored_prompt"] for r in drift_rows
+                              if r["method"] == "atc" and r["train"] == t and r["prompt"] == q), np.nan))
+        dc_vals.append(next((r["frac_ignored_prompt"] for r in drift_rows
+                             if r["method"] == "dc" and r["train"] == t and r["prompt"] == q), np.nan))
+    x = list(range(len(labels)))
+    w = 0.36
+    # ATC bar sits left of each tick, Data Curriculum bar right of it
+    ax.bar([xi - w/2 for xi in x], atc_vals, w, color=ATC_COLOR, label="ATC", edgecolor="none")
+    ax.bar([xi + w/2 for xi in x], dc_vals,  w, color=DC_COLOR,  label="Data Curriculum", edgecolor="none")
+    for xi, v in zip(x, atc_vals):
+        if not np.isnan(v):
+            ax.text(xi - w/2, v + 0.01, f"{v:.2f}", ha="center", va="bottom", fontsize=9, color=ATC_COLOR)
+    for xi, v in zip(x, dc_vals):
+        if not np.isnan(v):
+            ax.text(xi + w/2, v + 0.01, f"{v:.2f}", ha="center", va="bottom", fontsize=9, color=DC_COLOR)
+    ax.set_xticks(x, labels)
+    ax.set_ylim(0, 1.05)
+    ax.set_ylabel("Frac cells with prediction ≡ same model's train-stage answer\n(low = model actually responded to new prompt)")
+    ax.legend(frameon=False, loc="upper left")
+    fig.savefig(out_path.with_suffix(".pdf"), bbox_inches="tight")
+    fig.savefig(out_path.with_suffix(".png"), dpi=300, bbox_inches="tight")
+    plt.close(fig)
+    print(f"saved {out_path}.pdf/.png")
+def main():
+    """CLI entry point: merge diagonal and cross-prompt predictions, write the
+    JSON summary, and draw the figures (the drift figure only when drift rows exist)."""
+    parser = argparse.ArgumentParser()
+    for flag in ("--diag_dir", "--xprompt_dir", "--out_dir"):
+        parser.add_argument(flag, required=True)
+    args = parser.parse_args()
+    out_dir = Path(args.out_dir)
+    out_dir.mkdir(parents=True, exist_ok=True)
+    # diagonal files first, then off-diagonal ones on top
+    by_key = load_dir(Path(args.diag_dir))
+    by_key.update(load_dir(Path(args.xprompt_dir)))
+    rows = aggregate(by_key)
+    drift = drift_from_diagonal(by_key)
+    summary = {"rows": rows, "drift": drift}
+    with open(out_dir / "xprompt_summary.json", "w") as fh:
+        json.dump(summary, fh, indent=2)
+    print(json.dumps(summary, indent=2))
+    plot_solve_grid(rows, out_dir / "fig_xprompt_solve_grid")
+    plot_pred_size_grid(rows, out_dir / "fig_xprompt_setsize")
+    plot_forward_compat(rows, out_dir / "fig_xprompt_forward_compat")
+    if drift:
+        plot_prompt_responsiveness(drift, out_dir / "fig_xprompt_prompt_response")
+if __name__ == "__main__":
+    main()

_experiments/cross_stage/analyze_v2.py ADDED Viewed

	@@ -0,0 +1,425 @@

+"""Extended cross-stage containment analyses.
+Reads the 6 JSONL files produced by predict_one.py (one per method-stage cell)
+and emits multiple plots that probe HOW latent CoT propagates constraints
+versus the vanilla data-curriculum baseline.
+Plots produced (PDF + PNG):
+  fig_containment_basic      - 3 grouped bars: S3⊆S1, S3⊆S2, S3∩S1=∅
+  fig_containment_by_diff    - same 3 bars BROKEN DOWN by ground-truth |S1|
+                               (cell difficulty axis = |true legal candidate set|)
+  fig_set_size_trajectory    - avg predicted set size at S1/S2/S3 per method
+  fig_correctness_breakdown  - among incorrect S3 predictions, what fraction
+                               stays inside S1 / S2 vs. is catastrophic?
+  fig_method_agreement       - fraction of cells where ATC.S3 == DC.S3, broken
+                               down by ground-truth difficulty
+  fig_sankey_example         - per-cell value trajectory for one puzzle
+                               (existing in analyze.py, refreshed here)
+"""
+from __future__ import annotations
+import argparse
+import json
+from collections import defaultdict
+from pathlib import Path
+from typing import Dict, List, Tuple
+import numpy as np
+import matplotlib as mpl
+import matplotlib.pyplot as plt
+# Methods and curriculum stages expected among the prediction files.
+METHODS = ["atc", "dc"]
+STAGES = [1, 2, 3]
+# Display names and plot colours per method.
+METHOD_PRETTY = {"atc": "ATC", "dc": "Data Curriculum"}
+ATC_COLOR = "#1f4f8b"
+DC_COLOR = "#b21e2f"
+COLOR = {"atc": ATC_COLOR, "dc": DC_COLOR}
+# Global matplotlib style, applied once at import time.
+mpl.rcParams.update({
+    "font.family": "serif",
+    "font.serif": ["DejaVu Serif", "Times New Roman", "Times", "Liberation Serif"],
+    "font.size": 12,
+    "axes.labelsize": 12,
+    "xtick.labelsize": 11,
+    "ytick.labelsize": 11,
+    "legend.fontsize": 10,
+    "axes.spines.top": False,
+    "axes.spines.right": False,
+    "axes.linewidth": 1.0,
+    "lines.linewidth": 2.0,
+    "lines.markersize": 7,
+    # fonttype 42 embeds fonts as TrueType (Type 42) in PDF/PS output
+    "pdf.fonttype": 42,
+    "ps.fonttype": 42,
+})
+def load_preds(preds_dir: Path):
+    """Load ``{method}_s{stage}.jsonl`` for every method and stage.
+    Returns dict[(method, stage)] -> dict[(puzzle_id, target_cell)] -> record;
+    a missing file maps to an empty dict.
+    """
+    def _read_cells(path):
+        cells = {}
+        if not path.exists():
+            return cells
+        with open(path) as fh:
+            for raw in fh:
+                raw = raw.strip()
+                if raw:
+                    rec = json.loads(raw)
+                    cells[(int(rec["puzzle_id"]), tuple(rec["target_cell"]))] = rec
+        return cells
+    return {(m, s): _read_cells(preds_dir / f"{m}_s{s}.jsonl")
+            for m in METHODS for s in STAGES}
+def cells_common(preds):
+    """Return the sorted cell keys present in every non-empty prediction dict.
+    Empty dicts are left out of the intersection; the result is [] when all
+    of them are empty.
+    """
+    shared = None
+    for m in METHODS:
+        for s in STAGES:
+            cells = preds[(m, s)]
+            if not cells:
+                continue
+            keys = set(cells.keys())
+            shared = keys if shared is None else shared & keys
+    return [] if shared is None else sorted(shared)
+def diff_bucket(target_s1):
+    """Map a cell to its difficulty label from the size of ``target_s1``.
+    Sizes 0 and 1 share the "|S1|=1" label; sizes of 4 or more share "|S1|≥4".
+    """
+    size = len(target_s1)
+    if size >= 4:
+        return "|S1|≥4"
+    labels = {2: "|S1|=2", 3: "|S1|=3"}
+    return labels.get(size, "|S1|=1")
+DIFF_ORDER = ["|S1|=1", "|S1|=2", "|S1|=3", "|S1|≥4"]
+def _safe_div(a, b):
+    """Return ``a / b`` as a float, or 0.0 when ``b`` is falsy (e.g. zero)."""
+    if not b:
+        return 0.0
+    return float(a) / float(b)
+def compute_per_difficulty(preds, common):
+    """For each method × difficulty bucket compute containment metrics.
+    A cell is counted only when it has a record at all three stages for the
+    method and every record has a truthy ``parse_ok``. The bucket comes from
+    the stage-1 record's ``target_S1``. Returns one row per (method, bucket)
+    with fractions over the counted cells (0.0 for an empty bucket).
+    """
+    rows = []
+    for m in METHODS:
+        s1, s2, s3 = (preds.get((m, s), {}) for s in (1, 2, 3))
+        per_bucket = {b: defaultdict(int) for b in DIFF_ORDER}
+        for key in common:
+            r1 = s1.get(key); r2 = s2.get(key); r3 = s3.get(key)
+            # cells_common() leaves empty (missing) files out of the
+            # intersection, so a key may be absent for this method; skip it
+            # instead of raising KeyError.
+            if r1 is None or r2 is None or r3 is None:
+                continue
+            if not (r1["parse_ok"] and r2["parse_ok"] and r3["parse_ok"]):
+                continue
+            b = diff_bucket(r1["target_S1"])
+            p1 = set(r1["predicted_values"]); p2 = set(r2["predicted_values"]); p3 = set(r3["predicted_values"])
+            t = r3.get("target_solution")
+            per_bucket[b]["n"] += 1
+            # containment / disjointness only count when both sets are non-empty
+            per_bucket[b]["c13"] += int(bool(p3) and bool(p1) and p3.issubset(p1))
+            per_bucket[b]["c23"] += int(bool(p3) and bool(p2) and p3.issubset(p2))
+            per_bucket[b]["d13"] += int(bool(p3) and bool(p1) and not (p3 & p1))
+            per_bucket[b]["d23"] += int(bool(p3) and bool(p2) and not (p3 & p2))
+            # correct = S3 prediction is exactly the single solution value
+            per_bucket[b]["correct"] += int(t in p3 and len(p3) == 1)
+            per_bucket[b]["sum_size_s1"] += len(p1)
+            per_bucket[b]["sum_size_s2"] += len(p2)
+            per_bucket[b]["sum_size_s3"] += len(p3)
+        for b in DIFF_ORDER:
+            d = per_bucket[b]
+            n = d["n"]
+            rows.append({
+                "method": m, "bucket": b, "n": n,
+                "c13": _safe_div(d["c13"], n),
+                "c23": _safe_div(d["c23"], n),
+                "d13": _safe_div(d["d13"], n),
+                "d23": _safe_div(d["d23"], n),
+                "correct": _safe_div(d["correct"], n),
+                "size_s1": _safe_div(d["sum_size_s1"], n),
+                "size_s2": _safe_div(d["sum_size_s2"], n),
+                "size_s3": _safe_div(d["sum_size_s3"], n),
+            })
+    return rows
+def compute_correctness_breakdown(preds, common):
+    """When S3 prediction is WRONG, where did it land?
+    A cell is counted only when it has a record at all three stages for the
+    method and every record has a truthy ``parse_ok``. An S3 prediction is
+    correct when it is exactly the single ``target_solution`` value. Returns
+    dict[method] with the correct/wrong counts and, among wrong cells, the
+    fractions contained in or disjoint from the S1 / S2 predictions.
+    """
+    out = {}
+    for m in METHODS:
+        s1, s2, s3 = (preds.get((m, s), {}) for s in (1, 2, 3))
+        n_wrong = 0
+        wrong_in_s1 = 0
+        wrong_in_s2 = 0
+        wrong_disjoint_s1 = 0
+        wrong_disjoint_s2 = 0
+        n_correct = 0
+        for key in common:
+            r1 = s1.get(key); r2 = s2.get(key); r3 = s3.get(key)
+            # cells_common() leaves empty (missing) files out of the
+            # intersection, so a key may be absent for this method; skip it
+            # instead of raising KeyError.
+            if r1 is None or r2 is None or r3 is None:
+                continue
+            if not (r1["parse_ok"] and r2["parse_ok"] and r3["parse_ok"]):
+                continue
+            p1 = set(r1["predicted_values"]); p2 = set(r2["predicted_values"]); p3 = set(r3["predicted_values"])
+            t = r3["target_solution"]
+            cell_correct = (len(p3) == 1 and t in p3)
+            if cell_correct:
+                n_correct += 1
+                continue
+            n_wrong += 1
+            # containment / disjointness only count when both sets are non-empty
+            wrong_in_s1 += int(bool(p3) and bool(p1) and p3.issubset(p1))
+            wrong_in_s2 += int(bool(p3) and bool(p2) and p3.issubset(p2))
+            wrong_disjoint_s1 += int(bool(p3) and bool(p1) and not (p3 & p1))
+            wrong_disjoint_s2 += int(bool(p3) and bool(p2) and not (p3 & p2))
+        out[m] = {
+            "n_correct": n_correct,
+            "n_wrong": n_wrong,
+            "wrong_in_s1_frac": _safe_div(wrong_in_s1, n_wrong),
+            "wrong_in_s2_frac": _safe_div(wrong_in_s2, n_wrong),
+            "wrong_disjoint_s1_frac": _safe_div(wrong_disjoint_s1, n_wrong),
+            "wrong_disjoint_s2_frac": _safe_div(wrong_disjoint_s2, n_wrong),
+        }
+    return out
+def compute_method_agreement(preds, common):
+    """Frequency of ATC.S3 == DC.S3 stratified by ground-truth difficulty.
+    A cell is counted only when both stage-3 records exist and have a truthy
+    ``parse_ok``. The bucket and the solution value are taken from the ATC
+    record. Returns dict[bucket] -> {"n", "agree", "atc_correct", "dc_correct"}
+    as raw counts.
+    """
+    per_bucket = {b: {"n": 0, "agree": 0, "atc_correct": 0, "dc_correct": 0} for b in DIFF_ORDER}
+    atc_s3 = preds.get(("atc", 3), {})
+    dc_s3 = preds.get(("dc", 3), {})
+    for key in common:
+        atc_r = atc_s3.get(key); dc_r = dc_s3.get(key)
+        # cells_common() leaves empty (missing) files out of the intersection,
+        # so either stage-3 record may be absent; skip instead of raising KeyError.
+        if atc_r is None or dc_r is None:
+            continue
+        if not (atc_r["parse_ok"] and dc_r["parse_ok"]):
+            continue
+        ap = sorted(atc_r["predicted_values"]); dp = sorted(dc_r["predicted_values"])
+        b = diff_bucket(atc_r["target_S1"])
+        t = atc_r["target_solution"]
+        per_bucket[b]["n"] += 1
+        per_bucket[b]["agree"] += int(ap == dp)
+        # correct = prediction is exactly the single solution value
+        per_bucket[b]["atc_correct"] += int(len(ap) == 1 and t in ap)
+        per_bucket[b]["dc_correct"] += int(len(dp) == 1 and t in dp)
+    return per_bucket
+# ----------------------------- PLOTS -----------------------------------
+def plot_containment_basic(metrics, out_path):
+    """Re-do the headline bar chart."""
+    fig, ax = plt.subplots(figsize=(5.4, 3.6), constrained_layout=True)
+    groups = [
+        ("$\\hat S_3 \\subseteq \\hat S_1$", "c13"),
+        ("$\\hat S_3 \\subseteq \\hat S_2$", "c23"),
+        ("$\\hat S_3 \\cap \\hat S_1=\\varnothing$", "d13"),
+        ("$\\hat S_3 \\cap \\hat S_2=\\varnothing$", "d23"),
+    ]
+    x = list(range(len(groups)))
+    w = 0.36
+    atc_vals = [metrics["atc"][k] for _, k in groups]
+    dc_vals = [metrics["dc"][k] for _, k in groups]
+    ax.bar([xi - w/2 for xi in x], atc_vals, w, color=ATC_COLOR, label="ATC", edgecolor="none")
+    ax.bar([xi + w/2 for xi in x], dc_vals,  w, color=DC_COLOR,  label="Data Curriculum", edgecolor="none")
+    for xi, v in zip(x, atc_vals):
+        ax.text(xi - w/2, v + 0.015, f"{v:.3f}", ha="center", va="bottom", fontsize=9, color=ATC_COLOR)
+    for xi, v in zip(x, dc_vals):
+        ax.text(xi + w/2, v + 0.015, f"{v:.3f}", ha="center", va="bottom", fontsize=9, color=DC_COLOR)
+    ax.set_xticks(x, [lbl for lbl, _ in groups])
+    ax.set_ylim(0, 1.06)
+    ax.set_ylabel("Fraction of cells")
+    ax.legend(frameon=False, loc="upper right")
+    fig.savefig(out_path.with_suffix(".pdf"), bbox_inches="tight")
+    fig.savefig(out_path.with_suffix(".png"), dpi=300, bbox_inches="tight")
+    plt.close(fig)
+def plot_containment_by_difficulty(rows, key, ylabel, out_path):
+    fig, ax = plt.subplots(figsize=(5.6, 3.6), constrained_layout=True)
+    by_m = {m: {r["bucket"]: r[key] for r in rows if r["method"] == m} for m in METHODS}
+    by_n = {m: {r["bucket"]: r["n"] for r in rows if r["method"] == m} for m in METHODS}
+    x = list(range(len(DIFF_ORDER)))
+    w = 0.36
+    atc_vals = [by_m["atc"].get(b, 0) for b in DIFF_ORDER]
+    dc_vals = [by_m["dc"].get(b, 0) for b in DIFF_ORDER]
+    ax.bar([xi - w/2 for xi in x], atc_vals, w, color=ATC_COLOR, label="ATC", edgecolor="none")
+    ax.bar([xi + w/2 for xi in x], dc_vals,  w, color=DC_COLOR,  label="Data Curriculum", edgecolor="none")
+    for xi, v in zip(x, atc_vals):
+        ax.text(xi - w/2, v + 0.01, f"{v:.2f}", ha="center", va="bottom", fontsize=8, color=ATC_COLOR)
+    for xi, v in zip(x, dc_vals):
+        ax.text(xi + w/2, v + 0.01, f"{v:.2f}", ha="center", va="bottom", fontsize=8, color=DC_COLOR)
+    # n-cells annotation under each group
+    for xi, b in zip(x, DIFF_ORDER):
+        n = by_n["atc"].get(b, 0)
+        ax.text(xi, -0.06, f"n={n}", ha="center", va="top", fontsize=8, color="0.4", transform=ax.get_xaxis_transform())
+    ax.set_xticks(x, DIFF_ORDER)
+    ax.set_ylim(0, 1.05)
+    ax.set_ylabel(ylabel)
+    ax.legend(frameon=False, loc="lower left")
+    fig.savefig(out_path.with_suffix(".pdf"), bbox_inches="tight")
+    fig.savefig(out_path.with_suffix(".png"), dpi=300, bbox_inches="tight")
+    plt.close(fig)
+def plot_set_size_trajectory(rows, out_path):
+    """Avg predicted set size across S1 → S2 → S3, per method."""
+    fig, ax = plt.subplots(figsize=(5.2, 3.6), constrained_layout=True)
+    # average across all buckets weighted by n
+    def avg(method, key):
+        ns = sum(r["n"] for r in rows if r["method"] == method)
+        s = sum(r[key] * r["n"] for r in rows if r["method"] == method)
+        return s / max(1, ns)
+    for m, marker, ls in [("atc", "s", "-"), ("dc", "o", "--")]:
+        y = [avg(m, "size_s1"), avg(m, "size_s2"), avg(m, "size_s3")]
+        ax.plot([1, 2, 3], y, color=COLOR[m], marker=marker, linestyle=ls, label=METHOD_PRETTY[m])
+        for xi, v in zip([1, 2, 3], y):
+            ax.text(xi, v + 0.03, f"{v:.2f}", ha="center", va="bottom", fontsize=9, color=COLOR[m])
+    ax.set_xticks([1, 2, 3], ["Stage 1", "Stage 2", "Stage 3"])
+    ax.set_ylim(0.95, 1.45)
+    ax.set_ylabel("Avg |predicted candidate set|")
+    ax.grid(True, axis="y", linestyle=":", linewidth=0.7, color="0.7", alpha=0.7)
+    ax.legend(frameon=False, loc="upper right")
+    fig.savefig(out_path.with_suffix(".pdf"), bbox_inches="tight")
+    fig.savefig(out_path.with_suffix(".png"), dpi=300, bbox_inches="tight")
+    plt.close(fig)
+def plot_correctness_breakdown(stats, out_path):
+    """Among WRONG S3 cells, what fraction stays in S1 or in S2?"""
+    fig, ax = plt.subplots(figsize=(5.6, 3.6), constrained_layout=True)
+    groups = [
+        ("Wrong but $\\subseteq \\hat S_1$", "wrong_in_s1_frac"),
+        ("Wrong but $\\subseteq \\hat S_2$", "wrong_in_s2_frac"),
+        ("Wrong & $\\cap \\hat S_1=\\varnothing$", "wrong_disjoint_s1_frac"),
+        ("Wrong & $\\cap \\hat S_2=\\varnothing$", "wrong_disjoint_s2_frac"),
+    ]
+    x = list(range(len(groups)))
+    w = 0.36
+    atc_vals = [stats["atc"][k] for _, k in groups]
+    dc_vals = [stats["dc"][k] for _, k in groups]
+    ax.bar([xi - w/2 for xi in x], atc_vals, w, color=ATC_COLOR,
+           label=f"ATC (n_wrong={stats['atc']['n_wrong']})", edgecolor="none")
+    ax.bar([xi + w/2 for xi in x], dc_vals,  w, color=DC_COLOR,
+           label=f"Data Curr. (n_wrong={stats['dc']['n_wrong']})", edgecolor="none")
+    for xi, v in zip(x, atc_vals):
+        ax.text(xi - w/2, v + 0.015, f"{v:.2f}", ha="center", va="bottom", fontsize=9, color=ATC_COLOR)
+    for xi, v in zip(x, dc_vals):
+        ax.text(xi + w/2, v + 0.015, f"{v:.2f}", ha="center", va="bottom", fontsize=9, color=DC_COLOR)
+    ax.set_xticks(x, [lbl for lbl, _ in groups])
+    ax.set_ylim(0, 1.05)
+    ax.set_ylabel("Fraction of wrong S3 cells")
+    ax.legend(frameon=False, loc="upper right")
+    fig.savefig(out_path.with_suffix(".pdf"), bbox_inches="tight")
+    fig.savefig(out_path.with_suffix(".png"), dpi=300, bbox_inches="tight")
+    plt.close(fig)
+def plot_method_agreement(per_bucket, out_path):
+    fig, ax = plt.subplots(figsize=(5.6, 3.6), constrained_layout=True)
+    x = list(range(len(DIFF_ORDER)))
+    w = 0.28
+    agree = [_safe_div(per_bucket[b]["agree"], per_bucket[b]["n"]) for b in DIFF_ORDER]
+    atc_ok = [_safe_div(per_bucket[b]["atc_correct"], per_bucket[b]["n"]) for b in DIFF_ORDER]
+    dc_ok  = [_safe_div(per_bucket[b]["dc_correct"],  per_bucket[b]["n"]) for b in DIFF_ORDER]
+    ax.bar([xi - w for xi in x], atc_ok, w, color=ATC_COLOR, label="ATC correct", edgecolor="none")
+    ax.bar([xi for xi in x],     dc_ok,  w, color=DC_COLOR,  label="DC correct",  edgecolor="none")
+    ax.bar([xi + w for xi in x], agree,  w, color="0.4",      label="ATC == DC",   edgecolor="none")
+    for xi, b in zip(x, DIFF_ORDER):
+        n = per_bucket[b]["n"]
+        ax.text(xi, -0.06, f"n={n}", ha="center", va="top", fontsize=8, color="0.4", transform=ax.get_xaxis_transform())
+    ax.set_xticks(x, DIFF_ORDER)
+    ax.set_ylim(0, 1.05)
+    ax.set_ylabel("Fraction")
+    ax.legend(frameon=False, loc="lower left")
+    fig.savefig(out_path.with_suffix(".pdf"), bbox_inches="tight")
+    fig.savefig(out_path.with_suffix(".png"), dpi=300, bbox_inches="tight")
+    plt.close(fig)
+# Re-use the simple sankey from analyze.py (lightly compacted)
+def plot_sankey(preds, out_path, puzzle_id=0):
+    fig, axes = plt.subplots(1, 2, figsize=(9, 4.6), constrained_layout=True)
+    for ax, method in zip(axes, ["atc", "dc"]):
+        cells = []
+        for key, r3 in sorted(preds[(method, 3)].items()):
+            if key[0] != puzzle_id:
+                continue
+            p1 = preds[(method, 1)].get(key, {}).get("predicted_values") or []
+            p2 = preds[(method, 2)].get(key, {}).get("predicted_values") or []
+            p3 = r3.get("predicted_values") or []
+            cells.append((key[1], p1, p2, p3, r3.get("target_solution")))
+        n = len(cells)
+        ax.set_xlim(0, 3); ax.set_ylim(-0.5, n - 0.5)
+        for i, (cell_rc, p1, p2, p3, gt) in enumerate(cells):
+            r, c = cell_rc
+            ax.text(-0.4, n - 1 - i, f"({r+1},{c+1})", va="center", ha="right", fontsize=8, color="0.4")
+            for x_center, vals in [(0.5, p1), (1.5, p2), (2.5, p3)]:
+                txt = ",".join(str(v) for v in vals) if vals else "—"
+                ax.text(x_center, n - 1 - i, txt, va="center", ha="center", fontsize=9)
+            ok = bool(p3 and p1 and set(p3).issubset(set(p1)))
+            color = "0.88" if ok else "#f5b7b1"
+            ax.axhspan(n - 1 - i - 0.5, n - 1 - i + 0.5, facecolor=color, alpha=0.4, zorder=0)
+        ax.set_xticks([0.5, 1.5, 2.5], ["S1", "S2", "S3"])
+        ax.set_yticks([])
+        ax.set_title(METHOD_PRETTY[method], fontsize=11)
+        ax.spines["left"].set_visible(False)
+    fig.savefig(out_path.with_suffix(".pdf"), bbox_inches="tight")
+    fig.savefig(out_path.with_suffix(".png"), dpi=300, bbox_inches="tight")
+    plt.close(fig)
+# ----------------------------- MAIN ------------------------------------
+def main():
+    p = argparse.ArgumentParser()
+    p.add_argument("--preds_dir", required=True)
+    p.add_argument("--out_dir", required=True)
+    p.add_argument("--example_puzzle", type=int, default=0)
+    args = p.parse_args()
+    preds_dir = Path(args.preds_dir); out = Path(args.out_dir)
+    out.mkdir(parents=True, exist_ok=True)
+    preds = load_preds(preds_dir)
+    common = cells_common(preds)
+    print(f"common cells: {len(common)}")
+    rows = compute_per_difficulty(preds, common)
+    aggregate = {m: {"c13": 0, "c23": 0, "d13": 0, "d23": 0, "n": 0} for m in METHODS}
+    for r in rows:
+        for k in ("c13", "c23", "d13", "d23"):
+            aggregate[r["method"]][k] += r[k] * r["n"]
+        aggregate[r["method"]]["n"] += r["n"]
+    for m in METHODS:
+        n = aggregate[m]["n"]
+        for k in ("c13", "c23", "d13", "d23"):
+            aggregate[m][k] = aggregate[m][k] / max(1, n)
+    correctness = compute_correctness_breakdown(preds, common)
+    agreement = compute_method_agreement(preds, common)
+    summary = {
+        "n_common_cells": len(common),
+        "aggregate": aggregate,
+        "per_difficulty": rows,
+        "correctness_breakdown": correctness,
+        "agreement_by_difficulty": {b: agreement[b] for b in DIFF_ORDER},
+    }
+    with open(out / "containment_summary_v2.json", "w") as f:
+        json.dump(summary, f, indent=2)
+    plot_containment_basic(aggregate, out / "fig_containment_basic")
+    plot_containment_by_difficulty(rows, "c13", "$P(\\hat S_3 \\subseteq \\hat S_1)$",
+                                   out / "fig_c13_by_diff")
+    plot_containment_by_difficulty(rows, "c23", "$P(\\hat S_3 \\subseteq \\hat S_2)$",
+                                   out / "fig_c23_by_diff")
+    plot_containment_by_difficulty(rows, "d23", "$P(\\hat S_3 \\cap \\hat S_2=\\varnothing)$",
+                                   out / "fig_d23_by_diff")
+    plot_containment_by_difficulty(rows, "correct", "Solve rate at S3",
+                                   out / "fig_solve_by_diff")
+    plot_set_size_trajectory(rows, out / "fig_set_size_trajectory")
+    plot_correctness_breakdown(correctness, out / "fig_correctness_breakdown")
+    plot_method_agreement(agreement, out / "fig_method_agreement")
+    plot_sankey(preds, out / "fig_sankey_example", puzzle_id=args.example_puzzle)
+    print(json.dumps(summary["aggregate"], indent=2))
+    print("agreement_by_difficulty:")
+    for b in DIFF_ORDER:
+        d = agreement[b]
+        if d["n"]:
+            print(f"  {b}: n={d['n']} agree={d['agree']/d['n']:.3f} "
+                  f"atc_correct={d['atc_correct']/d['n']:.3f} dc_correct={d['dc_correct']/d['n']:.3f}")
+    print("correctness_breakdown:")
+    print(json.dumps(correctness, indent=2))
+if __name__ == "__main__":
+    main()

_experiments/cross_stage/overnight_pipeline.sh ADDED Viewed

	@@ -0,0 +1,58 @@

+#!/usr/bin/env bash
+# Overnight orchestrator:
+#   1) wait for phase-1 cross-prompt jobs to finish (already launched)
+#   2) launch phase-2 cross-prompt sweep (that script blocks until its jobs exit)
+#   3) report any phase-2 tag whose log lacks the "DONE cells=" marker
+#   4) run analyze_v2.py and analyze_cross_prompt.py to produce all plots
+#   5) print a summary
+#
+# Optional environment:
+#   MAX_WAIT_SECS  give up waiting for phase 1 after this many seconds
+#                  (default 0 = wait forever, the original behaviour).
+set -e
+REPO=/home/ubuntu/curriculum-cot-code
+LOG_DIR=/home/ubuntu/curriculum_cot/_experiments/cross_stage/logs_xprompt
+FIGS_DIR=/home/ubuntu/curriculum_cot/_experiments/cross_stage/figs_xprompt
+PY=/opt/pytorch/bin/python
+MAX_WAIT_SECS=${MAX_WAIT_SECS:-0}
+mkdir -p "$FIGS_DIR"
+PHASE1_TAGS=(atc_train3_prompt1 atc_train3_prompt2 dc_train3_prompt1 dc_train3_prompt2 atc_train2_prompt3)
+PHASE2_TAGS=(atc_train1_prompt2 atc_train1_prompt3 atc_train2_prompt1 dc_train1_prompt2 dc_train1_prompt3 dc_train2_prompt1 dc_train2_prompt3)
+# count_done_tags ARRAY_NAME
+# Print how many tags of the named array have a log containing "DONE cells=".
+count_done_tags() {
+  local -n _cd_tags=$1
+  local tag n=0
+  for tag in "${_cd_tags[@]}"; do
+    if grep -q "DONE cells=" "$LOG_DIR/$tag.log" 2>/dev/null; then
+      n=$((n+1))
+    fi
+  done
+  echo "$n"
+}
+# wait_for_tags ARRAY_NAME
+# Poll every 90 s until every tag of the named array has logged completion.
+# A job that died never writes the marker, so without MAX_WAIT_SECS this
+# loops forever; with it, the function returns 1 (aborting under `set -e`).
+wait_for_tags() {
+  local -n _wf_tags=$1
+  local need=${#_wf_tags[@]}
+  local start=$SECONDS
+  local done_count
+  while true; do
+    done_count=$(count_done_tags "$1")
+    echo "[$(date +%T)] $done_count / $need done"
+    if [ "$done_count" -ge "$need" ]; then
+      break
+    fi
+    if [ "$MAX_WAIT_SECS" -gt 0 ] && [ $((SECONDS - start)) -ge "$MAX_WAIT_SECS" ]; then
+      echo "[$(date +%T)] ERROR: gave up after ${MAX_WAIT_SECS}s with $done_count / $need done" >&2
+      return 1
+    fi
+    sleep 90
+  done
+}
+echo "[$(date +%T)] waiting for phase-1 cross-prompt jobs..."
+wait_for_tags PHASE1_TAGS
+echo "[$(date +%T)] phase 1 complete; launching phase 2"
+bash "$REPO/_experiments/cross_stage/run_cross_prompt_phase2.sh"
+# The phase-2 script has already waited for its jobs, so a single check is
+# enough; this only warns so that the analyses still run on partial results.
+phase2_done=$(count_done_tags PHASE2_TAGS)
+if [ "$phase2_done" -lt "${#PHASE2_TAGS[@]}" ]; then
+  echo "[$(date +%T)] WARNING: only $phase2_done / ${#PHASE2_TAGS[@]} phase-2 tags logged DONE; figures may be incomplete" >&2
+fi
+echo "[$(date +%T)] phase 2 complete; running analyses"
+"$PY" "$REPO/_experiments/cross_stage/analyze_v2.py" \
+  --preds_dir /home/ubuntu/curriculum_cot/_experiments/cross_stage/preds \
+  --out_dir   /home/ubuntu/curriculum_cot/_experiments/cross_stage/figs \
+  --example_puzzle 0
+"$PY" "$REPO/_experiments/cross_stage/analyze_cross_prompt.py" \
+  --diag_dir    /home/ubuntu/curriculum_cot/_experiments/cross_stage/preds \
+  --xprompt_dir /home/ubuntu/curriculum_cot/_experiments/cross_stage/preds_xprompt \
+  --out_dir     "$FIGS_DIR"
+echo "[$(date +%T)] done. Figures in: $FIGS_DIR"
+ls -la "$FIGS_DIR"

_experiments/cross_stage/predict_one.py ADDED Viewed

	@@ -0,0 +1,211 @@

+"""Dump per-cell predictions for one (method, stage) checkpoint on a fixed eval set.
+For each empty cell of each puzzle in the eval JSONL, runs the given adapter and
+writes a JSON line with:
+    method_tag             : free-form id e.g. "atc_s1"
+    puzzle_id              : 0-based row index
+    target_cell            : [r, c] (0-based, matches `ex.target_cell`)
+    target_solution        : the unique true value at this cell
+    stage_prompted         : the stage_i argument passed to the prompt builder
+    predicted_values       : sorted list of ints in [1,9] parsed from model output
+    predicted_text         : the decoded, stripped model completion that was scored
+    parse_ok / exact_set_match : booleans from score_prediction_text
+    target_S1 / S2 / S3    : the stage-1/2/3 consistent value sets for this cell
+                             (computed independently of the model so the
+                             post-processing script can compare across stages)
+For the latent (recurrent-hidden) checkpoints set `--latent_mode recurrent_hidden`
+and `--num_cot_tokens` to whatever value the model was trained at.
+For vanilla baseline checkpoints leave both at their defaults.
+"""
+from __future__ import annotations
+import argparse
+import json
+import os
+import sys
+import time
+from pathlib import Path
+import torch
+from transformers import AutoModelForCausalLM, AutoTokenizer
+REPO = Path(__file__).resolve().parents[2]
+if str(REPO) not in sys.path:
+    sys.path.insert(0, str(REPO))
+from aligned_cell_policy.shared_cell_policy import build_cell_examples_from_row
+from multi_output_cell_policy.prompt_builder import build_multi_output_cell_prompt
+from multi_output_cell_policy.rewards import score_prediction_text
+from multi_output_cell_policy.shared_multi_output_policy import (
+    make_solved_grid_from_row,
+    stage_i_consistent_values,
+)
+def parse_args():
+    p = argparse.ArgumentParser()
+    p.add_argument("--method_tag", required=True)
+    p.add_argument("--adapter_dir", required=True)
+    p.add_argument("--eval_jsonl", required=True)
+    p.add_argument("--eval_rows", type=int, default=100)
+    p.add_argument("--stage_i", type=int, required=True)
+    p.add_argument("--total_empties_hint", type=int, default=20)
+    p.add_argument("--latent_mode", default="none",
+                   choices=["none", "recurrent_hidden", "fixed_slots", "latent_seeds", "residual"])
+    p.add_argument("--num_cot_tokens", type=int, default=0)
+    p.add_argument("--model_name", default="Qwen/Qwen2.5-1.5B-Instruct")
+    p.add_argument("--cache_dir", default=str(REPO / ".hf_cache"))
+    p.add_argument("--gpu_id", type=int, default=0)
+    p.add_argument("--max_completion_length", type=int, default=24)
+    p.add_argument("--out_jsonl", required=True)
+    return p.parse_args()
+def load_jsonl(path: str, limit: int):
+    out = []
+    with open(path) as f:
+        for line in f:
+            line = line.strip()
+            if not line:
+                continue
+            out.append(json.loads(line))
+            if len(out) >= limit:
+                break
+    return out
+def main():
+    args = parse_args()
+    os.makedirs(os.path.dirname(args.out_jsonl) or ".", exist_ok=True)
+    os.environ.setdefault("TOKENIZERS_PARALLELISM", "false")
+    device = torch.device(f"cuda:{args.gpu_id}" if torch.cuda.is_available() else "cpu")
+    tokenizer = AutoTokenizer.from_pretrained(
+        args.model_name, cache_dir=args.cache_dir, use_fast=True
+    )
+    if tokenizer.pad_token_id is None:
+        tokenizer.pad_token = tokenizer.eos_token or "<|endoftext|>"
+    base = AutoModelForCausalLM.from_pretrained(
+        args.model_name, cache_dir=args.cache_dir,
+        torch_dtype=torch.bfloat16, low_cpu_mem_usage=True,
+    )
+    is_latent = args.latent_mode != "none"
+    if is_latent:
+        from latent_multi_output_cell_policy.grpo_residual_projector_latent_train import (
+            load_trainable_adapter,
+            sample_recurrent_hidden_completion,
+        )
+        model = load_trainable_adapter(
+            base, args.adapter_dir,
+            lora_r=32, lora_alpha=64, lora_dropout=0.05,
+        )
+        if args.latent_mode != "recurrent_hidden":
+            raise SystemExit(f"Only recurrent_hidden latent_mode is wired up here; got {args.latent_mode!r}")
+        sample_fn = sample_recurrent_hidden_completion
+    else:
+        from peft import PeftModel
+        model = PeftModel.from_pretrained(base, args.adapter_dir, is_trainable=False)
+        sample_fn = None
+    if hasattr(model, "config"):
+        model.config.use_cache = True
+    model.to(device).eval()
+    rows = load_jsonl(args.eval_jsonl, args.eval_rows)
+    t0 = time.time()
+    n_cells = 0
+    with open(args.out_jsonl, "w") as fout:
+        for puzzle_id, row in enumerate(rows):
+            solved = make_solved_grid_from_row(row)
+            for ex in build_cell_examples_from_row(row):
+                prompt = build_multi_output_cell_prompt(
+                    ex.grid,
+                    target_cell=ex.target_cell,
+                    stage_i=args.stage_i,
+                    tokenizer=tokenizer,
+                    turn_idx=ex.turn_idx,
+                    total_turns=ex.total_turns,
+                    prev_output_flag=None,
+                    total_empties_hint=args.total_empties_hint,
+                )
+                enc = tokenizer(prompt, return_tensors="pt", add_special_tokens=False)
+                input_ids = enc["input_ids"].to(device)
+                attn = enc["attention_mask"].to(device)
+                with torch.no_grad():
+                    if is_latent:
+                        completion_ids = sample_fn(
+                            model, tokenizer, input_ids, attn,
+                            num_cot_tokens=int(args.num_cot_tokens),
+                            max_new_tokens=max(1, int(args.max_completion_length)),
+                            do_sample=False,
+                        )
+                        pred_text = tokenizer.decode(
+                            completion_ids[0], skip_special_tokens=True
+                        ).strip()
+                    else:
+                        out = model.generate(
+                            input_ids=input_ids,
+                            attention_mask=attn,
+                            max_new_tokens=max(1, int(args.max_completion_length)),
+                            do_sample=False,
+                            eos_token_id=tokenizer.eos_token_id,
+                            pad_token_id=tokenizer.pad_token_id,
+                        )
+                        pred_text = tokenizer.decode(
+                            out[0][input_ids.shape[1]:], skip_special_tokens=True
+                        ).strip()
+                info = score_prediction_text(
+                    text=pred_text,
+                    grid=ex.grid,
+                    solved=solved,
+                    target_cell=ex.target_cell,
+                    stage_i=args.stage_i,
+                    reward_good_value=1.0,
+                    penalty_bad_value=1.0,
+                    penalty_malformed=4.0,
+                    penalty_empty=0.5,
+                    penalty_singleton=1.5,
+                )
+                t1 = sorted(int(v) for v in stage_i_consistent_values(ex.grid, target_cell=ex.target_cell, stage_i=1))
+                t2 = sorted(int(v) for v in stage_i_consistent_values(ex.grid, target_cell=ex.target_cell, stage_i=2))
+                t3 = sorted(int(v) for v in stage_i_consistent_values(ex.grid, target_cell=ex.target_cell, stage_i=3))
+                pred_values_raw = info.get("predicted_values") or []
+                predicted_values = sorted(int(v) for v in pred_values_raw if isinstance(v, (int, float)))
+                rec = {
+                    "method_tag": args.method_tag,
+                    "puzzle_id": int(puzzle_id),
+                    "target_cell": [int(ex.target_cell[0]), int(ex.target_cell[1])],
+                    "target_solution": int(ex.target_value),
+                    "stage_prompted": int(args.stage_i),
+                    "predicted_values": predicted_values,
+                    "predicted_text": pred_text,
+                    "parse_ok": bool(info["parse_ok"]),
+                    "exact_set_match": bool(info["exact_set_match"]),
+                    "target_S1": t1,
+                    "target_S2": t2,
+                    "target_S3": t3,
+                }
+                fout.write(json.dumps(rec) + "\n")
+                n_cells += 1
+            if (puzzle_id + 1) % 10 == 0:
+                print(
+                    f"[{args.method_tag}] puzzle {puzzle_id+1}/{len(rows)} "
+                    f"cells={n_cells} elapsed={time.time()-t0:.0f}s",
+                    flush=True,
+                )
+    print(f"[{args.method_tag}] DONE cells={n_cells} elapsed={time.time()-t0:.0f}s out={args.out_jsonl}")
+if __name__ == "__main__":
+    main()

_experiments/cross_stage/run_all.sh ADDED Viewed

	@@ -0,0 +1,58 @@

+#!/usr/bin/env bash
+# Run the 6-way cross-stage prediction sweep in parallel on GPUs 0-5.
+# Each job writes /home/ubuntu/curriculum_cot/_experiments/cross_stage/preds/<tag>.jsonl
+set -e
+REPO=/home/ubuntu/curriculum-cot-code
+EVAL=/home/ubuntu/curriculum_cot/data/sudoku_t3_20empty_value_qwen_text_stage1_eval.jsonl
+EVAL_ROWS=${EVAL_ROWS:-100}
+OUT_DIR=${OUT_DIR:-/home/ubuntu/curriculum_cot/_experiments/cross_stage/preds}
+LOG_DIR=${LOG_DIR:-/home/ubuntu/curriculum_cot/_experiments/cross_stage/logs}
+mkdir -p "$OUT_DIR" "$LOG_DIR"
+PY=/opt/pytorch/bin/python
+SCRIPT="$REPO/_experiments/cross_stage/predict_one.py"
+# (tag, gpu, adapter_dir, stage_i, latent_mode, num_cot)
+declare -a JOBS=(
+  "atc_s1|0|/home/ubuntu/hf_checkpoints/latent_stages/stage01_latent_grpo_i1_20empty_latent_recurrent_hidden|1|recurrent_hidden|1"
+  "atc_s2|1|/home/ubuntu/hf_checkpoints/latent_stages/grpo/N3_from_main_step800/checkpoint-200|2|recurrent_hidden|3"
+  "atc_s3|2|/home/ubuntu/hf_checkpoints/latent_stages/rebuttal_champion_100p/s3_grpo_baseline_checkpoint-200|3|recurrent_hidden|3"
+  "dc_s1|3|/home/ubuntu/hf_checkpoints/baseline/baseline_lr1e4/s1_grpo_v2|1|none|0"
+  "dc_s2|4|/home/ubuntu/hf_checkpoints/baseline/baseline_lr5e5_lowsft_v3/s2_sft_v3/checkpoint-step-03000|2|none|0"
+  "dc_s3|5|/home/ubuntu/hf_checkpoints/baseline/v6_i_sft_v_oversample10/s3_sft/checkpoint-step-00200|3|none|0"
+)
+PIDS=()
+for entry in "${JOBS[@]}"; do
+  IFS='|' read -r tag gpu adapter stage_i mode cot <<< "$entry"
+  echo "[$(date +%T)] launching $tag on GPU $gpu (stage_i=$stage_i mode=$mode cot=$cot)"
+  CUDA_VISIBLE_DEVICES="$gpu" "$PY" "$SCRIPT" \
+    --method_tag "$tag" \
+    --adapter_dir "$adapter" \
+    --eval_jsonl "$EVAL" \
+    --eval_rows "$EVAL_ROWS" \
+    --stage_i "$stage_i" \
+    --latent_mode "$mode" \
+    --num_cot_tokens "$cot" \
+    --gpu_id 0 \
+    --out_jsonl "$OUT_DIR/$tag.jsonl" \
+    > "$LOG_DIR/$tag.log" 2>&1 &
+  PIDS+=("$!")
+done
+echo "Launched 6 jobs with PIDs: ${PIDS[*]}"
+echo "Logs: $LOG_DIR"
+echo "Outputs: $OUT_DIR"
+echo
+echo "Waiting for all to finish..."
+fail=0
+for pid in "${PIDS[@]}"; do
+  if wait "$pid"; then
+    echo "  pid $pid OK"
+  else
+    echo "  pid $pid FAILED"
+    fail=$((fail + 1))
+  fi
+done
+echo "Done. $fail failures."

_experiments/cross_stage/run_cross_prompt.sh ADDED Viewed

	@@ -0,0 +1,66 @@

+#!/usr/bin/env bash
+# Cross-prompt sweep, phase 1: prompt checkpoints with an OFF-DIAGONAL stage_i.
+#
+# Idea: predict_one.py loads (method, train_stage) adapter and prompts it with
+# any stage_i. If latent CoT preserves cross-stage information, the latent S3
+# adapter should still be able to enumerate the S1 candidate set when asked,
+# while the data-curriculum S3 adapter has "overwritten" that capability.
+#
+# Launches the 5 JOBS below in the background on GPUs 1..5 and exits without waiting.
+# Output:
+#   /home/ubuntu/curriculum_cot/_experiments/cross_stage/preds_xprompt/<tag>.jsonl
+#   /home/ubuntu/curriculum_cot/_experiments/cross_stage/logs_xprompt/<tag>.log
+set -e
+REPO=/home/ubuntu/curriculum-cot-code
+EVAL=/home/ubuntu/curriculum_cot/data/sudoku_t3_20empty_value_qwen_text_stage1_eval.jsonl
+EVAL_ROWS=${EVAL_ROWS:-200}  # NOTE(review): run_all.sh and run_cross_prompt_phase2.sh default to 100 - confirm intended
+OUT_DIR=${OUT_DIR:-/home/ubuntu/curriculum_cot/_experiments/cross_stage/preds_xprompt}
+LOG_DIR=${LOG_DIR:-/home/ubuntu/curriculum_cot/_experiments/cross_stage/logs_xprompt}
+mkdir -p "$OUT_DIR" "$LOG_DIR"
+PY=/opt/pytorch/bin/python
+SCRIPT="$REPO/_experiments/cross_stage/predict_one.py"
+# Adapters re-used from the diagonal sweep (ATC_S1, DC_S1, DC_S2 are not referenced by JOBS below)
+ATC_S1=/home/ubuntu/hf_checkpoints/latent_stages/stage01_latent_grpo_i1_20empty_latent_recurrent_hidden
+ATC_S2=/home/ubuntu/hf_checkpoints/latent_stages/grpo/N3_from_main_step800/checkpoint-200
+ATC_S3=/home/ubuntu/hf_checkpoints/latent_stages/rebuttal_champion_100p/s3_grpo_baseline_checkpoint-200
+DC_S1=/home/ubuntu/hf_checkpoints/baseline/baseline_lr1e4/s1_grpo_v2
+DC_S2=/home/ubuntu/hf_checkpoints/baseline/baseline_lr5e5_lowsft_v3/s2_sft_v3/checkpoint-step-03000
+DC_S3=/home/ubuntu/hf_checkpoints/baseline/v6_i_sft_v_oversample10/s3_sft/checkpoint-step-00200
+# Each row: tag | gpu | adapter_dir | prompt_stage_i | latent_mode | num_cot
+#   - "tag" embeds the (train_stage, prompt_stage) pair so analyze script
+#     can pick them up automatically.
+declare -a JOBS=(
+  # forward-compat: S3 model asked to do S1 / S2 enumeration
+  "atc_train3_prompt1|1|$ATC_S3|1|recurrent_hidden|3"
+  "atc_train3_prompt2|2|$ATC_S3|2|recurrent_hidden|3"
+  "dc_train3_prompt1|3|$DC_S3|1|none|0"
+  "dc_train3_prompt2|4|$DC_S3|2|none|0"
+  # backward-compat: S2 model asked to commit (do S3); the S1->S3 pairs are in run_cross_prompt_phase2.sh
+  "atc_train2_prompt3|5|$ATC_S2|3|recurrent_hidden|3"
+)
+PIDS=()
+for entry in "${JOBS[@]}"; do
+  IFS='|' read -r tag gpu adapter stage_i mode cot <<< "$entry"
+  echo "[$(date +%T)] launching $tag on GPU $gpu (prompt stage_i=$stage_i, mode=$mode, cot=$cot)"
+  CUDA_VISIBLE_DEVICES="$gpu" "$PY" "$SCRIPT" \
+    --method_tag "$tag" \
+    --adapter_dir "$adapter" \
+    --eval_jsonl "$EVAL" \
+    --eval_rows "$EVAL_ROWS" \
+    --stage_i "$stage_i" \
+    --latent_mode "$mode" \
+    --num_cot_tokens "$cot" \
+    --gpu_id 0 \
+    --out_jsonl "$OUT_DIR/$tag.jsonl" \
+    > "$LOG_DIR/$tag.log" 2>&1 &
+  PIDS+=("$!")
+done
+echo "Launched ${#PIDS[@]} cross-prompt jobs: ${PIDS[*]}"
+echo "Logs: $LOG_DIR"
+echo "Outputs: $OUT_DIR"

_experiments/cross_stage/run_cross_prompt_phase2.sh ADDED Viewed

	@@ -0,0 +1,93 @@

+#!/usr/bin/env bash
+# Phase-2 cross-prompt sweep: the remaining off-diagonals.
+#
+# Phase 1 covered:
+#   atc_train3_prompt1, atc_train3_prompt2,
+#   dc_train3_prompt1,  dc_train3_prompt2,
+#   atc_train2_prompt3.
+#
+# Phase 2 fills in:
+#   atc_train1_prompt2, atc_train1_prompt3,
+#   atc_train2_prompt1,
+#   dc_train1_prompt2,  dc_train1_prompt3,
+#   dc_train2_prompt1,  dc_train2_prompt3.
+#
+# The 7 jobs run in two waves: five on GPUs 1-5, then, once those have
+# exited, the two dc_train2 jobs on GPUs 1-2.  GPUs 0, 6 and 7 are not used
+# here (the original note says 6/7 were occupied by long-running trainers).
+# A tag is skipped when its output file exists and its log already contains
+# the "DONE cells=" marker.
+#
+# Failed jobs are reported on stderr, but the exit status stays 0 as before:
+# overnight_pipeline.sh calls this script under `set -e` and should still
+# reach its analysis step with partial results.
+set -e
+REPO=/home/ubuntu/curriculum-cot-code
+EVAL=/home/ubuntu/curriculum_cot/data/sudoku_t3_20empty_value_qwen_text_stage1_eval.jsonl
+EVAL_ROWS=${EVAL_ROWS:-100}
+OUT_DIR=${OUT_DIR:-/home/ubuntu/curriculum_cot/_experiments/cross_stage/preds_xprompt}
+LOG_DIR=${LOG_DIR:-/home/ubuntu/curriculum_cot/_experiments/cross_stage/logs_xprompt}
+mkdir -p "$OUT_DIR" "$LOG_DIR"
+PY=/opt/pytorch/bin/python
+SCRIPT="$REPO/_experiments/cross_stage/predict_one.py"
+ATC_S1=/home/ubuntu/hf_checkpoints/latent_stages/stage01_latent_grpo_i1_20empty_latent_recurrent_hidden
+ATC_S2=/home/ubuntu/hf_checkpoints/latent_stages/grpo/N3_from_main_step800/checkpoint-200
+DC_S1=/home/ubuntu/hf_checkpoints/baseline/baseline_lr1e4/s1_grpo_v2
+DC_S2=/home/ubuntu/hf_checkpoints/baseline/baseline_lr5e5_lowsft_v3/s2_sft_v3/checkpoint-step-03000
+DC_S3=/home/ubuntu/hf_checkpoints/baseline/v6_i_sft_v_oversample10/s3_sft/checkpoint-step-00200
+# Each row: tag | gpu | adapter_dir | prompt_stage_i | latent_mode | num_cot
+declare -a JOBS=(
+  "atc_train1_prompt2|1|$ATC_S1|2|recurrent_hidden|1"
+  "atc_train1_prompt3|2|$ATC_S1|3|recurrent_hidden|1"
+  "atc_train2_prompt1|3|$ATC_S2|1|recurrent_hidden|3"
+  "dc_train1_prompt2|4|$DC_S1|2|none|0"
+  "dc_train1_prompt3|5|$DC_S1|3|none|0"
+)
+# Second wave: 2 more dc_train2 jobs on GPUs 1-2
+declare -a JOBS3=(
+  "dc_train2_prompt1|1|$DC_S2|1|none|0"
+  "dc_train2_prompt3|2|$DC_S2|3|none|0"
+)
+# run_wave ARRAY_NAME
+# Launch every not-yet-finished job of the named array in the background,
+# then wait for each one individually so failures are noticed.  Sets
+# WAVE_LAUNCHED and WAVE_FAILED for the caller; always returns 0.
+run_wave() {
+  local -n _wave_jobs=$1
+  local entry tag gpu adapter stage_i mode cot i
+  local -a pids=() tags=()
+  local failed=0
+  for entry in "${_wave_jobs[@]}"; do
+    IFS='|' read -r tag gpu adapter stage_i mode cot <<< "$entry"
+    if [ -f "$OUT_DIR/$tag.jsonl" ] && grep -q "DONE cells=" "$LOG_DIR/$tag.log" 2>/dev/null; then
+      echo "[$(date +%T)] $tag already done, skip"
+      continue
+    fi
+    echo "[$(date +%T)] launching $tag on GPU $gpu (prompt stage_i=$stage_i, mode=$mode, cot=$cot)"
+    # CUDA_VISIBLE_DEVICES exposes exactly one GPU to the job, hence --gpu_id 0.
+    CUDA_VISIBLE_DEVICES="$gpu" "$PY" "$SCRIPT" \
+      --method_tag "$tag" \
+      --adapter_dir "$adapter" \
+      --eval_jsonl "$EVAL" \
+      --eval_rows "$EVAL_ROWS" \
+      --stage_i "$stage_i" \
+      --latent_mode "$mode" \
+      --num_cot_tokens "$cot" \
+      --gpu_id 0 \
+      --out_jsonl "$OUT_DIR/$tag.jsonl" \
+      > "$LOG_DIR/$tag.log" 2>&1 &
+    pids+=("$!")
+    tags+=("$tag")
+  done
+  # A bare `wait` returns 0 regardless of job status; wait per PID instead.
+  for i in "${!pids[@]}"; do
+    if ! wait "${pids[$i]}"; then
+      echo "[$(date +%T)] ${tags[$i]} FAILED (see $LOG_DIR/${tags[$i]}.log)" >&2
+      failed=$((failed + 1))
+    fi
+  done
+  WAVE_LAUNCHED=${#pids[@]}
+  WAVE_FAILED=$failed
+}
+run_wave JOBS
+total_failed=$WAVE_FAILED
+echo "Phase 2 complete: $WAVE_LAUNCHED jobs ($WAVE_FAILED failed)"
+run_wave JOBS3
+total_failed=$((total_failed + WAVE_FAILED))
+echo "All cross-prompt sweeps done. $total_failed failures."

_experiments/cross_stage/run_nocurr_cot.sh ADDED Viewed

	@@ -0,0 +1,79 @@

+#!/usr/bin/env bash
+# Long "No Curriculum + Latent CoT" SFT runs.
+#
+# Each variant trains the latent (recurrent-hidden) model directly on the
+# Stage-3 target labels (--stage_i 3) with no curriculum --- only the number
+# of latent CoT tokens (`num_cot_tokens`) varies between variants.  This is
+# the "no curriculum but with latent thoughts" cell of the factorial.
+#
+# Variants are warm-started from the well-trained k=0 checkpoint from the
+# previous adaptive-k sweep, then trained for many SFT steps so the model
+# has time to make use of the extra latent capacity.  Each variant uses
+# ALL 10000 training rows.
+#
+# Usage:
+#   bash run_nocurr_cot.sh "<GPU,VARIANT_TAG,NUM_COT,LR,OVERSAMPLE>" ...
+# Example:
+#   bash run_nocurr_cot.sh \
+#     "6,nocurr_cot_k2_lr2e5_o5,2,2e-5,5" \
+#     "7,nocurr_cot_k3_lr2e5_o5,3,2e-5,5"
+#
+# Every spec is validated before anything is created or launched; a missing
+# or malformed spec exits with status 2.  Jobs are started in the background
+# with nohup and this script returns immediately after launching them (it
+# does not wait for training to finish).  The last line printed is OUT_ROOT.
+#
+set -e
+if [ "$#" -eq 0 ]; then
+  echo "usage: bash $0 \"<GPU,VARIANT_TAG,NUM_COT,LR,OVERSAMPLE>\" ..." >&2
+  exit 2
+fi
+# Check all specs up front so a typo in a later spec cannot leave earlier
+# jobs already running on their GPUs.
+for spec in "$@"; do
+  IFS=',' read -r gpu tag cot lr oversample extra <<< "$spec"
+  if [ -z "$gpu" ] || [ -z "$tag" ] || [ -z "$cot" ] || [ -z "$lr" ] \
+     || [ -z "$oversample" ] || [ -n "$extra" ]; then
+    echo "bad spec '$spec': expected exactly 5 comma-separated fields GPU,VARIANT_TAG,NUM_COT,LR,OVERSAMPLE" >&2
+    exit 2
+  fi
+done
+REPO=/home/ubuntu/curriculum-cot-code
+SFT_PY="$REPO/latent_multi_output_cell_policy/sft_latent_multi_output_train.py"
+TRAIN=/home/ubuntu/curriculum_cot/data/sudoku_t3_20empty_value_qwen_text_stage1_train.jsonl
+EVAL=/home/ubuntu/curriculum_cot/data/sudoku_t3_20empty_value_qwen_text_stage1_eval.jsonl
+INIT_ADAPTER=/home/ubuntu/hf_checkpoints/adaptive_k/20260525_024629/adaptive_a_eps01/sft_phase02_k0/checkpoint-step-00600
+# One timestamped root per invocation; each variant gets its own subdirectory.
+OUT_ROOT=/home/ubuntu/curriculum_cot/_runs/nocurr_cot_$(date +%Y%m%d_%H%M%S)
+mkdir -p "$OUT_ROOT"
+echo "OUT_ROOT=$OUT_ROOT"
+PY=/opt/pytorch/bin/python
+PIDS=()
+for spec in "$@"; do
+  IFS=',' read -r gpu tag cot lr oversample <<< "$spec"
+  out="$OUT_ROOT/$tag"
+  mkdir -p "$out"
+  echo "[$(date +%T)] launching $tag on GPU $gpu (num_cot=$cot lr=$lr oversample=$oversample)"
+  # CUDA_VISIBLE_DEVICES exposes only the chosen GPU to the process, so the
+  # trainer always addresses it as device 0 (--gpu_id 0).
+  # NOTE(review): the four *_stop thresholds are 0 and min_steps_before_stop
+  # is far above --max_steps 3000; presumably this disables early stopping
+  # so each run goes to max_steps -- confirm against the trainer.
+  CUDA_VISIBLE_DEVICES="$gpu" nohup "$PY" -u "$SFT_PY" \
+    --model_name Qwen/Qwen2.5-1.5B-Instruct \
+    --train_jsonl "$TRAIN" \
+    --eval_jsonl "$EVAL" \
+    --output_dir "$out" \
+    --cache_dir "$REPO/.hf_cache" \
+    --init_adapter_dir "$INIT_ADAPTER" \
+    --seed 0 \
+    --gpu_id 0 \
+    --stage_i 3 \
+    --num_cot_tokens "$cot" \
+    --latent_mode recurrent_hidden \
+    --total_empties_hint 20 \
+    --per_device_train_batch_size 8 \
+    --gradient_accumulation_steps 4 \
+    --num_epochs 256 \
+    --learning_rate "$lr" \
+    --max_grad_norm 1.0 \
+    --logging_steps 25 \
+    --eval_steps 250 \
+    --save_steps 250 \
+    --eval_rows 100 \
+    --max_completion_length 24 \
+    --limit_train_rows 10000 \
+    --lora_r 32 --lora_alpha 64 --lora_dropout 0.05 \
+    --multi_value_oversample_factor "$oversample" \
+    --train_target_size_min 0 --train_target_size_max 0 \
+    --eval_value_precision_stop 0 \
+    --eval_value_recall_stop 0 \
+    --eval_exact_set_match_stop 0 \
+    --eval_solve_rate_stop 0 \
+    --min_steps_before_stop 1000000 \
+    --max_wall_clock_seconds 0 \
+    --max_steps 3000 \
+    --enable_gradient_checkpointing \
+    > "$out/train.log" 2>&1 &
+  PIDS+=("$!")
+done
+echo "Launched ${#PIDS[@]} jobs: ${PIDS[*]}"
+echo "$OUT_ROOT"

_experiments/cross_stage/watcher_launch_more.sh ADDED Viewed

	@@ -0,0 +1,32 @@

+#!/usr/bin/env bash
+# Wait for the 6 cross-stage prediction jobs to finish, then launch 6 more
+# long no-curr+CoT variants on the freed GPUs (0..5).
+#
+# A job counts as finished once its log in LOG_DIR contains the
+# "DONE cells=" marker.  Logs are polled every 60 seconds.
+#
+# By default the wait is unbounded, so a job that dies before writing its
+# marker stalls the watcher forever.  Set WATCHER_TIMEOUT_S to a positive
+# number of seconds to give up (exit status 1) instead.
+#
+# Run with nohup.
+LOG_DIR=/home/ubuntu/curriculum_cot/_experiments/cross_stage/logs
+SCRIPT=/home/ubuntu/curriculum-cot-code/_experiments/cross_stage/run_nocurr_cot.sh
+# Single source of truth for the jobs being waited on; the completion
+# threshold below is derived from this list rather than hard-coded.
+TAGS=(atc_s1 atc_s2 atc_s3 dc_s1 dc_s2 dc_s3)
+TIMEOUT_S="${WATCHER_TIMEOUT_S:-0}"
+# Fail fast: discover a missing launcher now, not after hours of waiting.
+if [ ! -f "$SCRIPT" ]; then
+  echo "[$(date +%T)] launcher not found: $SCRIPT" >&2
+  exit 1
+fi
+echo "[$(date +%T)] waiting for the 6 cross-stage jobs to finish..."
+while true; do
+  done_count=0
+  for tag in "${TAGS[@]}"; do
+    # A missing log simply counts as "not done yet" (grep errors are muted).
+    if grep -q "DONE cells=" "$LOG_DIR/$tag.log" 2>/dev/null; then
+      done_count=$((done_count + 1))
+    fi
+  done
+  if [ "$done_count" -ge "${#TAGS[@]}" ]; then
+    break
+  fi
+  # SECONDS is bash's built-in count of seconds since the script started.
+  if [ "$TIMEOUT_S" -gt 0 ] 2>/dev/null && [ "$SECONDS" -ge "$TIMEOUT_S" ]; then
+    echo "[$(date +%T)] timed out after ${SECONDS}s with $done_count/${#TAGS[@]} jobs done; not launching" >&2
+    exit 1
+  fi
+  sleep 60
+done
+echo "[$(date +%T)] cross-stage done; launching 6 more no-curr+CoT variants on GPUs 0..5"
+# Each spec is "GPU,VARIANT_TAG,NUM_COT,LR,OVERSAMPLE".
+if ! bash "$SCRIPT" \
+  "0,nocurr_cot_k1_lr2e5_o5,1,2e-5,5" \
+  "1,nocurr_cot_k2_lr1e5_o5,2,1e-5,5" \
+  "2,nocurr_cot_k3_lr1e5_o5,3,1e-5,5" \
+  "3,nocurr_cot_k2_lr2e5_o10,2,2e-5,10" \
+  "4,nocurr_cot_k3_lr2e5_o10,3,2e-5,10" \
+  "5,nocurr_cot_k3_lr5e5_o5,3,5e-5,5"; then
+  # Do not report success when the launcher itself failed.
+  echo "[$(date +%T)] launcher failed: $SCRIPT" >&2
+  exit 1
+fi
+echo "[$(date +%T)] 6 more launched."

_runs/_paper_figures/plot_stage_progression.py CHANGED Viewed

@@ -1,5 +1,12 @@
-"""Paper-style figures: Solve rate + Per-cell exact across stages.  Two separate figures,
-no titles, no footer text — only axes, lines, markers, legend.
 """
 from __future__ import annotations
@@ -13,11 +20,40 @@ OUT_DIR = Path(__file__).resolve().parent
 # -------------------------------- DATA ---------------------------------------
 STAGES = ["Stage 1", "Stage 2", "Stage 3"]
-LATENT_SOLVE   = [0.70, 0.50, 0.58]
-LATENT_EXACT   = [0.95, 0.958, 0.967]
-BASELINE_SOLVE = [0.78, 0.40, 0.44]
-BASELINE_EXACT = [0.988, 0.88, 0.83]
 # -------------------------------- STYLE --------------------------------------
 mpl.rcParams.update({
@@ -41,30 +77,36 @@ mpl.rcParams.update({
     "ps.fonttype": 42,
 })
-LATENT_COLOR   = "#1f4f8b"
-BASELINE_COLOR = "#b21e2f"
 GRID_KW = dict(linestyle=":", linewidth=0.7, color="0.7", alpha=0.7)
 x = list(range(len(STAGES)))
-def _plot(y_latent, y_baseline, ylim, yticks, ylabel, fname):
     fig, ax = plt.subplots(figsize=(4.6, 3.4), constrained_layout=True)
     ax.plot(
-        x, y_latent,
-        color=LATENT_COLOR, marker="s", linestyle="-",
-        label="Latent (recurrent-hidden)",
     )
     ax.plot(
-        x, y_baseline,
-        color=BASELINE_COLOR, marker="o", linestyle="--",
-        label="Baseline (vanilla 1.5B)",
     )
     ax.set_xticks(x, STAGES)
     ax.set_ylim(*ylim)
     ax.set_yticks(yticks)
     ax.set_ylabel(ylabel)
     ax.grid(True, axis="y", **GRID_KW)
-    ax.legend(frameon=False, loc="best")
     fig.savefig(OUT_DIR / f"{fname}.pdf", bbox_inches="tight")
     fig.savefig(OUT_DIR / f"{fname}.png", dpi=300, bbox_inches="tight")
     plt.close(fig)
@@ -72,16 +114,34 @@ def _plot(y_latent, y_baseline, ylim, yticks, ylabel, fname):
 _plot(
-    LATENT_SOLVE, BASELINE_SOLVE,
     ylim=(0.0, 1.0),
     yticks=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
     ylabel="Solve rate",
     fname="stage_progression_solve",
 )
 _plot(
-    LATENT_EXACT, BASELINE_EXACT,
-    ylim=(0.78, 1.00),
-    yticks=[0.80, 0.84, 0.88, 0.92, 0.96, 1.00],
     ylabel="Per-cell set-match rate",
     fname="stage_progression_exact",
 )

+"""Paper-style figures: Solve rate / Per-cell exact / Value precision / Value recall
+across the three curriculum stages.  Four separate figures, no titles, no footer
+text — only axes, lines, markers, legend.
+Three series in every figure:
+  ATC                 — latent recurrent-hidden, stage-curriculum (S1 / S2 / S3)
+  Data Curriculum     — vanilla 1.5B, stage-curriculum (S1 / S2 / S3)
+  No CoT, No Curr.    — vanilla 1.5B trained on the Stage-3 task only,
+                        no thought tokens, no curriculum (horizontal reference)
 """
 from __future__ import annotations
 # -------------------------------- DATA ---------------------------------------
 STAGES = ["Stage 1", "Stage 2", "Stage 3"]
+# Solve rate
+ATC_SOLVE   = [0.70, 0.50, 0.58]
+DC_SOLVE    = [0.78, 0.40, 0.44]
+NOCURR_SOLVE = 0.33
+# Per-cell exact set-match
+ATC_EXACT   = [0.95, 0.958, 0.967]
+DC_EXACT    = [0.988, 0.88, 0.83]
+NOCURR_EXACT = 0.80
+# Value precision
+#   ATC S1: approximated (Stage-1 latent SFT/GRPO log only stores reward;
+#           Stage-1 GRPO converged with solve≈0.95 on 40p eval → prec~0.96).
+#   ATC S2: STAGE12_TRAJECTORY.md, step 2600 (best per-cell): prec=0.960.
+#   ATC S3: headtohead_s3/s3_grpo_baseline step200: prec=0.967.
+#   DC  S1: baseline_lr1e4/s1_grpo_v2 (solve 0.78):       prec=0.996.
+#   DC  S2: baseline_lr5e5_lowsft_v3/s2_sft_v3 step 3000: prec=0.911.
+#   DC  S3: baseline v6_i_sft_v_oversample10/s3_sft step 200: prec=0.955.
+#   NoCurr: strawman_warm_e (lr=1e-5, oversample=5) SFT-end: prec=0.945.
+ATC_PREC    = [0.96, 0.960, 0.967]
+DC_PREC     = [0.996, 0.911, 0.955]
+NOCURR_PREC = 0.945
+# Value recall
+#   ATC S1: approximated (see prec note); rec~0.96.
+#   ATC S2: STAGE12_TRAJECTORY.md, step 2600: rec=0.949.
+#   ATC S3: headtohead_s3/s3_grpo_baseline step200: rec=0.968.
+#   DC  S1: baseline_lr1e4/s1_grpo_v2: rec=0.998.
+#   DC  S2: baseline_lr5e5_lowsft_v3/s2_sft_v3 step 3000: rec=0.931.
+#   DC  S3: baseline v6_i_sft_v_oversample10/s3_sft step 200: rec=0.954.
+#   NoCurr: strawman_warm_e SFT-end: rec=0.944.
+ATC_REC     = [0.96, 0.949, 0.968]
+DC_REC      = [0.998, 0.931, 0.954]
+NOCURR_REC  = 0.944
 # -------------------------------- STYLE --------------------------------------
 mpl.rcParams.update({
     "ps.fonttype": 42,
 })
+ATC_COLOR    = "#1f4f8b"
+DC_COLOR     = "#b21e2f"
+NOCURR_COLOR = "#3a7d3a"
 GRID_KW = dict(linestyle=":", linewidth=0.7, color="0.7", alpha=0.7)
 x = list(range(len(STAGES)))
+def _plot(y_atc, y_dc, y_nocurr, ylim, yticks, ylabel, fname, legend_loc):
+    """Draw one stage-progression figure and save it as PDF and PNG.
+
+    Args:
+        y_atc: Per-stage values for the "ATC" series (solid line, square
+            markers), plotted against the stage indices in ``x``.
+        y_dc: Per-stage values for the "Data Curriculum" series (dashed
+            line, circle markers), plotted against the same ``x``.
+        y_nocurr: Single value drawn as a dotted horizontal reference line
+            labelled "No CoT, No Curriculum".
+        ylim: ``(low, high)`` pair unpacked into ``ax.set_ylim``.
+        yticks: Tick positions for the y axis.
+        ylabel: Y-axis label text.
+        fname: File stem; ``<fname>.pdf`` and ``<fname>.png`` (300 dpi) are
+            written into ``OUT_DIR``.
+        legend_loc: Legend location passed to ``ax.legend(loc=...)``.
+    """
     fig, ax = plt.subplots(figsize=(4.6, 3.4), constrained_layout=True)
     ax.plot(
+        x, y_atc,
+        color=ATC_COLOR, marker="s", linestyle="-",
+        label="ATC",
     )
     ax.plot(
+        x, y_dc,
+        color=DC_COLOR, marker="o", linestyle="--",
+        label="Data Curriculum",
+    )
+    ax.axhline(
+        y=y_nocurr,
+        color=NOCURR_COLOR, linestyle=":", linewidth=2.0,
+        label="No CoT, No Curriculum",
     )
     ax.set_xticks(x, STAGES)
     ax.set_ylim(*ylim)
     ax.set_yticks(yticks)
     ax.set_ylabel(ylabel)
     ax.grid(True, axis="y", **GRID_KW)
+    ax.legend(frameon=False, loc=legend_loc)
     fig.savefig(OUT_DIR / f"{fname}.pdf", bbox_inches="tight")
     fig.savefig(OUT_DIR / f"{fname}.png", dpi=300, bbox_inches="tight")
     plt.close(fig)
+# Solve rate: full [0, 1] axis.
 _plot(
+    ATC_SOLVE, DC_SOLVE, NOCURR_SOLVE,
     ylim=(0.0, 1.0),
     yticks=[0.0, 0.2, 0.4, 0.6, 0.8, 1.0],
     ylabel="Solve rate",
     fname="stage_progression_solve",
+    legend_loc="upper right",
 )
+# Per-cell exact set-match: the axis starts at 0.70 so the 0.80
+# no-curriculum reference line lies inside the plotted range.
 _plot(
+    ATC_EXACT, DC_EXACT, NOCURR_EXACT,
+    ylim=(0.70, 1.00),
+    yticks=[0.72, 0.76, 0.80, 0.84, 0.88, 0.92, 0.96, 1.00],
     ylabel="Per-cell set-match rate",
     fname="stage_progression_exact",
+    legend_loc="lower left",
+)
+# Value precision: axis zoomed to [0.86, 1.00].
+_plot(
+    ATC_PREC, DC_PREC, NOCURR_PREC,
+    ylim=(0.86, 1.00),
+    yticks=[0.88, 0.90, 0.92, 0.94, 0.96, 0.98, 1.00],
+    ylabel="Value precision",
+    fname="stage_progression_precision",
+    legend_loc="lower left",
+)
+# Value recall: same axis range and ticks as the precision figure.
+_plot(
+    ATC_REC, DC_REC, NOCURR_REC,
+    ylim=(0.86, 1.00),
+    yticks=[0.88, 0.90, 0.92, 0.94, 0.96, 0.98, 1.00],
+    ylabel="Value recall",
+    fname="stage_progression_recall",
+    legend_loc="lower left",
 )