| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| | |
| |
|
| | import argparse |
| | import glob |
| | import json |
| | import os |
| | import re |
| | from dataclasses import dataclass |
| | from typing import Dict, List, Optional, Tuple |
| |
|
| | import numpy as np |
| | import pandas as pd |
| | import matplotlib.pyplot as plt |
| |
|
| |
|
| | |
| | |
| | |
# Experiment configurations: each config letter maps to its HNO level and its
# prompting variant. Three variants per HNO level, nine configs in total.
CONFIG_META = {
    "A": dict(hno="HNO3", variant="0-shot"),
    "B": dict(hno="HNO3", variant="CoT"),
    "C": dict(hno="HNO3", variant="Fake CoT"),
    "D": dict(hno="HNO2", variant="0-shot"),
    "E": dict(hno="HNO2", variant="CoT"),
    "F": dict(hno="HNO2", variant="Fake CoT"),
    "G": dict(hno="HNO1", variant="0-shot"),
    "H": dict(hno="HNO1", variant="CoT"),
    "I": dict(hno="HNO1", variant="Fake CoT"),
}

# Canonical ordering of evaluation-set types: the original set first, then the
# five paraphrase sets, three reverse sets and four aggregate sets.
EVAL_TYPE_ORDER = (
    ["Original"]
    + [f"Paraphrase P{i}" for i in range(1, 6)]
    + [f"Reverse R{i}" for i in range(1, 4)]
    + [f"Aggregate A{i}" for i in range(1, 5)]
)
| |
|
| | |
# Filename suffix patterns: "_P<k>", "_R<k>" or "_A<k>" immediately followed by
# either ".json" or "_results.json" at the very end of the name.
RE_P = re.compile(r"_P([1-5])(?:\.json|_results\.json)$")
RE_R = re.compile(r"_R([1-3])(?:\.json|_results\.json)$")
RE_A = re.compile(r"_A([1-4])(?:\.json|_results\.json)$")


def infer_eval_type_from_filename(fn: str) -> str:
    """
    Map a results filename to its eval-type label.

    Only the basename of ``fn`` is inspected. The paraphrase, reverse and
    aggregate suffix patterns are tried in that order; a name matching none of
    them is labelled "Original".
    """
    name = os.path.basename(fn)
    labelled_patterns = (
        (RE_P, "Paraphrase P"),
        (RE_R, "Reverse R"),
        (RE_A, "Aggregate A"),
    )
    for pattern, label in labelled_patterns:
        hit = pattern.search(name)
        if hit is not None:
            return label + hit.group(1)
    return "Original"
| |
|
| |
|
def safe_read_json(path: str):
    """
    Best-effort JSON loader.

    Opens ``path`` as UTF-8 text and returns the parsed JSON value. Returns
    None when the file cannot be opened or read (OSError), is not valid
    UTF-8 or valid JSON (both are ValueError subclasses), or is nested too
    deeply for the parser (RecursionError).

    Other exceptions, such as a TypeError from passing a non-path argument,
    are deliberately not swallowed so that caller bugs stay visible.
    """
    try:
        with open(path, "r", encoding="utf-8") as f:
            return json.load(f)
    except (OSError, ValueError, RecursionError):
        return None
| |
|
| |
|
def list_result_files(config_dir: str) -> List[str]:
    """Return the sorted paths matching ``*_results.json`` directly inside ``config_dir``."""
    pattern = os.path.join(config_dir, "*_results.json")
    matches = glob.glob(pattern)
    matches.sort()
    return matches
| |
|
| |
|
def extract_steps_from_one_entry(entry: dict) -> List[int]:
    """
    Collect the step numbers encoded in an entry's ``step_<n>`` keys.

    Returns the distinct integers found, in ascending order. Keys that are
    not strings, do not start with ``step_``, or whose suffix is not an
    integer are ignored. An ``entry`` that is not a dict yields an empty
    list instead of raising, since entries come from JSON files on disk.
    """
    if not isinstance(entry, dict):
        return []

    found = set()
    for key in entry:
        if not (isinstance(key, str) and key.startswith("step_")):
            continue
        try:
            found.add(int(key.split("_", 1)[1]))
        except ValueError:
            # Suffix is not an integer (e.g. "step_final"); not a checkpoint key.
            continue
    return sorted(found)
| |
|
| |
|
def summarize_results_file(path: str) -> Optional[pd.DataFrame]:
    """
    Summarize one results file into per-step mean accuracy.

    The file is expected to hold a JSON list of entries, each carrying
    ``step_<s>`` keys whose value is a dict with an ``accuracy`` number.

    Returns a dataframe with columns ``step``, ``accuracy_mean`` and ``n``
    (the number of entries contributing to that step), sorted by step.
    Returns None when the file is unreadable, is not a non-empty list, or
    holds no usable accuracy values.

    The step set is the union over all entries, so a step that is missing
    from the first entry is still reported. Entries that are not dicts and
    step payloads that are not dicts are skipped rather than raising.
    """
    data = safe_read_json(path)
    if not isinstance(data, list) or not data:
        return None

    entries = [e for e in data if isinstance(e, dict)]
    steps = sorted({s for e in entries for s in extract_steps_from_one_entry(e)})
    if not steps:
        return None

    rows = []
    for s in steps:
        key = f"step_{s}"
        accs = []
        for e in entries:
            payload = e.get(key)
            if not isinstance(payload, dict):
                continue
            a = payload.get("accuracy")
            if isinstance(a, (int, float)):
                accs.append(float(a))
        if not accs:
            continue
        rows.append(
            {
                "step": s,
                "accuracy_mean": float(np.mean(accs)),
                "n": len(accs),
            }
        )

    if not rows:
        return None
    return pd.DataFrame(rows).sort_values("step").reset_index(drop=True)
| |
|
| |
|
def build_long_dataframe(base_dir: str, configs: List[str]) -> pd.DataFrame:
    """
    Assemble the long-form results table across configs.

    For every config whose directory exists under ``base_dir``, each
    ``*_results.json`` file is summarized and contributes one row per step.

    Columns: config, hno, variant, eval_file, eval_type, step, accuracy, n.
    ``eval_type`` is an ordered categorical following EVAL_TYPE_ORDER. Rows
    are sorted by hno, variant, config, eval_type, step. An empty dataframe
    is returned when nothing could be summarized.
    """
    fallback_meta = {"hno": "UNKNOWN", "variant": "UNKNOWN"}
    records = []

    for cfg in configs:
        config_dir = os.path.join(base_dir, cfg)
        if not os.path.isdir(config_dir):
            continue
        meta = CONFIG_META.get(cfg, fallback_meta)

        for result_path in list_result_files(config_dir):
            eval_type = infer_eval_type_from_filename(result_path)
            summary = summarize_results_file(result_path)
            if summary is None:
                continue

            # Record the eval file name without the "_results" suffix.
            eval_file = os.path.basename(result_path).replace("_results.json", ".json")
            records.extend(
                {
                    "config": cfg,
                    "hno": meta["hno"],
                    "variant": meta["variant"],
                    "eval_file": eval_file,
                    "eval_type": eval_type,
                    "step": int(row.step),
                    "accuracy": float(row.accuracy_mean),
                    "n": int(row.n),
                }
                for row in summary.itertuples(index=False)
            )

    long_df = pd.DataFrame(records)
    if long_df.empty:
        return long_df

    # Ordered categorical so sorting and plotting follow the canonical order.
    long_df["eval_type"] = pd.Categorical(
        long_df["eval_type"], categories=EVAL_TYPE_ORDER, ordered=True
    )
    sort_keys = ["hno", "variant", "config", "eval_type", "step"]
    return long_df.sort_values(sort_keys).reset_index(drop=True)
| |
|
| |
|
def ensure_dir(path: str) -> None:
    """Create ``path`` (including missing parents); do nothing if it already exists."""
    os.makedirs(name=path, exist_ok=True)
| |
|
| |
|
def save_fig(fig: plt.Figure, out_dir: str, name: str) -> None:
    """
    Write ``fig`` to ``<out_dir>/<name>.png`` (200 dpi) and ``<name>.pdf``, then close it.

    ``out_dir`` is created if needed. The figure is closed even when saving
    fails, so a failed write does not leave the figure open; the original
    exception still propagates to the caller.
    """
    try:
        ensure_dir(out_dir)
        png = os.path.join(out_dir, f"{name}.png")
        pdf = os.path.join(out_dir, f"{name}.pdf")
        fig.savefig(png, dpi=200, bbox_inches="tight")
        fig.savefig(pdf, bbox_inches="tight")
    finally:
        plt.close(fig)
| |
|
| |
|
def pick_final_step(df: pd.DataFrame, preferred_step: Optional[int] = 10000) -> int:
    """
    Choose the step treated as the "final" checkpoint.

    Args:
        df: dataframe with a ``step`` column.
        preferred_step: step to use whenever it is present in ``df``, even
            if larger steps exist. Pass None to always use the largest step.
            Defaults to 10000.

    Returns:
        ``preferred_step`` when present, otherwise the largest step in
        ``df``; 0 when ``df`` has no ``step`` column or no rows.
    """
    if "step" not in df.columns:
        return 0
    steps = df["step"].unique().tolist()
    if not steps:
        return 0
    if preferred_step is not None and preferred_step in steps:
        return preferred_step
    return max(steps)
| |
|
| |
|
| | |
| | |
| | |
def fig_scaling_curves_overall(df: pd.DataFrame, out_dir: str) -> None:
    """
    Q4: one accuracy-vs-step figure per HNO level, with one line per config.

    Each point is the mean accuracy over all rows (every eval file/type) of
    that config at that step. HNO levels without rows are skipped. Figures
    are saved as ``Q4_scaling_curves_overall_<hno>``.
    """
    if df.empty:
        return

    keys = ["config", "hno", "variant", "step"]
    overall = df.groupby(keys, as_index=False)["accuracy"].mean()
    overall = overall.rename(columns={"accuracy": "acc_mean_overall"})

    for hno in ("HNO1", "HNO2", "HNO3"):
        per_hno = overall[overall["hno"] == hno].copy()
        if per_hno.empty:
            continue

        fig, ax = plt.subplots()
        for cfg, curve in per_hno.groupby("config"):
            curve = curve.sort_values("step")
            variant = CONFIG_META[cfg]["variant"]
            ax.plot(
                curve["step"],
                curve["acc_mean_overall"],
                marker="o",
                linewidth=1,
                label=f"{cfg} ({variant})",
            )

        ax.set_title(f"Scaling (Accuracy vs Steps) — {hno} — Mean over all eval sets")
        ax.set_xlabel("Training step (checkpoint)")
        ax.set_ylabel("Accuracy")
        ax.set_ylim(0.0, 1.0)
        ax.grid(True, linewidth=0.5, alpha=0.5)
        ax.legend(loc="lower right", fontsize=8)

        save_fig(fig, out_dir, f"Q4_scaling_curves_overall_{hno}")
| |
|
| |
|
def fig_scaling_curves_by_eval_type(df: pd.DataFrame, out_dir: str) -> None:
    """
    Q2/Q4: accuracy vs step with one line per eval type, one figure per config.

    Within a config, rows sharing an eval type and step are averaged before
    plotting. Figures are saved as ``Q2Q4_scaling_by_evaltype_config_<cfg>``.
    """
    if df.empty:
        return

    for cfg in sorted(df["config"].unique().tolist()):
        sub = df[df["config"] == cfg]
        if sub.empty:
            continue

        fig = plt.figure(figsize=(10, 6))
        ax = fig.add_subplot(1, 1, 1)

        # observed=True: when eval_type is categorical, grouping with
        # observed=False yields an empty group for every category absent from
        # this config, which would add data-less entries to the legend.
        for et, etdf in sub.groupby("eval_type", observed=True):
            curve = etdf.groupby("step", as_index=False)["accuracy"].mean().sort_values("step")
            ax.plot(curve["step"], curve["accuracy"], marker="o", linewidth=1, label=str(et))

        meta = CONFIG_META.get(cfg, {})
        ax.set_title(f"Scaling by Eval Set — Config {cfg} ({meta.get('hno','?')}, {meta.get('variant','?')})")
        ax.set_xlabel("Training step (checkpoint)")
        ax.set_ylabel("Accuracy")
        ax.set_ylim(0.0, 1.0)
        ax.grid(True, linewidth=0.5, alpha=0.5)
        ax.legend(loc="lower right", fontsize=8, ncol=2)

        save_fig(fig, out_dir, f"Q2Q4_scaling_by_evaltype_config_{cfg}")
| |
|
| |
|
def fig_entropy_effect_final(df: pd.DataFrame, out_dir: str) -> None:
    """
    Q1: compare HNO1 / HNO2 / HNO3 at the final step, one bar per variant.

    Two grouped bar charts are produced (each only if it has data):
      - accuracy on the "Original" eval type only;
      - accuracy averaged per config over all eval rows, then per (hno, variant).
    """
    if df.empty:
        return

    final_step = pick_final_step(df)
    at_final = df["step"] == final_step

    original_only = df[at_final & (df["eval_type"] == "Original")].copy()
    if not original_only.empty:
        table = original_only.groupby(["hno", "variant"], as_index=False)["accuracy"].mean()
        _entropy_effect_bar_chart(
            table,
            f"Q1 Entropy Effect — Final step={final_step} — Original eval only",
            out_dir,
            f"Q1_entropy_effect_finalstep_{final_step}_original",
        )

    all_sets = df[at_final].copy()
    if not all_sets.empty:
        per_config = all_sets.groupby(["config", "hno", "variant"], as_index=False)["accuracy"].mean()
        table = per_config.groupby(["hno", "variant"], as_index=False)["accuracy"].mean()
        _entropy_effect_bar_chart(
            table,
            f"Q1 Entropy Effect — Final step={final_step} — Mean over all eval sets",
            out_dir,
            f"Q1_entropy_effect_finalstep_{final_step}_overall",
        )


def _entropy_effect_bar_chart(table: pd.DataFrame, title: str, out_dir: str, name: str) -> None:
    """Draw and save one grouped bar chart (x: HNO level, bars: variants) from ``table``."""
    hnos = ["HNO1", "HNO2", "HNO3"]
    variants = ["0-shot", "CoT", "Fake CoT"]
    x = np.arange(len(hnos))
    width = 0.25

    fig, ax = plt.subplots()
    for j, variant in enumerate(variants):
        heights = []
        for h in hnos:
            cell = table.loc[(table["hno"] == h) & (table["variant"] == variant), "accuracy"]
            # A missing (hno, variant) combination is drawn as a gap.
            heights.append(np.nan if cell.empty else float(cell.iloc[0]))
        # (j - 1) centres the three variant bars on each HNO tick.
        ax.bar(x + (j - 1) * width, heights, width=width, label=variant)

    ax.set_title(title)
    ax.set_xlabel("Training entropy level (HNO)")
    ax.set_ylabel("Accuracy")
    ax.set_xticks(x)
    ax.set_xticklabels(hnos)
    ax.set_ylim(0.0, 1.0)
    ax.grid(True, axis="y", linewidth=0.5, alpha=0.5)
    ax.legend(loc="lower right", fontsize=9)

    save_fig(fig, out_dir, name)
| |
|
| |
|
def fig_label_structure_effect(df: pd.DataFrame, out_dir: str) -> None:
    """
    Q3: within each HNO level, plot accuracy vs step with one line per variant.

    Accuracy is averaged over all rows sharing (hno, variant, step), which
    keeps each figure to one line per variant instead of one per eval set.
    Figures are saved as ``Q3_label_structure_over_steps_<hno>``.
    """
    if df.empty:
        return

    means = df.groupby(["hno", "variant", "step"], as_index=False)["accuracy"].mean()

    for hno in ("HNO1", "HNO2", "HNO3"):
        per_hno = means[means["hno"] == hno].copy()
        if per_hno.empty:
            continue

        fig, ax = plt.subplots()
        for variant, curve in per_hno.groupby("variant"):
            curve = curve.sort_values("step")
            ax.plot(curve["step"], curve["accuracy"], marker="o", linewidth=1, label=variant)

        ax.set_title(f"Q3 Label Context Structure — {hno} — Mean over all eval sets")
        ax.set_xlabel("Training step (checkpoint)")
        ax.set_ylabel("Accuracy")
        ax.set_ylim(0.0, 1.0)
        ax.grid(True, linewidth=0.5, alpha=0.5)
        ax.legend(loc="lower right", fontsize=9)

        save_fig(fig, out_dir, f"Q3_label_structure_over_steps_{hno}")
| |
|
| |
|
def fig_eval_hardness_final(df: pd.DataFrame, out_dir: str) -> None:
    """
    Q2: mean accuracy per eval type at the final step, as bar charts.

    Produces one chart averaged over all rows at the final step, plus one
    chart per HNO level that has rows. Only eval types actually present in
    the data get a bar.
    """
    if df.empty:
        return

    final_step = pick_final_step(df)
    d = df[df["step"] == final_step]
    if d.empty:
        return

    _eval_hardness_bar_chart(
        d,
        f"Q2 Eval Hardness — Final step={final_step} — Mean across ALL configs",
        out_dir,
        f"Q2_eval_hardness_finalstep_{final_step}_allconfigs",
    )

    for hno in ["HNO1", "HNO2", "HNO3"]:
        dh = d[d["hno"] == hno]
        if dh.empty:
            continue
        _eval_hardness_bar_chart(
            dh,
            f"Q2 Eval Hardness — {hno} — Final step={final_step} — Mean across configs",
            out_dir,
            f"Q2_eval_hardness_finalstep_{final_step}_{hno}",
        )


def _eval_hardness_bar_chart(rows: pd.DataFrame, title: str, out_dir: str, name: str) -> None:
    """Draw and save one bar chart of mean accuracy per eval type for ``rows``."""
    # observed=True pins the result regardless of the pandas default: when
    # eval_type is categorical, observed=False would add an all-NaN row (an
    # empty, labelled slot) for every category missing from ``rows``.
    g = (
        rows.groupby(["eval_type"], as_index=False, observed=True)["accuracy"]
        .mean()
        .sort_values("eval_type")
    )

    fig = plt.figure(figsize=(11, 5))
    ax = fig.add_subplot(1, 1, 1)
    x = np.arange(len(g))
    ax.bar(x, g["accuracy"].to_numpy())
    ax.set_title(title)
    ax.set_xlabel("Eval set type")
    ax.set_ylabel("Accuracy")
    ax.set_ylim(0.0, 1.0)
    ax.set_xticks(x)
    ax.set_xticklabels([str(v) for v in g["eval_type"].tolist()], rotation=35, ha="right")
    ax.grid(True, axis="y", linewidth=0.5, alpha=0.5)

    save_fig(fig, out_dir, name)
| |
|
| |
|
def fig_training_accuracy_proxy(df: pd.DataFrame, out_dir: str) -> None:
    """
    Plot accuracy on the "Original" eval type vs step, used as a stand-in for
    training accuracy.

    Produces one figure with a line per config, then one figure per HNO level
    with a line per variant. Nothing is drawn when there are no "Original"
    rows. (If a true train-set evaluation exists elsewhere, the same plots
    can be produced by pointing the script at those results.)
    """
    if df.empty:
        return

    original = df[df["eval_type"] == "Original"].copy()
    if original.empty:
        return

    # Figure 1: every config on a single set of axes.
    fig, ax = plt.subplots(figsize=(10, 6))
    for cfg, rows in original.groupby("config"):
        curve = rows.groupby("step", as_index=False)["accuracy"].mean()
        curve = curve.sort_values("step")
        ax.plot(curve["step"], curve["accuracy"], marker="o", linewidth=1, label=cfg)
    ax.set_title("Training-Accuracy Proxy — Original eval only — per config")
    ax.set_xlabel("Training step (checkpoint)")
    ax.set_ylabel("Accuracy")
    ax.set_ylim(0.0, 1.0)
    ax.grid(True, linewidth=0.5, alpha=0.5)
    ax.legend(loc="lower right", fontsize=8, ncol=3)
    save_fig(fig, out_dir, "Training_accuracy_proxy_original_per_config")

    # Figures 2..n: one per HNO level, variants compared within the level.
    means = original.groupby(["hno", "variant", "step"], as_index=False)["accuracy"].mean()
    for hno in ("HNO1", "HNO2", "HNO3"):
        per_hno = means[means["hno"] == hno].copy()
        if per_hno.empty:
            continue
        fig, ax = plt.subplots()
        for variant, curve in per_hno.groupby("variant"):
            curve = curve.sort_values("step")
            ax.plot(curve["step"], curve["accuracy"], marker="o", linewidth=1, label=variant)
        ax.set_title(f"Training-Accuracy Proxy — {hno} — Original eval only")
        ax.set_xlabel("Training step (checkpoint)")
        ax.set_ylabel("Accuracy")
        ax.set_ylim(0.0, 1.0)
        ax.grid(True, linewidth=0.5, alpha=0.5)
        ax.legend(loc="lower right", fontsize=9)
        save_fig(fig, out_dir, f"Training_accuracy_proxy_original_{hno}")
| |
|
| |
|
def export_summary_tables(df: pd.DataFrame, out_dir: str) -> None:
    """
    Write CSV summaries for the report into ``out_dir``.

    Files written:
      - ``summary_long.csv``: the long dataframe as-is;
      - ``finalstep_<s>_pivot_config_by_evaltype.csv``: mean accuracy at the
        final step, rows = (config, hno, variant), columns = eval_type;
      - ``finalstep_<s>_pivot_hno_variant_by_evaltype.csv``: the same with
        rows = (hno, variant).

    Does nothing when ``df`` is empty.
    """
    if df.empty:
        return
    ensure_dir(out_dir)

    df.to_csv(os.path.join(out_dir, "summary_long.csv"), index=False)

    final_step = pick_final_step(df)
    d = df[df["step"] == final_step]

    # pivot_table already averages rows sharing the same index/column keys, so
    # no separate groupby pass is needed. observed=True avoids expanding a
    # categorical eval_type into every key combination before the NaN-only
    # rows are dropped again.
    pivot1 = d.pivot_table(
        index=["config", "hno", "variant"],
        columns="eval_type",
        values="accuracy",
        aggfunc="mean",
        observed=True,
    )
    pivot1.to_csv(os.path.join(out_dir, f"finalstep_{final_step}_pivot_config_by_evaltype.csv"))

    pivot2 = d.pivot_table(
        index=["hno", "variant"],
        columns="eval_type",
        values="accuracy",
        aggfunc="mean",
        observed=True,
    )
    pivot2.to_csv(os.path.join(out_dir, f"finalstep_{final_step}_pivot_hno_variant_by_evaltype.csv"))
| |
|
| |
|
def main():
    """
    Command-line entry point: build the long results table, then write the
    CSV summaries and every figure into ``--out_dir``.

    Exits with a message when no valid config letter is selected or when no
    results could be loaded from ``--base_dir``.
    """
    ap = argparse.ArgumentParser()
    ap.add_argument("--base_dir", type=str, default="/workspace/v121rc_exp1", help="Base exp dir containing A..I")
    ap.add_argument("--out_dir", type=str, default="/workspace/v121rc_exp1/FIGS", help="Where to save figures")
    ap.add_argument(
        "--configs",
        type=str,
        default="ABCDEFGHI",
        help="Which configs to include, e.g. ABC or ABCDEFGHI",
    )
    args = ap.parse_args()

    # Keep known config letters only, in the order given. dict.fromkeys drops
    # repeats so a letter typed twice does not load its rows twice and skew
    # the cross-config means.
    configs = list(dict.fromkeys(c for c in args.configs if c in CONFIG_META))
    if not configs:
        raise SystemExit("No valid configs selected. Use --configs like ABCDEFGHI.")

    df = build_long_dataframe(args.base_dir, configs)
    if df.empty:
        # Report the directory and configs actually searched, not the defaults.
        raise SystemExit(
            f"No results found. Check that {args.base_dir}/{{{','.join(configs)}}} contain "
            "'*_results.json' with 'step_<n>' fields."
        )

    ensure_dir(args.out_dir)

    # Tables first, then the summary figures.
    export_summary_tables(df, args.out_dir)

    fig_training_accuracy_proxy(df, args.out_dir)
    fig_scaling_curves_overall(df, args.out_dir)
    fig_label_structure_effect(df, args.out_dir)
    fig_entropy_effect_final(df, args.out_dir)
    fig_eval_hardness_final(df, args.out_dir)

    # One figure per config; the most numerous output.
    fig_scaling_curves_by_eval_type(df, args.out_dir)

    print(f"Done. Figures + CSV summaries saved to: {args.out_dir}")


if __name__ == "__main__":
    main()
| |
|