Spaces:

agarwalanu3103
/

clarify-rl

Running

File size: 13,536 Bytes

#!/usr/bin/env python
"""Hackathon-narrative comparison plots that go beyond `make_plots.py`.

Given the eval JSONs that `refresh_all_plots.sh` downloads from each model
repo, this script renders three artefacts targeted at judges:

1. ``06_same_base_delta.png`` — per-family delta (GRPO − base) for each
   model size, exposing where RL helps vs. hurts at each scale. This is the
   most important hackathon plot: it tells the "scale-dependent training
   response" story directly.

2. ``07_runs_summary_table.png`` — clean text table of every run's
   aggregate score, format pass rate, and per-family numbers. Ships as a
   PNG so it can drop straight into the README.

3. ``runs_summary.json`` — machine-readable version of the same table for
   downstream tooling (the blog post inlines it).

Inputs are auto-discovered from ``outputs/run_artifacts/`` so the script
stays in lock-step with whatever has actually been pushed to the Hub by
``refresh_all_plots.sh``. Anything that isn't there yet (e.g. Run 3 / Run
4 evals while training is still in flight) is just omitted from the
matrix — every plot degrades gracefully.

Why this lives outside ``make_plots.py``: ``make_plots.py`` is the
generic per-eval comparison primitive; ``compare_runs.py`` is the
opinionated, run-aware orchestrator that knows the relationship between
the runs (same model size → "delta", different model sizes → side-by-
side).
"""

from __future__ import annotations

import argparse
import json
import statistics
from collections import defaultdict
from dataclasses import dataclass
from pathlib import Path
from typing import Iterable


@dataclass(frozen=True)
class RunSpec:
    """One row of the comparison table.

    ``base_label`` cross-references the matching base entry by ``label`` so
    the same-base delta plot can pair them. ``base_label=None`` means this
    row IS itself a base.

    Instances are frozen, so a spec cannot be mutated after construction.
    """

    # Display name of the run; also the key that other specs reference via
    # ``base_label`` and the key used for the loaded-entries dict in ``main``.
    label: str
    # Either an eval JSON file or a directory of ``eval_*.json`` files; the
    # directory case is resolved by ``_resolve_eval_path``.
    eval_path: Path
    # ``label`` of the base run this one is compared against, or ``None``
    # when this row is a base itself.
    base_label: str | None = None
    # Colour passed to ``ax.bar(color=...)`` for this run's bars.
    color: str = "tab:blue"


# Ordered for legend stability in plots and rows in the summary table.
# ``eval_path`` may be a concrete file or a directory; directories are
# resolved by ``_resolve_eval_path`` to the most recently modified
# ``eval_*.json`` inside. Entries whose path cannot be resolved are skipped
# by ``main`` rather than treated as errors.
RUN_SPECS: list[RunSpec] = [
    # --- 0.6B: base + one trained run ---
    RunSpec(
        label="0.6B base",
        eval_path=Path("outputs/run_artifacts/v4/evals/eval_qwen3-0.6b_n50_v4.json"),
        color="tab:gray",
    ),
    RunSpec(
        label="Probe (0.6B, β=0)",
        eval_path=Path("outputs/run_artifacts/v4/evals/eval_clarify-rl-grpo-qwen3-0-6b_n50_v4.json"),
        base_label="0.6B base",
        color="tab:blue",
    ),
    # --- 1.7B: base + four trained runs, all paired with "1.7B base" ---
    RunSpec(
        label="1.7B base",
        eval_path=Path("outputs/run_artifacts/v4/evals/eval_qwen3-1.7b_n50_v4.json"),
        color="dimgray",
    ),
    RunSpec(
        label="Drift (1.7B, β=0)",
        eval_path=Path("outputs/run_artifacts/1.7B/evals/eval_clarify-rl-grpo-qwen3-1-7b_n50.json"),
        base_label="1.7B base",
        color="tab:orange",
    ),
    RunSpec(
        label="Anchor (1.7B, β=0.2)",
        # Auto-resolved from outputs/run_artifacts/1.7B-KL/evals/<latest>.json
        eval_path=Path("outputs/run_artifacts/1.7B-KL/evals"),
        base_label="1.7B base",
        color="tab:green",
    ),
    RunSpec(
        label="Restrain (1.7B, β=1.0)",
        # Directory: newest eval_*.json inside is used.
        eval_path=Path("outputs/run_artifacts/1.7B-Run6/evals"),
        base_label="1.7B base",
        color="#0d47a1",
    ),
    RunSpec(
        label="Champion (1.7B, β=0.3)",
        # Directory: newest eval_*.json inside is used.
        eval_path=Path("outputs/run_artifacts/1.7B-Run7/evals"),
        base_label="1.7B base",
        color="#ff6f00",
    ),
    # --- 4B: base + one trained run ---
    RunSpec(
        label="4B base",
        # Directory: newest eval_*.json inside is used.
        eval_path=Path("outputs/run_artifacts/4B-base/evals"),
        color="darkgray",
    ),
    RunSpec(
        label="4B GRPO (Run 3)",
        # Directory: newest eval_*.json inside is used.
        eval_path=Path("outputs/run_artifacts/4B/evals"),
        base_label="4B base",
        color="tab:purple",
    ),
    # No ``base_label`` and nothing references it, so this row appears in the
    # summary table only and never in the delta plot.
    RunSpec(
        label="4B-instruct",
        eval_path=Path("outputs/eval_qwen3-4b-instruct_n50_v4.json"),
        color="tab:red",
    ),
]


def _resolve_eval_path(spec: RunSpec) -> Path | None:
    """If ``spec.eval_path`` is a directory, pick the most-recently-modified
    eval JSON inside it. Otherwise return the file as-is. Missing → None.
    """
    p = spec.eval_path
    if not p.exists():
        return None
    if p.is_file():
        return p
    if p.is_dir():
        candidates = sorted(
            p.glob("eval_*.json"),
            key=lambda f: f.stat().st_mtime,
            reverse=True,
        )
        return candidates[0] if candidates else None
    return None


def _load_summary_and_results(path: Path) -> tuple[dict, list[dict]]:
    data = json.loads(path.read_text())
    return data.get("summary", {}), data.get("results", [])


def _per_family_means(results: list[dict]) -> dict[str, float]:
    """Mean final_score per task family. Treats unknown family as ``"?"``."""
    by_fam: dict[str, list[float]] = defaultdict(list)
    for r in results:
        by_fam[r.get("family", "?")].append(float(r.get("final_score", 0.0)))
    return {fam: statistics.mean(scores) if scores else 0.0 for fam, scores in by_fam.items()}


def _per_family_max(results: list[dict]) -> dict[str, float]:
    by_fam: dict[str, list[float]] = defaultdict(list)
    for r in results:
        by_fam[r.get("family", "?")].append(float(r.get("final_score", 0.0)))
    return {fam: max(scores) if scores else 0.0 for fam, scores in by_fam.items()}


# ---------------------------------------------------------------------------
# Plot 6 — same-base delta chart
# ---------------------------------------------------------------------------


def _all_families(specs: dict[str, dict]) -> list[str]:
    fams: set[str] = set()
    for entry in specs.values():
        fams.update(entry["family_means"].keys())
    return sorted(fams)


def _delta_panel(ax, specs: dict[str, dict], pairs, families, metric_key: str, ylabel: str, title: str) -> None:
    n_families = len(families)
    n_pairs = len(pairs)
    width = 0.8 / max(1, n_pairs)
    x = list(range(n_families))
    for i, (trained_label, entry) in enumerate(pairs):
        base_entry = specs[entry["base_label"]]
        delta = {
            fam: entry[metric_key].get(fam, 0.0) - base_entry[metric_key].get(fam, 0.0)
            for fam in families
        }
        ax.bar(
            [xi + (i - (n_pairs - 1) / 2) * width for xi in x],
            [delta[fam] for fam in families],
            width=width,
            label=trained_label,
            color=entry["color"],
            edgecolor="black",
            linewidth=0.5,
        )
    ax.axhline(0.0, color="black", lw=0.8)
    ax.set_xticks(x)
    ax.set_xticklabels(families, rotation=15, ha="right")
    ax.set_ylabel(ylabel)
    ax.set_title(title)
    ax.grid(alpha=0.3, axis="y")


def plot_same_base_delta(specs: dict[str, dict], out_path: Path) -> None:
    """Plot Δ = trained − base for every (trained, base) pair and save it.

    The figure has two panels sharing the same families on the x axis:
    the left one compares per-family means (average behaviour), the right
    one per-family maxima (peak capability). Showing both makes it visible
    when a run's mean drops in a family while its max rises.

    A run takes part only when it names a ``base_label`` that is itself
    present in ``specs``. With no such pair the function prints a skip
    notice and returns without importing matplotlib or writing a file.
    """
    pairs = []
    for label, entry in specs.items():
        base = entry["base_label"]
        if base and base in specs:
            pairs.append((label, entry))
    if not pairs:
        print("[skip] same-base delta — no trained vs base pairs available yet")
        return

    # Imported lazily so the skip path above works without matplotlib.
    import matplotlib.pyplot as plt

    families = _all_families(specs)
    fig_size = (max(13, len(families) * 2.0), 5.5)
    fig, (ax_mean, ax_max) = plt.subplots(1, 2, figsize=fig_size, sharey=False)

    # (axes, metric key, y-axis label, title) for the two panels, left to right.
    panels = (
        (
            ax_mean,
            "family_means",
            "Δ avg score (GRPO − same-size base)",
            "(a) Average behaviour\npositive = GRPO consistently helps, negative = regression",
        ),
        (
            ax_max,
            "family_max",
            "Δ max score (GRPO − same-size base)",
            "(b) Peak capability\npositive = GRPO unlocks higher ceiling on at least 1 scenario",
        ),
    )
    for ax, metric_key, ylabel, title in panels:
        _delta_panel(ax, specs, pairs, families, metric_key=metric_key, ylabel=ylabel, title=title)

    # Both panels plot the same runs, so one shared legend is enough.
    handles, labels = ax_mean.get_legend_handles_labels()
    fig.legend(
        handles,
        labels,
        loc="lower center",
        ncol=min(4, len(pairs)),
        fontsize=9,
        bbox_to_anchor=(0.5, -0.02),
    )
    fig.suptitle("Where GRPO helps vs. hurts, per task family", fontsize=13)
    fig.tight_layout(rect=[0, 0.05, 1, 0.96])
    fig.savefig(out_path, dpi=160, bbox_inches="tight")
    plt.close(fig)
    print(f"[ok] {out_path}")


# ---------------------------------------------------------------------------
# Plot 7 — summary table as PNG
# ---------------------------------------------------------------------------


def render_summary_table(specs: dict[str, dict], out_path: Path) -> dict:
    """Render the runs summary as a PNG (suitable for README embed) AND
    return the same data as a dict so the JSON sibling can be written.

    ``specs`` maps run label → loaded entry; each entry must provide
    ``summary`` (dict), ``family_means`` and ``family_max`` (family → score).
    The PNG is written to ``out_path``.

    The returned dict has the shape ``{"families": [...], "rows": [...]}``.
    Each row carries ``label``, ``model``, ``n``, ``avg_score``,
    ``format_pass_rate``, ``completion_rate`` plus one ``fam_<family>`` and
    one ``max_<family>`` value per family. The PNG shows only the label,
    n, average, format pass rate and the per-family means.
    """
    families = _all_families(specs)

    rows: list[dict] = []
    for label, entry in specs.items():
        s = entry["summary"]
        row = {
            "label": label,
            "model": s.get("model", "?"),
            "n": s.get("scenarios_total", "?"),
            # ``or 0.0`` maps an explicit null to zero before ``float``; all
            # three rate/score fields are treated the same way.
            "avg_score": float(s.get("avg_score", 0.0) or 0.0),
            "format_pass_rate": float(s.get("format_pass_rate", 0.0) or 0.0),
            "completion_rate": float(s.get("completion_rate", 0.0) or 0.0),
            **{f"fam_{fam}": entry["family_means"].get(fam, 0.0) for fam in families},
            **{f"max_{fam}": entry["family_max"].get(fam, 0.0) for fam in families},
        }
        rows.append(row)

    summary = {"families": families, "rows": rows}

    # Render text as PNG. Imported lazily, like the other plot function.
    import matplotlib.pyplot as plt

    fixed_headers = ["Run", "n", "avg", "fmt%"]
    headers = fixed_headers + list(families)
    body: list[list[str]] = []
    for row in rows:
        body.append([
            row["label"],
            str(row["n"]),
            f"{row['avg_score']:.4f}",
            f"{row['format_pass_rate'] * 100:.0f}%",
            *[f"{row[f'fam_{fam}']:.3f}" for fam in families],
        ])

    n_cols = len(headers)
    n_rows = len(body) + 1  # +1 for the header row
    # Column widths are proportional to the longest text in each column.
    char_widths = [max(len(headers[c]), max((len(b[c]) for b in body), default=0)) for c in range(n_cols)]
    total_chars = sum(char_widths)
    rel_widths = [w / total_chars for w in char_widths]
    fig_width = max(13, total_chars * 0.13)

    fig, ax = plt.subplots(figsize=(fig_width, 0.6 * n_rows + 0.8))
    ax.axis("off")
    table = ax.table(
        cellText=body,
        colLabels=headers,
        loc="center",
        cellLoc="center",
        colLoc="center",
        colWidths=rel_widths,
    )
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1.0, 1.5)
    for c in range(n_cols):
        table[(0, c)].set_facecolor("#cccccc")
        table[(0, c)].set_text_props(weight="bold")

    # Highlight the winning row(s) in each family column. Family columns
    # start right after the fixed leading columns; table row 0 is the header.
    for c, fam in enumerate(families, start=len(fixed_headers)):
        col_vals = [row[f"fam_{fam}"] for row in rows]
        if not col_vals:
            continue
        max_v = max(col_vals)
        if max_v <= 0:
            # Nobody scored in this family, so there is no winner to mark.
            continue
        for r, row in enumerate(rows, start=1):
            if abs(row[f"fam_{fam}"] - max_v) < 1e-9:
                table[(r, c)].set_facecolor("#cdeac0")  # green
                table[(r, c)].set_text_props(weight="bold")

    ax.set_title("ClarifyRL — per-run × per-family scoreboard (n=50, eval v4)\nGreen cell = best score in that family", pad=14)
    fig.tight_layout()
    fig.savefig(out_path, dpi=160, bbox_inches="tight")
    plt.close(fig)
    print(f"[ok] {out_path}")

    return summary


# ---------------------------------------------------------------------------
# main
# ---------------------------------------------------------------------------


def main(argv: Iterable[str] | None = None) -> None:
    """Command-line entry point.

    Loads every run in ``RUN_SPECS`` whose eval JSON can be resolved, then
    writes the same-base delta plot, the summary-table PNG and
    ``runs_summary.json`` into ``--out-dir`` (default ``plots``). Runs
    without an eval JSON are reported and skipped; when none can be loaded
    at all, an error line is printed and nothing is plotted.

    ``argv`` overrides the process arguments when given.
    """
    parser = argparse.ArgumentParser(description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter)
    parser.add_argument("--out-dir", default="plots", help="Directory for PNGs + runs_summary.json")
    cli_args = None if argv is None else list(argv)
    args = parser.parse_args(cli_args)

    out_dir = Path(args.out_dir)
    out_dir.mkdir(parents=True, exist_ok=True)

    # Label → loaded entry, in RUN_SPECS order.
    loaded: dict[str, dict] = {}
    for spec in RUN_SPECS:
        eval_file = _resolve_eval_path(spec)
        if eval_file is None:
            print(f"[skip] {spec.label}: no eval JSON yet at {spec.eval_path}")
            continue
        run_summary, run_results = _load_summary_and_results(eval_file)
        loaded[spec.label] = {
            "summary": run_summary,
            "family_means": _per_family_means(run_results),
            "family_max": _per_family_max(run_results),
            "base_label": spec.base_label,
            "color": spec.color,
            "eval_path": str(eval_file),
        }

    if not loaded:
        print("[err] no eval JSONs found at all — nothing to plot")
        return

    print(f"\n[load] {len(loaded)} eval JSON(s):")
    for run_label, run_entry in loaded.items():
        print(f"  - {run_label}: {run_entry['eval_path']}")
    print()

    plot_same_base_delta(loaded, out_dir / "06_same_base_delta.png")
    table_data = render_summary_table(loaded, out_dir / "07_runs_summary_table.png")

    json_path = out_dir / "runs_summary.json"
    json_path.write_text(json.dumps(table_data, indent=2))
    print(f"[ok] {json_path}")
    print()
    print(f"All comparison artifacts written to: {out_dir.resolve()}")


# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()