clarify-rl / scripts /refresh_all_plots.sh
agarwalanu3103's picture
plots: add training progression + diagnostics, drop W&B links
099bec8 verified
#!/usr/bin/env bash
# Download all eval JSONs + log_history files from the Hub, then regenerate
# the 6 submission plots with the same-base before/after comparison.
#
# Usage:
# HF_TOKEN=hf_... (Acct 1: agarwalanu3103) used by default
# HF_TOKEN_KANAN=hf_... (Acct 2: Kanan2005, owns Run 3 repo) — optional
# HF_TOKEN_MNIT=hf_... (Acct 3: 2022uec1542, owns Run 4 repo) — optional
# ./scripts/refresh_all_plots.sh
#
# Repos covered:
# Run 1: agarwalanu3103/clarify-rl-grpo-qwen3-0-6b — 0.6B GRPO (beta=0, LR=1e-6)
# Run 2: agarwalanu3103/clarify-rl-grpo-qwen3-1-7b — 1.7B GRPO (beta=0, LR=1e-6)
# Run 3: Kanan2005/clarify-rl-grpo-qwen3-4b — 4B GRPO (beta=0, LR=1e-6)
# Run 4: 2022uec1542/clarify-rl-grpo-qwen3-1-7b — 1.7B GRPO (beta=0.2, LR=5e-7) — KL-anchored
# Strict mode: stop on the first failing command, on any unset variable, and
# on a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# The primary token is mandatory; abort with a clear message when it is
# missing or empty.
: "${HF_TOKEN:?HF_TOKEN required (Acct 1: agarwalanu3103)}"

# Downloaded artifacts go under $OUT; plots/ is created alongside it.
OUT=outputs/run_artifacts
mkdir -p "${OUT}" plots

# Activate the project virtualenv (path is relative to the current directory).
# shellcheck disable=SC1091
. .venv/bin/activate
# Pull eval JSONs and training logs from every run's Hub repo.
# Each repo is read with its owner's token, handed to the Python snippet via
# the environment. The Kanan/MNIT tokens are optional: a repo whose token is
# empty is skipped with a "[skip]" notice rather than treated as an error.
HF_TOKEN_KANAN="${HF_TOKEN_KANAN:-}" \
HF_TOKEN_MNIT="${HF_TOKEN_MNIT:-}" \
HF_TOKEN_AGARWAL="${HF_TOKEN}" \
python <<'PY'
import os
import json
import truststore; truststore.inject_into_ssl()
from huggingface_hub import HfApi, hf_hub_download
from pathlib import Path

artifact_root = Path("outputs/run_artifacts")
artifact_root.mkdir(parents=True, exist_ok=True)

# Map: short label → (repo_id, env-var name holding the token for that repo)
REPOS = {
    "0.6B": ("agarwalanu3103/clarify-rl-grpo-qwen3-0-6b", "HF_TOKEN_AGARWAL"),
    "1.7B": ("agarwalanu3103/clarify-rl-grpo-qwen3-1-7b", "HF_TOKEN_AGARWAL"),
    "4B": ("Kanan2005/clarify-rl-grpo-qwen3-4b", "HF_TOKEN_KANAN"),
    "1.7B-KL": ("2022uec1542/clarify-rl-grpo-qwen3-1-7b", "HF_TOKEN_MNIT"),
}

# Top-level files fetched in addition to everything under evals/.
TOP_LEVEL_FILES = ("log_history.json", "training_summary.json")


def is_wanted(name):
    """Return True for repo files this script downloads."""
    return name.startswith("evals/") or name in TOP_LEVEL_FILES


for label, (repo_id, token_env) in REPOS.items():
    token = os.environ.get(token_env) or ""
    if not token:
        print(f"[skip] {label} {repo_id}: {token_env} not set")
        continue

    api = HfApi(token=token)
    try:
        listing = api.list_repo_files(repo_id)
    except Exception as exc:
        # Best effort: a repo that cannot be listed is reported and skipped.
        print(f"[skip] {repo_id}: {exc}")
        continue

    for name in filter(is_wanted, listing):
        try:
            # Files land under outputs/run_artifacts/<label>/<name>.
            hf_hub_download(
                repo_id=repo_id,
                filename=name,
                token=token,
                local_dir=str(artifact_root / label),
            )
            print(f"[ok] {label}/{name}")
        except Exception as exc:
            # A single failed file is reported; the remaining files still run.
            print(f"[err] {label}/{name}: {exc}")
PY
# Show every JSON artifact now on disk, sorted, as a quick sanity check.
printf '\n%s\n' "Files now under outputs/run_artifacts:"
find outputs/run_artifacts -name '*.json' -print | sort
# Build the eval-flag list for whatever was actually downloaded.
# Accumulates `--eval "<label>=<path>"` argument pairs.
EVAL_FLAGS=()

#######################################
# Register an eval JSON, but only when it exists on disk.
# Globals:   EVAL_FLAGS (appended to)
# Arguments: $1 - path to the eval JSON
#            $2 - label to attach to it
# Outputs:   one " + <label>: <path>" line on stdout per registered file
# Returns:   0 when the file is absent (nothing is registered or printed)
#######################################
add_if() {
  [[ -f "$1" ]] || return 0
  EVAL_FLAGS+=("--eval" "$2=$1")
  printf ' + %s: %s\n' "$2" "$1"
}
printf '\n%s\n' "Building eval list:"

# Fixed-location evals as consecutive (path, label) pairs. The order here is
# the order in which the --eval flags are emitted.
fixed_evals=(
  "outputs/eval_policy_v4.json"
  "policy (deterministic)"
  "outputs/run_artifacts/v4/evals/eval_qwen3-0.6b_n50_v4.json"
  "0.6B base"
  "outputs/run_artifacts/v4/evals/eval_clarify-rl-grpo-qwen3-0-6b_n50_v4.json"
  "0.6B GRPO (Run 1)"
  "outputs/run_artifacts/v4/evals/eval_qwen3-1.7b_n50_v4.json"
  "1.7B base"
  "outputs/run_artifacts/1.7B/evals/eval_clarify-rl-grpo-qwen3-1-7b_n50.json"
  "1.7B GRPO no-KL (Run 2)"
  "outputs/run_artifacts/4B-base/evals/eval_qwen3-4b_qwen3-4b-base_n50_v4.json"
  "4B base"
)
for ((idx = 0; idx < ${#fixed_evals[@]}; idx += 2)); do
  add_if "${fixed_evals[idx]}" "${fixed_evals[idx + 1]}"
done

# Run 4 and Run 3 evals are matched by glob, so a newly downloaded JSON is
# picked up without editing this script. When nothing matches, the pattern
# stays literal, fails the -f check, and is skipped without any output.
for f in outputs/run_artifacts/1.7B-KL/evals/eval_*_n50_v4.json; do
  if [[ -f "$f" ]]; then
    add_if "$f" "1.7B GRPO +KL (Run 4)"
  fi
done
for f in outputs/run_artifacts/4B/evals/eval_*_n50_v4.json; do
  if [[ -f "$f" ]]; then
    add_if "$f" "4B GRPO (Run 3)"
  fi
done

add_if outputs/eval_qwen3-4b-instruct_n50_v4.json "4B-instruct"
# Accumulates `--log-history "<label>=<path>"` argument pairs.
LOG_FLAGS=()

# Append one --log-history pair: $1 is the curve label, $2 the JSON path.
_add_log() {
  LOG_FLAGS+=("--log-history" "$1=$2")
}

# Run 1: only the final log is considered.
if [[ -f outputs/run1_artifacts/log_history.json ]]; then
  _add_log "0.6B GRPO (Run 1, beta=0)" \
    outputs/run1_artifacts/log_history.json
fi

# Run 2: prefer the final log; otherwise fall back to the partial one.
if [[ -f outputs/run_artifacts/1.7B/log_history.json ]]; then
  _add_log "1.7B GRPO (Run 2, beta=0)" \
    outputs/run_artifacts/1.7B/log_history.json
elif [[ -f outputs/run2_artifacts/log_history_partial.json ]]; then
  _add_log "1.7B GRPO (Run 2, in progress)" \
    outputs/run2_artifacts/log_history_partial.json
fi

# Run 3, final log. Its partial fallback is handled last so the flags come
# out in the same relative order as before.
if [[ -f outputs/run_artifacts/4B/log_history.json ]]; then
  _add_log "4B GRPO (Run 3, beta=0)" \
    outputs/run_artifacts/4B/log_history.json
fi

# Run 4: final log if present. Until a run's final log_history.json is pushed
# to its repo, fall back to the partial JSON that monitor_training scrapes
# from live job logs, so the reward curves stay current even mid-training.
if [[ -f outputs/run_artifacts/1.7B-KL/log_history.json ]]; then
  _add_log "1.7B GRPO (Run 4, beta=0.2)" \
    outputs/run_artifacts/1.7B-KL/log_history.json
elif [[ -f outputs/run4_artifacts/log_history_partial.json ]]; then
  _add_log "1.7B GRPO (Run 4, beta=0.2 in-progress)" \
    outputs/run4_artifacts/log_history_partial.json
fi

# Run 3, partial fallback: used only while the final log is absent.
if [[ ! -f outputs/run_artifacts/4B/log_history.json &&
  -f outputs/run3_artifacts/log_history_partial.json ]]; then
  _add_log "4B GRPO (Run 3, beta=0 in-progress)" \
    outputs/run3_artifacts/log_history_partial.json
fi
echo
echo "Running make_plots.py:"
# Regenerate the plots from whatever logs and evals were collected.
# The ${arr[@]+"${arr[@]}"} form expands to nothing when the array is empty.
# A bare "${arr[@]}" on an empty array is an "unbound variable" error under
# `set -u` on bash older than 4.4 (e.g. the stock macOS bash 3.2), which would
# abort the script whenever no log or eval file is present.
python scripts/make_plots.py \
  ${LOG_FLAGS[@]+"${LOG_FLAGS[@]}"} \
  ${EVAL_FLAGS[@]+"${EVAL_FLAGS[@]}"} \
  --out-dir plots

# Also build the hackathon-narrative plots: per-family same-base delta and
# the 4-run summary table. These need richer logic than make_plots.py supports,
# so they live in compare_runs.py.
echo
echo "Running compare_runs.py:"
python scripts/compare_runs.py --out-dir plots