Spaces:
Sleeping
Sleeping
| # Download all eval JSONs + log_history files from the Hub, then regenerate | |
| # the 6 submission plots with the same-base before/after comparison. | |
| # | |
| # Usage: | |
| # HF_TOKEN=hf_... (Acct 1: agarwalanu3103) used by default | |
| # HF_TOKEN_KANAN=hf_... (Acct 2: Kanan2005, owns Run 3 repo) β optional | |
| # HF_TOKEN_MNIT=hf_... (Acct 3: 2022uec1542, owns Run 4 repo) β optional | |
| # ./scripts/refresh_all_plots.sh | |
| # | |
| # Repos covered: | |
| # Run 1: agarwalanu3103/clarify-rl-grpo-qwen3-0-6b β 0.6B GRPO (beta=0, LR=1e-6) | |
| # Run 2: agarwalanu3103/clarify-rl-grpo-qwen3-1-7b β 1.7B GRPO (beta=0, LR=1e-6) | |
| # Run 3: Kanan2005/clarify-rl-grpo-qwen3-4b β 4B GRPO (beta=0, LR=1e-6) | |
| # Run 4: 2022uec1542/clarify-rl-grpo-qwen3-1-7b β 1.7B GRPO (beta=0.2, LR=5e-7) β KL-anchored | |
| set -euo pipefail | |
| : "${HF_TOKEN:?HF_TOKEN required (Acct 1: agarwalanu3103)}" | |
| OUT=outputs/run_artifacts | |
| mkdir -p "$OUT" plots | |
| source .venv/bin/activate | |
| # Forward all three tokens into the python pull so each repo can be reached | |
| # with the right credentials. Kanan/MNIT are optional β we skip those repos | |
| # silently if not provided. | |
| HF_TOKEN_KANAN="${HF_TOKEN_KANAN:-}" \ | |
| HF_TOKEN_MNIT="${HF_TOKEN_MNIT:-}" \ | |
| HF_TOKEN_AGARWAL="${HF_TOKEN}" \ | |
| python <<'PY' | |
| import os | |
| import json | |
| import truststore; truststore.inject_into_ssl() | |
| from huggingface_hub import HfApi, hf_hub_download | |
| from pathlib import Path | |
| out = Path("outputs/run_artifacts") | |
| out.mkdir(parents=True, exist_ok=True) | |
| # Map: short label β (repo_id, env-var name holding the token for that repo) | |
| REPOS = { | |
| "0.6B": ("agarwalanu3103/clarify-rl-grpo-qwen3-0-6b", "HF_TOKEN_AGARWAL"), | |
| "1.7B": ("agarwalanu3103/clarify-rl-grpo-qwen3-1-7b", "HF_TOKEN_AGARWAL"), | |
| "4B": ("Kanan2005/clarify-rl-grpo-qwen3-4b", "HF_TOKEN_KANAN"), | |
| "1.7B-KL": ("2022uec1542/clarify-rl-grpo-qwen3-1-7b", "HF_TOKEN_MNIT"), | |
| } | |
| for size, (repo, token_var) in REPOS.items(): | |
| token = os.environ.get(token_var) or "" | |
| if not token: | |
| print(f"[skip] {size} {repo}: {token_var} not set") | |
| continue | |
| api = HfApi(token=token) | |
| try: | |
| files = api.list_repo_files(repo) | |
| except Exception as exc: | |
| print(f"[skip] {repo}: {exc}") | |
| continue | |
| for f in files: | |
| if f.startswith("evals/") or f in ("log_history.json", "training_summary.json"): | |
| try: | |
| local = hf_hub_download( | |
| repo_id=repo, | |
| filename=f, | |
| token=token, | |
| local_dir=str(out / size), | |
| ) | |
| print(f"[ok] {size}/{f}") | |
| except Exception as exc: | |
| print(f"[err] {size}/{f}: {exc}") | |
| PY | |
| echo | |
| echo "Files now under outputs/run_artifacts:" | |
| find outputs/run_artifacts -name '*.json' | sort | |
| # Build the eval-flag list for whatever was actually downloaded. | |
| EVAL_FLAGS=() | |
| add_if() { | |
| if [ -f "$1" ]; then | |
| EVAL_FLAGS+=(--eval "$2=$1") | |
| echo " + ${2}: ${1}" | |
| fi | |
| } | |
| echo | |
| echo "Building eval list:" | |
| add_if outputs/eval_policy_v4.json "policy (deterministic)" | |
| add_if outputs/run_artifacts/v4/evals/eval_qwen3-0.6b_n50_v4.json "0.6B base" | |
| add_if outputs/run_artifacts/v4/evals/eval_clarify-rl-grpo-qwen3-0-6b_n50_v4.json "0.6B GRPO (Run 1)" | |
| add_if outputs/run_artifacts/v4/evals/eval_qwen3-1.7b_n50_v4.json "1.7B base" | |
| add_if outputs/run_artifacts/1.7B/evals/eval_clarify-rl-grpo-qwen3-1-7b_n50.json "1.7B GRPO no-KL (Run 2)" | |
| add_if outputs/run_artifacts/4B-base/evals/eval_qwen3-4b_qwen3-4b-base_n50_v4.json "4B base" | |
| # After Run 4 finishes its first eval, refresh_all_plots.sh will pick up the new | |
| # JSON automatically; until then it's silently skipped. | |
| for f in outputs/run_artifacts/1.7B-KL/evals/eval_*_n50_v4.json; do | |
| [ -f "$f" ] && add_if "$f" "1.7B GRPO +KL (Run 4)" | |
| done | |
| for f in outputs/run_artifacts/4B/evals/eval_*_n50_v4.json; do | |
| [ -f "$f" ] && add_if "$f" "4B GRPO (Run 3)" | |
| done | |
| add_if outputs/eval_qwen3-4b-instruct_n50_v4.json "4B-instruct" | |
| LOG_FLAGS=() | |
| [ -f outputs/run1_artifacts/log_history.json ] && LOG_FLAGS+=(--log-history "0.6B GRPO (Run 1, beta=0)=outputs/run1_artifacts/log_history.json") | |
| [ -f outputs/run_artifacts/1.7B/log_history.json ] && LOG_FLAGS+=(--log-history "1.7B GRPO (Run 2, beta=0)=outputs/run_artifacts/1.7B/log_history.json") | |
| [ -f outputs/run2_artifacts/log_history_partial.json ] && [ ! -f outputs/run_artifacts/1.7B/log_history.json ] && LOG_FLAGS+=(--log-history "1.7B GRPO (Run 2, in progress)=outputs/run2_artifacts/log_history_partial.json") | |
| [ -f outputs/run_artifacts/4B/log_history.json ] && LOG_FLAGS+=(--log-history "4B GRPO (Run 3, beta=0)=outputs/run_artifacts/4B/log_history.json") | |
| [ -f outputs/run_artifacts/1.7B-KL/log_history.json ] && LOG_FLAGS+=(--log-history "1.7B GRPO (Run 4, beta=0.2)=outputs/run_artifacts/1.7B-KL/log_history.json") | |
| # Until each run's final log_history.json is pushed to its repo, fall back to | |
| # the partial JSON that monitor_training scrapes from live job logs. This way | |
| # the reward curves stay current even mid-training. | |
| [ -f outputs/run4_artifacts/log_history_partial.json ] && [ ! -f outputs/run_artifacts/1.7B-KL/log_history.json ] && LOG_FLAGS+=(--log-history "1.7B GRPO (Run 4, beta=0.2 in-progress)=outputs/run4_artifacts/log_history_partial.json") | |
| [ -f outputs/run3_artifacts/log_history_partial.json ] && [ ! -f outputs/run_artifacts/4B/log_history.json ] && LOG_FLAGS+=(--log-history "4B GRPO (Run 3, beta=0 in-progress)=outputs/run3_artifacts/log_history_partial.json") | |
| echo | |
| echo "Running make_plots.py:" | |
| python scripts/make_plots.py "${LOG_FLAGS[@]}" "${EVAL_FLAGS[@]}" --out-dir plots | |
| # Also build the hackathon-narrative plots: per-family same-base delta and | |
| # the 4-run summary table. These need richer logic than make_plots.py supports, | |
| # so they live in compare_runs.py. | |
| echo | |
| echo "Running compare_runs.py:" | |
| python scripts/compare_runs.py --out-dir plots | |