Spaces:

agarwalanu3103
/

clarify-rl

Sleeping

File size: 6,078 Bytes

099bec8

#!/usr/bin/env bash
# Download all eval JSONs + log_history files from the Hub, then regenerate
# the 6 submission plots with the same-base before/after comparison.
#
# Usage (run from the repository root — every path in this script is relative):
#   HF_TOKEN=hf_...        (Acct 1: agarwalanu3103, Run 1 + Run 2 repos) — required
#   HF_TOKEN_KANAN=hf_...  (Acct 2: Kanan2005, owns Run 3 repo) — optional
#   HF_TOKEN_MNIT=hf_...   (Acct 3: 2022uec1542, owns Run 4 repo) — optional
#   ./scripts/refresh_all_plots.sh
#
# Repos covered:
#   Run 1: agarwalanu3103/clarify-rl-grpo-qwen3-0-6b   — 0.6B GRPO (beta=0,    LR=1e-6)
#   Run 2: agarwalanu3103/clarify-rl-grpo-qwen3-1-7b   — 1.7B GRPO (beta=0,    LR=1e-6)
#   Run 3: Kanan2005/clarify-rl-grpo-qwen3-4b          — 4B   GRPO (beta=0,    LR=1e-6)
#   Run 4: 2022uec1542/clarify-rl-grpo-qwen3-1-7b      — 1.7B GRPO (beta=0.2,  LR=5e-7) — KL-anchored

# Strict mode: stop on a failing command, on use of an unset variable, and on
# a failure in any stage of a pipeline.
set -o errexit -o nounset -o pipefail

# The Acct 1 token is mandatory: abort with this message if it is unset/empty.
: "${HF_TOKEN:?HF_TOKEN required (Acct 1: agarwalanu3103)}"

# Create the artifact directory and the plot output directory if missing.
OUT=outputs/run_artifacts
mkdir -p "$OUT" plots

# Activate the virtualenv found in the current directory.
. .venv/bin/activate

# Hand each account's token to the embedded Python under its own variable
# name so every repo is fetched with the matching credentials. The Kanan and
# MNIT tokens are optional: when one is empty, the Python below prints a
# "[skip]" line for that repo and moves on.
HF_TOKEN_AGARWAL="${HF_TOKEN}" \
HF_TOKEN_KANAN="${HF_TOKEN_KANAN:-}" \
HF_TOKEN_MNIT="${HF_TOKEN_MNIT:-}" \
python <<'PY'
import json
import os
from pathlib import Path

# Inject truststore into the ssl module before the Hub client is imported.
import truststore
truststore.inject_into_ssl()

from huggingface_hub import HfApi, hf_hub_download

ARTIFACT_ROOT = Path("outputs/run_artifacts")
ARTIFACT_ROOT.mkdir(parents=True, exist_ok=True)

# label -> (Hub repo id, name of the env var that carries that repo's token).
# The label doubles as the sub-directory name under ARTIFACT_ROOT.
REPOS = {
    "0.6B":         ("agarwalanu3103/clarify-rl-grpo-qwen3-0-6b", "HF_TOKEN_AGARWAL"),
    "1.7B":         ("agarwalanu3103/clarify-rl-grpo-qwen3-1-7b", "HF_TOKEN_AGARWAL"),
    "4B":           ("Kanan2005/clarify-rl-grpo-qwen3-4b",        "HF_TOKEN_KANAN"),
    "1.7B-KL":      ("2022uec1542/clarify-rl-grpo-qwen3-1-7b",    "HF_TOKEN_MNIT"),
}

# Root-level files fetched in addition to everything under evals/.
ROOT_LEVEL_WANTED = ("log_history.json", "training_summary.json")


def is_wanted(name):
    """Return True for files under evals/ and for the root-level JSONs above."""
    return name.startswith("evals/") or name in ROOT_LEVEL_WANTED


for label, (repo_id, env_name) in REPOS.items():
    secret = os.environ.get(env_name, "")
    if not secret:
        print(f"[skip] {label} {repo_id}: {env_name} not set")
        continue

    client = HfApi(token=secret)
    try:
        listing = client.list_repo_files(repo_id)
    except Exception as err:
        # Best effort: a repo that cannot be listed is reported and skipped.
        print(f"[skip] {repo_id}: {err}")
        continue

    for remote_name in filter(is_wanted, listing):
        try:
            hf_hub_download(
                repo_id=repo_id,
                filename=remote_name,
                token=secret,
                local_dir=str(ARTIFACT_ROOT / label),
            )
            print(f"[ok]  {label}/{remote_name}")
        except Exception as err:
            # A single failed file is reported; the remaining files still run.
            print(f"[err] {label}/{remote_name}: {err}")
PY

# Print a sorted inventory of every JSON file under the artifact directory.
printf '\n%s\n' "Files now under outputs/run_artifacts:"
find outputs/run_artifacts -name '*.json' | sort

# Collects "--eval LABEL=PATH" argument pairs, one pair per eval JSON that is
# actually present on disk.
EVAL_FLAGS=()

#######################################
# Register an eval JSON when it exists as a regular file.
# Globals:   EVAL_FLAGS (appended to)
# Arguments: $1 - path to the eval JSON
#            $2 - label to attach to it
# Outputs:   "  + LABEL: PATH" on stdout for each registered file
# Returns:   0 whether or not the file exists
#######################################
add_if() {
    [[ -f "$1" ]] || return 0
    local json_path=$1 label=$2
    EVAL_FLAGS+=(--eval "${label}=${json_path}")
    printf '  + %s: %s\n' "$label" "$json_path"
}

printf '\n%s\n' "Building eval list:"

# Register every candidate path under one shared label. add_if ignores
# anything that is not a regular file, which also covers a glob that matched
# nothing and was left behind as a literal pattern.
#   $1   - label
#   $2.. - candidate paths
add_each() {
    local label=$1 candidate
    shift
    for candidate in "$@"; do
        add_if "$candidate" "$label"
    done
}

# Fixed-path evals. The order of these calls is the order of the --eval flags.
# NOTE(review): the v4/ and 4B-base/ directories are not written by the Hub
# pull in this script (it writes to run_artifacts/<label>/) — presumably they
# come from another step; confirm.
add_if outputs/eval_policy_v4.json \
    "policy (deterministic)"
add_if outputs/run_artifacts/v4/evals/eval_qwen3-0.6b_n50_v4.json \
    "0.6B base"
add_if outputs/run_artifacts/v4/evals/eval_clarify-rl-grpo-qwen3-0-6b_n50_v4.json \
    "0.6B GRPO (Run 1)"
add_if outputs/run_artifacts/v4/evals/eval_qwen3-1.7b_n50_v4.json \
    "1.7B base"
add_if outputs/run_artifacts/1.7B/evals/eval_clarify-rl-grpo-qwen3-1-7b_n50.json \
    "1.7B GRPO no-KL (Run 2)"
add_if outputs/run_artifacts/4B-base/evals/eval_qwen3-4b_qwen3-4b-base_n50_v4.json \
    "4B base"

# Run 4 and Run 3 evals are matched by glob, so a newly downloaded JSON is
# picked up without editing this script; while none exists, nothing is added.
add_each "1.7B GRPO +KL (Run 4)" outputs/run_artifacts/1.7B-KL/evals/eval_*_n50_v4.json
add_each "4B GRPO (Run 3)" outputs/run_artifacts/4B/evals/eval_*_n50_v4.json

add_if outputs/eval_qwen3-4b-instruct_n50_v4.json \
    "4B-instruct"

# Collects "--log-history LABEL=PATH" argument pairs. For Runs 2-4 the final
# log_history.json is preferred; when it is absent, the partial JSON (which
# monitor_training scrapes from live job logs) is used instead so the reward
# curves stay current mid-training. Append order is kept stable: Run 1,
# Run 2, Run 3 final, Run 4 (final or partial), Run 3 partial.
LOG_FLAGS=()

# Run 1 — final log only.
# NOTE(review): this reads outputs/run1_artifacts/, while the Hub pull in this
# script stores the 0.6B repo's files under outputs/run_artifacts/0.6B/ —
# confirm this location is intended.
if [[ -f outputs/run1_artifacts/log_history.json ]]; then
    LOG_FLAGS+=(--log-history "0.6B GRPO (Run 1, beta=0)=outputs/run1_artifacts/log_history.json")
fi

# Run 2 — final log, else the partial one.
if [[ -f outputs/run_artifacts/1.7B/log_history.json ]]; then
    LOG_FLAGS+=(--log-history "1.7B GRPO (Run 2, beta=0)=outputs/run_artifacts/1.7B/log_history.json")
elif [[ -f outputs/run2_artifacts/log_history_partial.json ]]; then
    LOG_FLAGS+=(--log-history "1.7B GRPO (Run 2, in progress)=outputs/run2_artifacts/log_history_partial.json")
fi

# Run 3 — final log (its partial fallback is appended last, see below).
if [[ -f outputs/run_artifacts/4B/log_history.json ]]; then
    LOG_FLAGS+=(--log-history "4B GRPO (Run 3, beta=0)=outputs/run_artifacts/4B/log_history.json")
fi

# Run 4 — final log, else the partial one.
if [[ -f outputs/run_artifacts/1.7B-KL/log_history.json ]]; then
    LOG_FLAGS+=(--log-history "1.7B GRPO (Run 4, beta=0.2)=outputs/run_artifacts/1.7B-KL/log_history.json")
elif [[ -f outputs/run4_artifacts/log_history_partial.json ]]; then
    LOG_FLAGS+=(--log-history "1.7B GRPO (Run 4, beta=0.2 in-progress)=outputs/run4_artifacts/log_history_partial.json")
fi

# Run 3 — partial log, only when the final one is missing.
if [[ ! -f outputs/run_artifacts/4B/log_history.json && -f outputs/run3_artifacts/log_history_partial.json ]]; then
    LOG_FLAGS+=(--log-history "4B GRPO (Run 3, beta=0 in-progress)=outputs/run3_artifacts/log_history_partial.json")
fi

echo
echo "Running make_plots.py:"
# Pass the collected --log-history / --eval pairs to make_plots.py.
# A plain "${arr[@]}" on an empty array aborts with "unbound variable" under
# `set -u` on bash older than 4.4. The ${arr[@]+"${arr[@]}"} form expands to
# nothing when the array is empty and to the individually quoted elements
# otherwise, so the script still runs when no log or eval file was found.
python scripts/make_plots.py \
    ${LOG_FLAGS[@]+"${LOG_FLAGS[@]}"} \
    ${EVAL_FLAGS[@]+"${EVAL_FLAGS[@]}"} \
    --out-dir plots

# Also build the hackathon-narrative plots: per-family same-base delta and
# the 4-run summary table. These need richer logic than make_plots.py supports,
# so they live in compare_runs.py.
echo
echo "Running compare_runs.py:"
python scripts/compare_runs.py --out-dir plots