File size: 4,302 Bytes
099bec8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
#!/usr/bin/env bash
# Run all evals after the production training jobs finish.
#
# This script orchestrates the post-training eval sweep:
#   1. Policy baseline (deterministic, no LLM)
#   2. Base model eval per size (Qwen3-0.6B / 1.7B / 4B, untrained)
#   3. Trained model eval per size (3 trained checkpoints)
#
# Outputs go to outputs/eval_policy.json (policy baseline) and
# outputs/eval_<model>_<base|trained>.json (model evals) — exactly the
# layout consumed by scripts/make_plots.py.
#
# Required env (one HF write-token with read access to the trained model repos):
#   HF_TOKEN          token for downloading the trained models from HF Hub
#
# Optional env:
#   ENV_BASE_URL      default https://agarwalanu3103-clarify-rl.hf.space
#   API_BASE_URL      default https://router.huggingface.co/v1 (HF Inference Router)
#   LIMIT             max scenarios to evaluate (default 100, set to 300 for full)
#   TIMEOUT_S         per-scenario timeout (default 60)
#   SKIP_POLICY       "1" to skip the policy baseline (already have it)
#   SKIP_BASE         "1" to skip base-model evals
#   SKIP_TRAINED      "1" to skip trained-model evals
#
# Usage:
#   HF_TOKEN=hf_xxx ./scripts/run_post_train_eval.sh
#
# Trained model repo names (these are the OUTPUT_DIRs from launch_all.sh):
#   <username>/clarify-rl-grpo-qwen3-0-6b
#   <username>/clarify-rl-grpo-qwen3-1-7b
#   <username>/clarify-rl-grpo-qwen3-4b
#
# Set MODEL_0_6B / MODEL_1_7B / MODEL_4B env vars if your usernames differ.

set -euo pipefail

: "${HF_TOKEN:?HF_TOKEN required (read access to trained model repos)}"
: "${ENV_BASE_URL:=https://agarwalanu3103-clarify-rl.hf.space}"
: "${API_BASE_URL:=https://router.huggingface.co/v1}"
: "${LIMIT:=100}"
: "${TIMEOUT_S:=60}"
: "${SKIP_POLICY:=0}"
: "${SKIP_BASE:=0}"
: "${SKIP_TRAINED:=0}"

# Defaults assume agarwalanu3103 owns the 0.6B run.
: "${MODEL_0_6B:=agarwalanu3103/clarify-rl-grpo-qwen3-0-6b}"
: "${MODEL_1_7B:=agarwalanu3103/clarify-rl-grpo-qwen3-1-7b}"
: "${MODEL_4B:=agarwalanu3103/clarify-rl-grpo-qwen3-4b}"

OUT_DIR="outputs"
mkdir -p "$OUT_DIR"

cat <<EOF
=========================================================================
ClarifyRL post-training eval sweep
=========================================================================
  Env Space:     $ENV_BASE_URL
  API Base URL:  $API_BASE_URL
  Limit:         $LIMIT scenarios
  Timeout:       ${TIMEOUT_S}s per scenario
  Trained 0.6B:  $MODEL_0_6B
  Trained 1.7B:  $MODEL_1_7B
  Trained 4B:    $MODEL_4B
  Output dir:    $OUT_DIR
=========================================================================
EOF

#######################################
# Run a single eval through scripts/run_eval.py, unless its output file
# already exists (in which case the eval is skipped).
# Globals (read):
#   ENV_BASE_URL, LIMIT, TIMEOUT_S   - used in both modes
#   API_BASE_URL, HF_TOKEN           - used in "api" mode only
# Arguments:
#   $1 - mode: "policy" or "api"
#   $2 - path of the JSON file the eval writes
#   $3 - model id, required when mode is "api" (ignored for "policy")
# Outputs:
#   Progress lines on stdout; usage errors on stderr.
# Returns:
#   0 when the eval is skipped or succeeds, 2 on invalid arguments,
#   otherwise the exit status of scripts/run_eval.py.
#######################################
run_eval() {
    local mode="${1:-}"
    local out_path="${2:-}"
    local model="${3:-}"

    # Validate up front: previously any mode other than "policy" silently
    # fell through to the API branch, and an empty model was passed along as
    # MODEL_NAME="".
    case "$mode" in
        policy)
            ;;
        api)
            if [ -z "$model" ]; then
                echo "run_eval: mode=api requires a model name (out=$out_path)" >&2
                return 2
            fi
            ;;
        *)
            echo "run_eval: unknown mode '$mode' (expected 'policy' or 'api')" >&2
            return 2
            ;;
    esac
    if [ -z "$out_path" ]; then
        echo "run_eval: missing output path (mode=$mode)" >&2
        return 2
    fi

    # An existing output file counts as "already done"; delete it to re-run.
    if [ -f "$out_path" ]; then
        echo "[SKIP] $out_path already exists (delete to re-run)"
        return 0
    fi

    echo
    echo "▶ Eval: mode=$mode out=$out_path model=${model:-N/A}"
    if [ "$mode" = "policy" ]; then
        # Policy baseline: only the environment URL is passed explicitly.
        ENV_BASE_URL="$ENV_BASE_URL" \
        python3 scripts/run_eval.py \
            --mode policy \
            --out "$out_path" \
            --limit "$LIMIT" \
            --timeout "$TIMEOUT_S"
    else
        # API mode: model, endpoint and token are handed over via the
        # environment rather than the command line.
        MODEL_NAME="$model" \
        API_BASE_URL="$API_BASE_URL" \
        HF_TOKEN="$HF_TOKEN" \
        ENV_BASE_URL="$ENV_BASE_URL" \
        python3 scripts/run_eval.py \
            --mode api \
            --out "$out_path" \
            --limit "$LIMIT" \
            --timeout "$TIMEOUT_S"
    fi
}

# Stage 1: policy baseline (run_eval is called without a model argument).
if [[ "$SKIP_POLICY" != "1" ]]; then
    run_eval policy "$OUT_DIR/eval_policy.json"
fi

# Stage 2: untrained Qwen3 checkpoints.
# Each entry is "<file tag>:<model id>"; the tag is everything before the
# first colon and the model id is everything after it.
if [[ "$SKIP_BASE" != "1" ]]; then
    for entry in \
        "0.6b:Qwen/Qwen3-0.6B" \
        "1.7b:Qwen/Qwen3-1.7B" \
        "4b:Qwen/Qwen3-4B"
    do
        run_eval api "$OUT_DIR/eval_qwen3-${entry%%:*}_base.json" "${entry#*:}"
    done
fi

# Stage 3: trained checkpoints, using the same "<file tag>:<model id>" layout
# with the model ids taken from MODEL_0_6B / MODEL_1_7B / MODEL_4B.
if [[ "$SKIP_TRAINED" != "1" ]]; then
    for entry in \
        "0.6b:$MODEL_0_6B" \
        "1.7b:$MODEL_1_7B" \
        "4b:$MODEL_4B"
    do
        run_eval api "$OUT_DIR/eval_qwen3-${entry%%:*}_trained.json" "${entry#*:}"
    done
fi

# Closing summary followed by a listing of everything in the output directory.
printf '\n'
printf '%s\n' \
    "=====================================================================" \
    "All evals done. Now run scripts/make_plots.py to generate PNGs." \
    "====================================================================="
ls -la "$OUT_DIR"