File size: 4,647 Bytes
099bec8
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
#!/usr/bin/env bash
# Launch a vLLM-powered eval as a HF Job for a trained ClarifyRL checkpoint.
#
# Usage:
#   HF_TOKEN=hf_xxx ./scripts/launch_eval_job.sh \
#       --model agarwalanu3103/clarify-rl-grpo-qwen3-0-6b \
#       --flavor a10g-small \
#       --limit 50
#
# Or as positional shortcuts:
#   HF_TOKEN=hf_xxx ./scripts/launch_eval_job.sh agarwalanu3103/clarify-rl-grpo-qwen3-0-6b a10g-small 50
#
# This works around the fact that the HF Inference Router does not auto-warm
# fine-tuned community uploads, so we have to host vLLM ourselves. We use the
# cheapest GPU that fits the model: a10g-small (24 GB) for ≤4B, a10g-large
# for 7-8B.
#
# Environment:
#   HF_TOKEN          (required) write token of the account hosting the eval.
#   ENV_BASE_URL      env Space URL (default: agarwalanu3103-clarify-rl).
#   PUSH_TO_REPO      override push target (default = MODEL).
#   EVAL_LABEL        suffix for output filename (default n${LIMIT}).
#   GPU_MEM_UTIL      vLLM GPU mem util (default 0.85).
#   TIMEOUT           HF Jobs timeout (default 1h).
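#   IMAGE             docker image override.
#   DETACH            1 (default) passes -d to `hf jobs uv run`; any other value omits it.
#   DRY_RUN           set to 1 to print the assembled command and exit without launching.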
#
# Example multi-checkpoint sweep:
#   for m in clarify-rl-grpo-qwen3-0-6b clarify-rl-grpo-qwen3-1-7b; do
#     HF_TOKEN=$HF_TOKEN ./scripts/launch_eval_job.sh agarwalanu3103/$m a10g-small 50
#   done

set -euo pipefail

# Defaults; overridden by the positional shortcut or by the --flags below.
MODEL=""
FLAVOR="a10g-small"
LIMIT="50"

# Abort with a readable message when a flag is given without its value.
# Without this check `set -u` kills the script with a bare
# "$2: unbound variable" error that does not name the offending flag.
#   $1 - the flag currently being parsed
#   $2 - number of arguments still left (the flag itself included)
require_value() {
    if [ "$2" -lt 2 ]; then
        echo "ERROR: $1 requires a value" >&2
        exit 1
    fi
}

# Parse the command line into the globals MODEL / FLAVOR / LIMIT / IMAGE /
# TIMEOUT.
#
# Two mutually exclusive forms are accepted:
#   positional:  MODEL [FLAVOR [LIMIT]]   (further words are ignored)
#   flags:       --model/--flavor/--limit/--image/--timeout VALUE, -h|--help
#
# Flag mode is selected whenever the first word starts with "-". The check
# is on a single dash (not "--") so that `-h` reaches the help arm instead
# of being swallowed as a model name.
#
# Exits 0 after printing help, 1 on an unknown flag or a missing value.
parse_args() {
    if [ "$#" -ge 1 ] && [ "${1:0:1}" != "-" ]; then
        MODEL="$1"
        # `if` rather than `[ ... ] && ...` so a short argument list cannot
        # make this function return non-zero under `set -e`.
        if [ "$#" -ge 2 ]; then FLAVOR="$2"; fi
        if [ "$#" -ge 3 ]; then LIMIT="$3"; fi
        return 0
    fi

    while [ "$#" -gt 0 ]; do
        case "$1" in
            --model)   require_value "$1" "$#"; MODEL="$2"; shift 2;;
            --flavor)  require_value "$1" "$#"; FLAVOR="$2"; shift 2;;
            --limit)   require_value "$1" "$#"; LIMIT="$2"; shift 2;;
            --image)   require_value "$1" "$#"; IMAGE="$2"; shift 2;;
            --timeout) require_value "$1" "$#"; TIMEOUT="$2"; shift 2;;
            -h|--help)
                # Help text is every comment line of this script with the
                # leading "# " stripped.
                grep '^#' "$0" | sed 's/^# \{0,1\}//'
                exit 0;;
            *)
                echo "Unknown arg: $1" >&2
                exit 1;;
        esac
    done
}

parse_args "$@"

# Hard requirements: the shell aborts with the given message when either
# variable is unset or empty.
: "${MODEL:?MODEL is required (e.g. agarwalanu3103/clarify-rl-grpo-qwen3-0-6b)}"
: "${HF_TOKEN:?HF_TOKEN is required}"

# Optional knobs: keep a non-empty value supplied by the caller (environment
# or command-line flag), otherwise fall back to the default on the right.
ENV_BASE_URL="${ENV_BASE_URL:-https://agarwalanu3103-clarify-rl.hf.space}"
PUSH_TO_REPO="${PUSH_TO_REPO:-$MODEL}"
EVAL_LABEL="${EVAL_LABEL:-n${LIMIT}}"
GPU_MEM_UTIL="${GPU_MEM_UTIL:-0.85}"
TIMEOUT="${TIMEOUT:-1h}"
# Empty IMAGE means "do not pass --image".
IMAGE="${IMAGE:-}"

# SCRIPT_DIR is the parent of the directory that holds this script, resolved
# to an absolute path. All other paths below are built from it.
script_parent="$(dirname "${BASH_SOURCE[0]}")"
SCRIPT_DIR="$(cd "$script_parent/.." && pwd)"

EVAL_SCRIPT="$SCRIPT_DIR/scripts/eval_with_vllm.py"
RUN_EVAL="$SCRIPT_DIR/scripts/run_eval.py"
INFERENCE_PY="$SCRIPT_DIR/inference.py"
SCENARIOS="$SCRIPT_DIR/scenarios/eval_held_out.json"

# Fail before launching anything if one of the expected files is absent.
for required in "$EVAL_SCRIPT" "$RUN_EVAL" "$INFERENCE_PY" "$SCENARIOS"; do
    if [ ! -f "$required" ]; then
        echo "ERROR: missing $required" >&2
        exit 1
    fi
done

# Echo the resolved configuration to stdout so the operator can verify it
# before the job is submitted. An empty IMAGE is shown as a placeholder.
banner_rule="========================================================================="
printf '%s\n' \
    "$banner_rule" \
    "ClarifyRL vLLM eval HF Jobs launcher" \
    "$banner_rule" \
    "  Model:              $MODEL" \
    "  Flavor:             $FLAVOR" \
    "  Limit:              $LIMIT" \
    "  Push target:        $PUSH_TO_REPO" \
    "  Eval label:         $EVAL_LABEL" \
    "  Env base URL:       $ENV_BASE_URL" \
    "  GPU mem util:       $GPU_MEM_UTIL" \
    "  Timeout:            $TIMEOUT" \
    "  Image:              ${IMAGE:-<HF Jobs default uv-python>}" \
    "$banner_rule"

# Assemble the `hf jobs uv run` invocation as an array so every value stays a
# single word regardless of its content. Element 0 ("hf") is a placeholder
# that is replaced with the resolved binary path further down.
CMD=(hf jobs uv run --flavor "$FLAVOR" --timeout "$TIMEOUT")

# The token is passed twice: as a job secret and via --token.
# NOTE(review): both forms put the token on the command line — confirm this
# is acceptable for the machines this runs on.
CMD+=(--secrets "HF_TOKEN=$HF_TOKEN" --token "$HF_TOKEN")

# Plain (non-secret) environment variables handed to the job, one -e each.
for job_env in \
    "MODEL_NAME=$MODEL" \
    "ENV_BASE_URL=$ENV_BASE_URL" \
    "PUSH_TO_REPO=$PUSH_TO_REPO" \
    "LIMIT=$LIMIT" \
    "EVAL_LABEL=$EVAL_LABEL" \
    "GPU_MEM_UTIL=$GPU_MEM_UTIL" \
    "PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True" \
    "VLLM_USE_V1=1"
do
    CMD+=(-e "$job_env")
done

# Only pass --image when an override was requested.
[ -z "$IMAGE" ] || CMD+=(--image "$IMAGE")

# DETACH defaults to 1; exactly "1" adds -d, any other value leaves it out.
DETACH="${DETACH:-1}"
case "$DETACH" in
    1) CMD+=(-d);;
esac

# Packages requested for the job via --with. Per the original note: vLLM,
# openai (HTTP client used by run_eval.py via inference.py), websockets
# (env Space connection) and huggingface_hub (Hub upload). `trl` is
# deliberately not pulled — eval is purely inference + HTTP.
for job_dep in \
    "vllm" \
    "openai>=1.40.0" \
    "websockets>=12.0" \
    "jmespath" \
    "huggingface_hub" \
    "truststore"
do
    CMD+=(--with "$job_dep")
done

# The script to run goes last, after all options.
CMD+=("$EVAL_SCRIPT")

# Choose which `hf` executable to run. The project venv's copy wins when it
# exists and is executable (per the original note, so the SSL truststore
# patch applies); otherwise fall back to whatever `hf` is on PATH.
VENV_HF="$SCRIPT_DIR/.venv/bin/hf"
HF_BIN=""
if [ -x "$VENV_HF" ]; then
    HF_BIN="$VENV_HF"
else
    # Empty when no `hf` is found on PATH; handled just below.
    HF_BIN="$(command -v hf 2>/dev/null || true)"
fi

if [ -z "$HF_BIN" ]; then
    echo "ERROR: 'hf' CLI not found." >&2
    exit 1
fi

# Swap the placeholder first word of the command for the resolved binary.
CMD[0]="$HF_BIN"

if [ "${DRY_RUN:-0}" = "1" ]; then
    echo "DRY_RUN=1 — would run:"
    printf '  %q\n' "${CMD[@]}"
    exit 0
fi

echo "Launching with: $HF_BIN"
echo
"${CMD[@]}"