#!/usr/bin/env bash
# Launch a vLLM-powered eval as a HF Job for a trained ClarifyRL checkpoint.
#
# Usage:
#   HF_TOKEN=hf_xxx ./scripts/launch_eval_job.sh \
#       --model agarwalanu3103/clarify-rl-grpo-qwen3-0-6b \
#       --flavor a10g-small \
#       --limit 50
#
# Or as positional shortcuts:
#   HF_TOKEN=hf_xxx ./scripts/launch_eval_job.sh agarwalanu3103/clarify-rl-grpo-qwen3-0-6b a10g-small 50
#
# This works around the fact that HF Inference Router does not auto-warm
# fine-tuned community uploads — vllm must be hosted ourselves. We use the
# cheapest GPU that fits the model: a10g-small (24 GB) for ≤4B, a10g-large
# for 7-8B.
#
# Environment:
#   HF_TOKEN       (required) write token of the account hosting the eval.
#   ENV_BASE_URL   env Space URL (default: agarwalanu3103-clarify-rl).
#   PUSH_TO_REPO   override push target (default = MODEL).
#   EVAL_LABEL     suffix for output filename (default n${LIMIT}).
#   GPU_MEM_UTIL   vLLM GPU mem util (default 0.85).
#   TIMEOUT        HF Jobs timeout (default 1h).
#   IMAGE          docker image override.
#   DETACH         pass -d to `hf jobs` when set to 1 (default 1).
#   DRY_RUN        when set to 1, print the command (token redacted) and exit.
#
# Example multi-checkpoint sweep:
#   for m in clarify-rl-grpo-qwen3-0-6b clarify-rl-grpo-qwen3-1-7b; do
#     HF_TOKEN=$HF_TOKEN ./scripts/launch_eval_job.sh agarwalanu3103/$m a10g-small 50
#   done

set -euo pipefail

MODEL=""
FLAVOR="a10g-small"
LIMIT="50"

# Abort with a readable message when a flag is given without its value.
# Without this, `set -u` would report a bare "unbound variable" on "$2".
#   $1 - the flag being parsed
#   $2 - number of arguments remaining (including the flag itself)
require_value() {
  if [ "$2" -lt 2 ]; then
    echo "ERROR: $1 requires a value" >&2
    exit 1
  fi
}

# Positional form: MODEL [FLAVOR [LIMIT]]. Anything starting with '-' is an
# option, so a leading `-h` reaches the help branch instead of being taken
# as the model name.
if [ "$#" -ge 1 ] && [[ "$1" != -* ]]; then
  MODEL="$1"
  if [ "$#" -ge 2 ]; then FLAVOR="$2"; fi
  if [ "$#" -ge 3 ]; then LIMIT="$3"; fi
else
  while [ "$#" -gt 0 ]; do
    case "$1" in
      --model)   require_value "$1" "$#"; MODEL="$2";   shift 2 ;;
      --flavor)  require_value "$1" "$#"; FLAVOR="$2";  shift 2 ;;
      --limit)   require_value "$1" "$#"; LIMIT="$2";   shift 2 ;;
      --image)   require_value "$1" "$#"; IMAGE="$2";   shift 2 ;;
      --timeout) require_value "$1" "$#"; TIMEOUT="$2"; shift 2 ;;
      -h|--help)
        # Help text is the script's own comment lines with the leading "# " removed.
        grep '^#' "$0" | sed 's/^# \{0,1\}//'
        exit 0
        ;;
      *)
        echo "Unknown arg: $1" >&2
        exit 1
        ;;
    esac
  done
fi

# Required inputs, then defaults for everything that can come from the environment.
: "${MODEL:?MODEL is required (e.g. agarwalanu3103/clarify-rl-grpo-qwen3-0-6b)}"
: "${HF_TOKEN:?HF_TOKEN is required}"
: "${ENV_BASE_URL:=https://agarwalanu3103-clarify-rl.hf.space}"
: "${PUSH_TO_REPO:=$MODEL}"
: "${EVAL_LABEL:=n${LIMIT}}"
: "${GPU_MEM_UTIL:=0.85}"
: "${TIMEOUT:=1h}"
: "${IMAGE:=}"

# SCRIPT_DIR is the parent of the directory holding this script (the repo root).
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
EVAL_SCRIPT="$SCRIPT_DIR/scripts/eval_with_vllm.py"
RUN_EVAL="$SCRIPT_DIR/scripts/run_eval.py"
INFERENCE_PY="$SCRIPT_DIR/inference.py"
SCENARIOS="$SCRIPT_DIR/scenarios/eval_held_out.json"

# Fail early if any file the eval depends on is missing from the checkout.
for f in "$EVAL_SCRIPT" "$RUN_EVAL" "$INFERENCE_PY" "$SCENARIOS"; do
  [ -f "$f" ] || { echo "ERROR: missing $f" >&2; exit 1; }
done

# NOTE(review): the original banner heredoc was corrupted in this copy of the
# file; the field list below is a reconstruction from the variables in scope —
# confirm against the upstream version.
cat <<EOF
=========================================================================
  model        : $MODEL
  flavor       : $FLAVOR
  limit        : $LIMIT
  eval label   : $EVAL_LABEL
  env base url : $ENV_BASE_URL
  push to repo : $PUSH_TO_REPO
  gpu mem util : $GPU_MEM_UTIL
  timeout      : $TIMEOUT
  image        : ${IMAGE:-<default>}
=========================================================================
EOF

# Build the command as an array so values containing spaces survive intact.
# CMD[0] is a placeholder that is replaced by the resolved hf binary below.
CMD=(
  hf jobs uv run
  --flavor "$FLAVOR"
  --timeout "$TIMEOUT"
  --secrets "HF_TOKEN=$HF_TOKEN"
  --token "$HF_TOKEN"
  -e "MODEL_NAME=$MODEL"
  -e "ENV_BASE_URL=$ENV_BASE_URL"
  -e "PUSH_TO_REPO=$PUSH_TO_REPO"
  -e "LIMIT=$LIMIT"
  -e "EVAL_LABEL=$EVAL_LABEL"
  -e "GPU_MEM_UTIL=$GPU_MEM_UTIL"
  -e "PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True"
  -e "VLLM_USE_V1=1"
)

if [ -n "$IMAGE" ]; then
  CMD+=(--image "$IMAGE")
fi

: "${DETACH:=1}"
if [ "$DETACH" = "1" ]; then
  CMD+=(-d)
fi

# vLLM + openai (HTTP client used by run_eval.py via inference.py) +
# websockets (env Space connection) + huggingface_hub (Hub upload).
# We DO NOT pull `trl` here — eval is purely inference + HTTP.
CMD+=(
  --with "vllm"
  --with "openai>=1.40.0"
  --with "websockets>=12.0"
  --with "jmespath"
  --with "huggingface_hub"
  --with "truststore"
  "$EVAL_SCRIPT"
)

# Prefer the venv hf binary so SSL truststore patch applies.
VENV_HF="$SCRIPT_DIR/.venv/bin/hf"
if [ -x "$VENV_HF" ]; then
  HF_BIN="$VENV_HF"
elif command -v hf >/dev/null 2>&1; then
  HF_BIN="$(command -v hf)"
else
  echo "ERROR: 'hf' CLI not found." >&2
  exit 1
fi
CMD[0]="$HF_BIN"

if [ "${DRY_RUN:-0}" = "1" ]; then
  echo "DRY_RUN=1 — would run:"
  for arg in "${CMD[@]}"; do
    # Keep the secret out of terminals and CI logs.
    shown=${arg//"$HF_TOKEN"/REDACTED}
    printf '  %q\n' "$shown"
  done
  exit 0
fi

echo "Launching with: $HF_BIN"
# Actually run the job (the previous version only echoed the command, which
# never launched anything and printed HF_TOKEN to stdout).
exec "${CMD[@]}"