#!/usr/bin/env bash
# Fire ALL production runs in parallel across multiple HF accounts.
#
# Each account fires one job. The plan defaults to 3 simultaneous runs
# (matching the 3 Qwen3 sizes), with an optional 4th insurance run (INSURANCE=1).
#
# Required env vars (one HF_TOKEN per account):
# HF_TOKEN_1 token for account 1 (drives Qwen3-0.6B)
# HF_TOKEN_2 token for account 2 (drives Qwen3-1.7B)
# HF_TOKEN_3 token for account 3 (drives Qwen3-4B)
# HF_TOKEN_4 (optional) token for account 4 — drives insurance run if INSURANCE=1
#
# Optional env:
# ENV_BASE_URL default: https://agarwalanu3103-clarify-rl.hf.space
# INSURANCE "1" → also launch a backup Qwen3-1.7B run (different seed)
# DRY_RUN "1" → print all commands but do not launch anything
#
# Usage:
# HF_TOKEN_1=hf_a HF_TOKEN_2=hf_b HF_TOKEN_3=hf_c ./scripts/launch_all.sh
#
# Estimated cost for the default plan (without insurance): ~$45
# With INSURANCE=1: ~$57.50
# (per the revised plan table below). Either way well within the $120 cap.
# Strict mode: abort on unhandled errors, unset variables and failed pipelines.
set -euo pipefail

# Required credentials: one token per account. `:?` aborts with the given
# message when the variable is unset or empty.
: "${HF_TOKEN_1:?HF_TOKEN_1 required (account 1 → Qwen3-0.6B)}"
: "${HF_TOKEN_2:?HF_TOKEN_2 required (account 2 → Qwen3-1.7B)}"
: "${HF_TOKEN_3:?HF_TOKEN_3 required (account 3 → Qwen3-4B)}"

# Optional settings: `:=` assigns the default when unset or empty.
: "${ENV_BASE_URL:=https://agarwalanu3103-clarify-rl.hf.space}"
: "${INSURANCE:=0}"
: "${DRY_RUN:=0}"

# The per-job launcher is expected to sit next to this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LAUNCHER="$SCRIPT_DIR/launch_hf_job.sh"

# Fail fast with one clear message instead of several interleaved
# "command not found" errors from the parallel background launches.
if [[ ! -x "$LAUNCHER" ]]; then
  printf '%s: launcher not found or not executable: %s\n' \
    "${BASH_SOURCE[0]##*/}" "$LAUNCHER" >&2
  exit 1
fi
# Startup banner: show the effective configuration before anything is launched.
printf '%s\n' \
  "=========================================================================" \
  "ClarifyRL multi-account parallel launch" \
  "=========================================================================" \
  "Env Space: $ENV_BASE_URL" \
  "Insurance: $INSURANCE" \
  "Dry run: $DRY_RUN" \
  "========================================================================="
# ----------------------------------------------------------------------
# Plan (revised after `hf jobs hardware` 2026-04-25):
# Account 1: Qwen3-0.6B / a10g-large / 500 steps / num_gen=4 / ~$7.50 (5h * $1.50)
# Account 2: Qwen3-1.7B / a100-large / 400 steps / num_gen=8 / ~$12.50 (5h * $2.50)
# Account 3: Qwen3-4B / h200 / 250 steps / num_gen=8 / ~$25 (5h * $5.00)
# Account 4: Qwen3-1.7B / a100-large / 400 steps / num_gen=8 / seed=84 / ~$12.50 (insurance)
# Total without insurance: ~$45 With insurance: ~$57.50
# Well within the $120 cap — leaves headroom for retries / longer runs / second pass.
# ----------------------------------------------------------------------
#######################################
# Print a banner for one launch, then execute the launch command.
# Globals:
#   DRY_RUN (read) - when "1", DRY_RUN=1 is placed in the command's
#                    environment. The command is still executed; it is
#                    presumably the launcher that honors DRY_RUN (confirm
#                    in launch_hf_job.sh).
# Arguments:
#   $1   - human-readable label shown in the banner
#   $2.. - command and arguments to execute
# Outputs:
#   Banner on stdout, followed by whatever the command prints.
# Returns:
#   The command's exit status; 2 if no command was supplied.
#######################################
run() {
  # Without a command, "$@" would expand to nothing and report success even
  # though nothing was launched.
  if (( $# < 2 )); then
    printf 'usage: run LABEL COMMAND [ARG...]\n' >&2
    return 2
  fi

  local label="$1"
  shift

  # Build the 66-column separator programmatically so the script does not
  # depend on a long literal run of box-drawing characters surviving
  # re-encoding.
  local rule
  printf -v rule '%66s' ''
  rule=${rule// /─}

  printf '\n%s\n Launching: %s\n%s\n' "$rule" "$label" "$rule"

  if [[ "${DRY_RUN:-0}" == "1" ]]; then
    # Explicit prefix assignment guarantees the child sees DRY_RUN=1 even if
    # the variable was never exported by the caller.
    DRY_RUN=1 "$@"
  else
    "$@"
  fi
}
# Validate the insurance prerequisites BEFORE anything is submitted. Checking
# HF_TOKEN_4 only after the first three launches were already running in the
# background would abort the script mid-flight, leaving those submissions
# unwaited and unreported.
if [[ "$INSURANCE" == "1" ]]; then
  : "${HF_TOKEN_4:?HF_TOKEN_4 required when INSURANCE=1}"
fi

# PIDS[i] is the background submission process reported under LABELS[i].
# Each PID is recorded right after its launch so the two arrays stay aligned.
PIDS=()
LABELS=()

# Every launch below runs in the background (`&`). The HF_TOKEN /
# ENV_BASE_URL / SEED prefix assignments apply only to that one launch.

# Account 1 — 0.6B
HF_TOKEN="$HF_TOKEN_1" \
ENV_BASE_URL="$ENV_BASE_URL" \
SEED=42 \
  run "Account 1: Qwen3-0.6B / a10g-large / 500 steps" \
  "$LAUNCHER" Qwen/Qwen3-0.6B a10g-large 500 &
PIDS+=("$!")
LABELS+=("0.6B")

# Account 2 — 1.7B
HF_TOKEN="$HF_TOKEN_2" \
ENV_BASE_URL="$ENV_BASE_URL" \
SEED=42 \
  run "Account 2: Qwen3-1.7B / a100-large / 400 steps" \
  "$LAUNCHER" Qwen/Qwen3-1.7B a100-large 400 &
PIDS+=("$!")
LABELS+=("1.7B")

# Account 3 — 4B
HF_TOKEN="$HF_TOKEN_3" \
ENV_BASE_URL="$ENV_BASE_URL" \
SEED=42 \
  run "Account 3: Qwen3-4B / h200 / 250 steps" \
  "$LAUNCHER" Qwen/Qwen3-4B h200 250 &
PIDS+=("$!")
LABELS+=("4B")

# Optional 4th insurance run: same model/hardware/steps as account 2, but with
# a different seed and its own output directory.
if [[ "$INSURANCE" == "1" ]]; then
  HF_TOKEN="$HF_TOKEN_4" \
  ENV_BASE_URL="$ENV_BASE_URL" \
  SEED=84 \
  OUTPUT_DIR="clarify-rl-grpo-qwen3-1-7b-seed84" \
    run "Account 4: Qwen3-1.7B / a100-large / 400 steps / seed=84 (insurance)" \
    "$LAUNCHER" Qwen/Qwen3-1.7B a100-large 400 &
  PIDS+=("$!")
  LABELS+=("1.7B-seed84")
fi
# Wait for all launchers to exit. Each launcher submits the job and returns
# fairly fast — the actual training happens server-side on HF.
echo
echo "Waiting for all launches to complete (this only waits for *submission*,"
echo "not for the training itself — that runs server-side on HF Jobs)..."
echo

# Reap every background submission and count the ones that exited non-zero.
# `wait` sits inside `if`, so a failed submission does not trip `set -e`.
declare -i FAILED=0
for i in "${!PIDS[@]}"; do
  if wait "${PIDS[$i]}"; then
    echo "[OK] ${LABELS[$i]} submitted"
  else
    echo "[FAIL] ${LABELS[$i]} submission exited non-zero"
    FAILED=$((FAILED + 1))
  fi
done

# Final summary, framed by a rule on both sides on success AND on failure.
echo
echo "====================================================================="
if (( FAILED == 0 )); then
  echo "All ${#PIDS[@]} jobs submitted. Track them at:"
  echo " https://huggingface.co/jobs (per account)"
  echo " https://huggingface.co/spaces (trackio dashboards)"
else
  echo "$FAILED of ${#PIDS[@]} submissions failed — check output above."
fi
echo "====================================================================="

# Signal failure to the caller only after the summary has been printed in full.
if (( FAILED > 0 )); then
  exit 1
fi