Spaces:
Running
Running
# Fire ALL production runs in parallel across multiple HF accounts.
#
# Each account fires one job. The plan defaults to 3 simultaneous runs
# (matching the 3 Qwen3 sizes), with an optional 4th insurance run
# (a second Qwen3-1.7B job with a different seed) when INSURANCE=1.
#
# Required env vars (one HF_TOKEN per account):
#   HF_TOKEN_1    token for account 1 (drives Qwen3-0.6B)
#   HF_TOKEN_2    token for account 2 (drives Qwen3-1.7B)
#   HF_TOKEN_3    token for account 3 (drives Qwen3-4B)
#   HF_TOKEN_4    (optional) token for account 4 — drives the insurance run;
#                 required if INSURANCE=1
#
# Optional env:
#   ENV_BASE_URL  default: https://agarwalanu3103-clarify-rl.hf.space
#   INSURANCE     "1" → also launch a backup Qwen3-1.7B run (different seed)
#   DRY_RUN       "1" → print all commands but do not launch anything
#
# Usage:
#   HF_TOKEN_1=hf_a HF_TOKEN_2=hf_b HF_TOKEN_3=hf_c ./scripts/launch_all.sh
#
# Recommended budget for the default plan (without insurance): ~$70
# (estimated spend per the plan further down: ~$45)
# With INSURANCE=1: ~$95 (estimated spend: ~$57.50)
# Either way well within the $120 cap.
# Strict mode: abort on errors, on unset variables, and on failed pipeline stages.
set -euo pipefail

# Required tokens, one per account. `:?` aborts with the message when the
# variable is unset or empty.
: "${HF_TOKEN_1:?HF_TOKEN_1 required (account 1 — Qwen3-0.6B)}"
: "${HF_TOKEN_2:?HF_TOKEN_2 required (account 2 — Qwen3-1.7B)}"
: "${HF_TOKEN_3:?HF_TOKEN_3 required (account 3 — Qwen3-4B)}"

# Optional settings; `:=` assigns the default when unset or empty.
: "${ENV_BASE_URL:=https://agarwalanu3103-clarify-rl.hf.space}"
: "${INSURANCE:=0}"
: "${DRY_RUN:=0}"

# Validate the insurance token before anything is launched. Checking it only
# at the point of the insurance launch would abort the script while the three
# primary launches are already running in the background, un-waited.
if [[ "$INSURANCE" == "1" ]]; then
  : "${HF_TOKEN_4:?HF_TOKEN_4 required when INSURANCE=1}"
fi

# The per-job launcher is expected to sit next to this script.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
LAUNCHER="$SCRIPT_DIR/launch_hf_job.sh"

# Fail fast with one clear message instead of one failure per backgrounded launch.
if [[ ! -x "$LAUNCHER" ]]; then
  printf 'launcher not found or not executable: %s\n' "$LAUNCHER" >&2
  exit 1
fi
# Startup banner: show the effective configuration before anything is launched.
banner_rule="========================================================================="
printf '%s\n' \
  "$banner_rule" \
  "ClarifyRL multi-account parallel launch" \
  "$banner_rule" \
  "Env Space: $ENV_BASE_URL" \
  "Insurance: $INSURANCE" \
  "Dry run: $DRY_RUN" \
  "$banner_rule"
# ----------------------------------------------------------------------
# Plan (revised after `hf jobs hardware` 2026-04-25):
#   Account 1: Qwen3-0.6B / a10g-large / 500 steps / num_gen=4 / ~$7.50  (5h * $1.50)
#   Account 2: Qwen3-1.7B / a100-large / 400 steps / num_gen=8 / ~$12.50 (5h * $2.50)
#   Account 3: Qwen3-4B   / h200       / 250 steps / num_gen=8 / ~$25    (5h * $5.00)
#   Account 4: Qwen3-1.7B / a100-large / 400 steps / num_gen=8 / seed=84 / ~$12.50 (insurance)
#   Total without insurance: ~$45    With insurance: ~$57.50
#   Well within the $120 cap — leaves headroom for retries / longer runs / second pass.
# ----------------------------------------------------------------------
#######################################
# Print a banner for one launch, then execute the launch command.
# Globals:   DRY_RUN (read) - when "1", it is passed on to the command
#            through its environment.
# Arguments: $1   - human-readable label shown in the banner
#            $2.. - command to execute, followed by its arguments
# Outputs:   banner on stdout, then whatever the command itself prints
# Returns:   exit status of the command; 2 if no command was given
#######################################
run() {
  if (( $# < 2 )); then
    printf 'usage: run LABEL COMMAND [ARG...]\n' >&2
    return 2
  fi
  local label="$1"; shift

  # Build a 66-column horizontal rule: pad with spaces, then swap each space
  # for a box-drawing dash.
  local rule
  printf -v rule '%*s' 66 ''
  rule="${rule// /─}"

  # Emit the whole banner with a single printf call. Several launches run in
  # the background at once, and separate echo calls let their banners
  # interleave line by line.
  printf '\n%s\n Launching: %s\n%s\n' "$rule" "$label" "$rule"

  # ${DRY_RUN:-0} keeps the function usable under `set -u` even when the
  # caller never set DRY_RUN.
  if [[ "${DRY_RUN:-0}" == "1" ]]; then
    DRY_RUN=1 "$@"
  else
    "$@"
  fi
}
# ----------------------------------------------------------------------
# Primary launches: one backgrounded launcher per account, all with SEED=42.
# PIDS and LABELS are parallel arrays (same index = same submission).
# ----------------------------------------------------------------------

# Start one launcher in the background via run(), handing it the account
# token, the env Space URL and the seed through its environment.
#   $1 token   $2 banner text   $3 model   $4 hardware   $5 steps
# The caller reads the background PID from $! right after the call.
launch_primary() {
  HF_TOKEN="$1" ENV_BASE_URL="$ENV_BASE_URL" SEED=42 \
    run "$2" "$LAUNCHER" "$3" "$4" "$5" &
}

PIDS=()
LABELS=()

# Account 1 — 0.6B
launch_primary "$HF_TOKEN_1" \
  "Account 1: Qwen3-0.6B / a10g-large / 500 steps" \
  Qwen/Qwen3-0.6B a10g-large 500
PID1=$!
PIDS+=("$PID1")
LABELS+=("0.6B")

# Account 2 — 1.7B
launch_primary "$HF_TOKEN_2" \
  "Account 2: Qwen3-1.7B / a100-large / 400 steps" \
  Qwen/Qwen3-1.7B a100-large 400
PID2=$!
PIDS+=("$PID2")
LABELS+=("1.7B")

# Account 3 — 4B
launch_primary "$HF_TOKEN_3" \
  "Account 3: Qwen3-4B / h200 / 250 steps" \
  Qwen/Qwen3-4B h200 250
PID3=$!
PIDS+=("$PID3")
LABELS+=("4B")
# Optional insurance run: a second Qwen3-1.7B launch on account 4 with a
# different seed (84) and its own OUTPUT_DIR. Only taken when INSURANCE is
# exactly "1"; the launch is appended to PIDS / LABELS like the primary ones.
case "$INSURANCE" in
  1)
    : "${HF_TOKEN_4:?HF_TOKEN_4 required when INSURANCE=1}"
    insurance_banner="Account 4: Qwen3-1.7B / a100-large / 400 steps / seed=84 (insurance)"
    HF_TOKEN="$HF_TOKEN_4" ENV_BASE_URL="$ENV_BASE_URL" SEED=84 \
      OUTPUT_DIR="clarify-rl-grpo-qwen3-1-7b-seed84" \
      run "$insurance_banner" "$LAUNCHER" Qwen/Qwen3-1.7B a100-large 400 &
    insurance_pid=$!
    PIDS+=("$insurance_pid")
    LABELS+=("1.7B-seed84")
    ;;
esac
#######################################
# Wait for every backgrounded launcher and report its outcome.
# Globals:   PIDS (read), LABELS (read) - parallel arrays, same index = same
#            submission; FAILED (written) - number of non-zero exits
# Outputs:   one "[OK] ..." or "[FAIL] ..." line per submission on stdout
#######################################
await_submissions() {
  local idx
  FAILED=0
  for idx in "${!PIDS[@]}"; do
    # `wait` inside `if` keeps `set -e` from aborting on a failed launcher.
    if wait "${PIDS[$idx]}"; then
      echo "[OK] ${LABELS[$idx]} submitted"
    else
      echo "[FAIL] ${LABELS[$idx]} submission exited non-zero"
      FAILED=$((FAILED + 1))
    fi
  done
}

#######################################
# Print the closing summary box.
# Globals:   FAILED (read), PIDS (read)
# Outputs:   summary on stdout, framed by a rule line above and below on
#            both the success and the failure path
# Returns:   0 if no submission failed, 1 otherwise
#######################################
print_summary() {
  local rule="====================================================================="
  echo
  echo "$rule"
  if [[ "$FAILED" -eq 0 ]]; then
    echo "All ${#PIDS[@]} jobs submitted. Track them at:"
    echo " https://huggingface.co/jobs (per account)"
    echo " https://huggingface.co/spaces (trackio dashboards)"
  else
    echo "$FAILED of ${#PIDS[@]} submissions failed — check output above."
  fi
  echo "$rule"
  [[ "$FAILED" -eq 0 ]]
}

# Wait for all launchers to exit. Each launcher submits the job and returns
# fairly fast — the actual training happens server-side on HF.
echo
echo "Waiting for all launches to complete (this only waits for *submission*,"
echo "not for the training itself — that runs server-side on HF Jobs)..."
echo

declare -i FAILED=0
await_submissions
# Exit non-zero only after the summary box has been fully printed.
print_summary || exit 1