#!/usr/bin/env bash
#
# SFT orchestrator: waits for pretrain (train.py) to either complete or
# reach the 8h budget, then kicks off SFT.
#
# Behavior:
#   - Polls for `train.py` process every 60 s
#   - Exits the wait loop on either:
#       (a) no train.py process found (pretrain completed naturally), or
#       (b) 8h elapsed since this script started
#   - Sends SIGTERM first (graceful — triggers checkpoint-save patch if
#     applied), waits 30 s, then SIGKILL as fallback
#   - Invokes `scripts/download_sft_data.py` if shards don't exist
#   - Launches `scripts/sft.py` in the background with tuned env vars
#   - Redirects all output to `run_sft.log`
#
# Re-entrant: safe to invoke even if pretrain has already exited.
# Does NOT re-launch if SFT is already running.
#
# Exit codes:
#   0  SFT launched, or SFT was already running
#   1  setup error (repo root, python interpreter or pgrep not usable)
#   2  SFT data download failed
#   3  SFT process exited with an error right after launch
#
# Usage (typical):
#   cd /home/mikeb/work/feather
#   nohup bash scripts/sft_orchestrator.sh > orchestrator.log 2>&1 &
#   disown

set -u  # error on unset vars, but don't -e (we handle failures explicitly)

# Resolve the repo root as the parent of this script's directory. The
# assignment is checked on its own: if the inner `cd` fails REPO_ROOT would be
# empty, and `cd ""` is not guaranteed to fail.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" \
  || { echo "cannot resolve repo root" >&2; exit 1; }
cd "$REPO_ROOT" || { echo "cannot cd to $REPO_ROOT" >&2; exit 1; }

PY="$REPO_ROOT/.venv/bin/python"
if [[ ! -x "$PY" ]]; then
  echo "[orchestrator] ERROR: python not found at $PY" >&2
  exit 1
fi

# Every process check below relies on pgrep; without it an empty result would
# be indistinguishable from "nothing is running", so fail fast.
if ! command -v pgrep > /dev/null; then
  echo "[orchestrator] ERROR: pgrep not found in PATH" >&2
  exit 1
fi

readonly LOG_FILE="$REPO_ROOT/run_sft.log"
readonly DATA_LOG="$REPO_ROOT/run_sft_download.log"
readonly MAX_WAIT_SECONDS=28800  # 8 hours
readonly POLL_INTERVAL=60
readonly GRACEFUL_SHUTDOWN_WAIT=30
readonly STATUS_EVERY_POLLS=10   # one status line per 10 polls (~10 min)
readonly LAUNCH_CHECK_WAIT=2     # seconds to wait before checking SFT is alive

# Extended regexes matched by `pgrep -f` against full command lines.
readonly PRETRAIN_PATTERN='train\.py'
readonly SFT_PATTERN='scripts/sft\.py'

# Library search path used for both the download step and the SFT launch:
# the WSL/CUDA directories first, followed by any pre-existing value.
readonly RUN_LD_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

#######################################
# Print a message prefixed with a UTC timestamp.
# Arguments: message words
# Outputs:   one line on stdout
#######################################
log() {
  printf '[orchestrator %s] %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*"
}

#######################################
# List train.py processes owned by the effective user, excluding this script.
# Uses $EUID (always set by bash) rather than $USER, which may be unset in
# some environments and would abort the script under `set -u`.
# Outputs:   space-separated PIDs on stdout (nothing if none)
# Returns:   0 if at least one PID was found, 1 if none,
#            pgrep's own status (>1) if pgrep itself failed
#######################################
pretrain_pids() {
  local out rc=0 pid pids=""
  out="$(pgrep -u "$EUID" -f "$PRETRAIN_PATTERN")" || rc=$?
  if (( rc > 1 )); then
    return "$rc"
  fi
  # Word splitting on $out is intentional: pgrep prints one PID per line.
  for pid in $out; do
    if [[ "$pid" != "$$" ]]; then
      pids+="${pids:+ }$pid"
    fi
  done
  printf '%s' "$pids"
  [[ -n "$pids" ]]
}

#######################################
# Report whether an SFT process is already running.
# Returns: 0 if a command line matches SFT_PATTERN, non-zero otherwise
#######################################
sft_running() {
  pgrep -f "$SFT_PATTERN" > /dev/null
}

# ---------------------------------------------------------------------------
# Stage 1: wait for pretrain
# ---------------------------------------------------------------------------
log "starting; max wait = ${MAX_WAIT_SECONDS}s"

# Guard against double-launch
if sft_running; then
  log "SFT is already running — exiting orchestrator to avoid conflict"
  exit 0
fi

T_START=$(date +%s)
polls=0
while true; do
  NOW=$(date +%s)
  ELAPSED=$((NOW - T_START))
  if (( ELAPSED >= MAX_WAIT_SECONDS )); then
    log "reached 8h wait cap (${ELAPSED}s) — will kill pretrain"
    break
  fi

  rc=0
  PRETRAIN_PIDS="$(pretrain_pids)" || rc=$?
  if (( rc > 1 )); then
    # A pgrep failure says nothing about pretrain; treating it as "exited"
    # could start SFT while pretrain is still running, so keep waiting.
    log "WARNING: pgrep failed (rc=$rc) — pretrain state unknown, will retry"
  elif [[ -z "$PRETRAIN_PIDS" ]]; then
    log "no train.py process found — pretrain already exited"
    break
  elif (( polls % STATUS_EVERY_POLLS == 0 )); then
    log "waiting... elapsed=${ELAPSED}s pretrain PIDs: $PRETRAIN_PIDS"
  fi

  polls=$((polls + 1))
  sleep "$POLL_INTERVAL"
done

# ---------------------------------------------------------------------------
# Stage 2: kill any remaining pretrain processes
# ---------------------------------------------------------------------------
rc=0
PRETRAIN_PIDS="$(pretrain_pids)" || rc=$?
if (( rc > 1 )); then
  log "WARNING: pgrep failed (rc=$rc) — cannot check for leftover pretrain processes"
fi

if [[ -n "$PRETRAIN_PIDS" ]]; then
  log "sending SIGTERM to pretrain PIDs: $PRETRAIN_PIDS"
  for pid in $PRETRAIN_PIDS; do
    # Best effort: the process may have exited since it was listed.
    kill -TERM "$pid" 2>/dev/null || true
  done

  # Wait for graceful shutdown (gives the checkpoint-save patch time to run)
  for ((i = 0; i < GRACEFUL_SHUTDOWN_WAIT; i++)); do
    REMAINING="$(pretrain_pids)" || true
    if [[ -z "$REMAINING" ]]; then
      break
    fi
    sleep 1
  done

  # Force-kill any stragglers
  REMAINING="$(pretrain_pids)" || true
  if [[ -n "$REMAINING" ]]; then
    log "force-killing stragglers: $REMAINING"
    for pid in $REMAINING; do
      kill -KILL "$pid" 2>/dev/null || true
    done
    sleep 5
  fi
fi

# ---------------------------------------------------------------------------
# Stage 3: ensure SFT data exists
# ---------------------------------------------------------------------------
META_JSON="$REPO_ROOT/data/sft/meta.json"
if [[ ! -f "$META_JSON" ]]; then
  log "no SFT data found — running download_sft_data.py"
  LD_LIBRARY_PATH="$RUN_LD_PATH" \
    "$PY" -u "$REPO_ROOT/scripts/download_sft_data.py" \
    > "$DATA_LOG" 2>&1
  DL_RC=$?
  # A zero exit status is not enough: the metadata file must exist afterwards.
  if (( DL_RC != 0 )) || [[ ! -f "$META_JSON" ]]; then
    log "ERROR: SFT data download failed (rc=$DL_RC)"
    log " last 20 lines of $DATA_LOG:"
    tail -20 "$DATA_LOG" 2>/dev/null | sed 's/^/ /'
    exit 2
  fi
  log "SFT data ready"
else
  log "SFT data already present at $META_JSON"
fi

# ---------------------------------------------------------------------------
# Stage 4: launch SFT
# ---------------------------------------------------------------------------
# Guard: if we somehow got here and SFT is now running, don't double-launch.
if sft_running; then
  log "SFT is already running — skipping launch"
  exit 0
fi

log "launching SFT (log -> $LOG_FILE)"

# Each setting keeps a value already present in the environment and only
# falls back to the default shown here.
export LD_LIBRARY_PATH="$RUN_LD_PATH"
export HYDRA_SFT_TIME_BUDGET="${HYDRA_SFT_TIME_BUDGET:-10800}"
export HYDRA_BATCH_SIZE="${HYDRA_BATCH_SIZE:-4}"
export HYDRA_TOTAL_BATCH="${HYDRA_TOTAL_BATCH:-8192}"
export HYDRA_SFT_SEQ_LEN="${HYDRA_SFT_SEQ_LEN:-512}"
export HYDRA_SFT_LR_MULT="${HYDRA_SFT_LR_MULT:-0.10}"
export HYDRA_SFT_EVAL_INTERVAL="${HYDRA_SFT_EVAL_INTERVAL:-500}"
export HYDRA_SFT_CKPT_INTERVAL="${HYDRA_SFT_CKPT_INTERVAL:-2000}"
export HYDRA_DROPOUT="${HYDRA_DROPOUT:-0.1}"

nohup "$PY" -u "$REPO_ROOT/scripts/sft.py" \
  > "$LOG_FILE" 2>&1 &
SFT_PID=$!

# Give the process a moment, then confirm it is still alive so an immediate
# crash is reported here instead of being logged as a successful launch.
# This runs before `disown` so that `wait` can still collect the exit status.
sleep "$LAUNCH_CHECK_WAIT"
if ! kill -0 "$SFT_PID" 2>/dev/null; then
  wait "$SFT_PID"
  SFT_RC=$?
  if (( SFT_RC != 0 )); then
    log "ERROR: SFT exited right after launch (rc=$SFT_RC)"
    log " last 20 lines of $LOG_FILE:"
    tail -20 "$LOG_FILE" 2>/dev/null | sed 's/^/ /'
    exit 3
  fi
  log "SFT (PID $SFT_PID) already exited with rc=0"
  exit 0
fi
disown "$SFT_PID" 2>/dev/null || true

log "SFT launched as PID $SFT_PID (budget=${HYDRA_SFT_TIME_BUDGET}s)"
log "monitor with: tail -f $LOG_FILE"