Spaces:
Runtime error
Runtime error
| # | |
| # SFT orchestrator: waits for pretrain (train.py) to either complete or | |
| # reach the 8h budget, then kicks off SFT. | |
| # | |
| # Behavior: | |
| # - Polls for `train.py` process every 60 s | |
| # - Exits the wait loop on either: | |
| # (a) no train.py process found (pretrain completed naturally), or | |
| # (b) 8h elapsed since this script started | |
| # - Sends SIGTERM first (graceful — triggers checkpoint-save patch if | |
| # applied), waits 30 s, then SIGKILL as fallback | |
| # - Invokes `scripts/download_sft_data.py` if shards don't exist | |
| # - Launches `scripts/sft.py` in the background with tuned env vars | |
| # - Redirects all output to `run_sft.log` | |
| # | |
| # Re-entrant: safe to invoke even if pretrain has already exited. | |
| # Does NOT re-launch if SFT is already running. | |
| # | |
| # Usage (typical): | |
| # cd /home/mikeb/work/feather | |
| # nohup bash scripts/sft_orchestrator.sh > orchestrator.log 2>&1 & | |
| # disown | |
# Abort on use of an unset variable. -e is intentionally NOT enabled: every
# failure in this script is detected and reported explicitly.
set -u

# Repository root = parent of the directory that contains this script.
# Check the substitution itself: if it fails REPO_ROOT would be empty, and a
# later `cd ""` is not guaranteed to fail.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)" || {
  echo "cannot resolve repo root from ${BASH_SOURCE[0]}" >&2
  exit 1
}
cd "$REPO_ROOT" || { echo "cannot cd to $REPO_ROOT" >&2; exit 1; }

# Python interpreter from the repo-local virtualenv; it runs both the data
# download and the SFT job.
PY="$REPO_ROOT/.venv/bin/python"
if [ ! -x "$PY" ]; then
  echo "[orchestrator] ERROR: python not found at $PY" >&2
  exit 1
fi

LOG_FILE="$REPO_ROOT/run_sft.log"            # stdout+stderr of scripts/sft.py
DATA_LOG="$REPO_ROOT/run_sft_download.log"   # stdout+stderr of the data download

MAX_WAIT_SECONDS=28800     # 8 hours: cap on how long to wait for pretrain
POLL_INTERVAL=60           # seconds between checks for a train.py process
GRACEFUL_SHUTDOWN_WAIT=30  # seconds allowed between SIGTERM and SIGKILL
#######################################
# Print one orchestrator log line, prefixed with a UTC ISO-8601 timestamp.
# Arguments: message words; joined with single spaces ("$*").
# Outputs:   "[orchestrator <timestamp>] <message>" on stdout.
#######################################
log() {
  # printf rather than echo so the message text is emitted verbatim
  # regardless of the shell's echo settings (e.g. xpg_echo).
  printf '[orchestrator %s] %s\n' "$(date -u '+%Y-%m-%dT%H:%M:%SZ')" "$*"
}
# ---------------------------------------------------------------------------
# Stage 1: wait for pretrain
#
# Poll every POLL_INTERVAL seconds until either no train.py process owned by
# the current user is left, or MAX_WAIT_SECONDS have elapsed since start.
# ---------------------------------------------------------------------------
log "starting; max wait = ${MAX_WAIT_SECONDS}s"

# Guard against double-launch: if an SFT run is already up, do nothing.
# The dot is escaped so it matches a literal '.', not any character.
if pgrep -f 'scripts/sft\.py' > /dev/null; then
  log "SFT is already running — exiting orchestrator to avoid conflict"
  exit 0
fi

T_START=$(date +%s)
POLL_COUNT=0
while true; do
  NOW=$(date +%s)
  ELAPSED=$((NOW - T_START))
  if (( ELAPSED >= MAX_WAIT_SECONDS )); then
    log "reached 8h wait cap (${ELAPSED}s) — will kill pretrain"
    break
  fi

  # train.py processes owned by the current user. `id -u` is used instead of
  # $USER because $USER may be unset (cron, containers), which would abort the
  # script under `set -u`. `grep -vx` drops this script's own PID in case the
  # pattern matched it spuriously; xargs joins the PIDs with single spaces.
  PRETRAIN_PIDS=$(pgrep -u "$(id -u)" -f 'train\.py' 2>/dev/null \
    | grep -vx -- "$$" \
    | xargs)
  if [[ -z "$PRETRAIN_PIDS" ]]; then
    log "no train.py process found — pretrain already exited"
    break
  fi

  # Log a status line on the first poll and every 10th poll after that
  # (~10 min). Polls are counted explicitly: deriving the cadence from
  # ELAPSED drifts with loop overhead and can skip a report.
  if (( POLL_COUNT % 10 == 0 )); then
    log "waiting... elapsed=${ELAPSED}s pretrain PIDs: $PRETRAIN_PIDS"
  fi
  POLL_COUNT=$((POLL_COUNT + 1))

  sleep "$POLL_INTERVAL"
done
# ---------------------------------------------------------------------------
# Stage 2: kill any remaining pretrain processes
#
# SIGTERM first, wait up to GRACEFUL_SHUTDOWN_WAIT seconds, then SIGKILL
# whatever is still alive.
# ---------------------------------------------------------------------------

# Print the PIDs (space-separated) of train.py processes owned by the current
# user, never including this script's own PID. Prints nothing if none exist.
# `id -u` replaces $USER, which may be unset and would trip `set -u`.
pretrain_pids() {
  pgrep -u "$(id -u)" -f 'train\.py' 2>/dev/null \
    | grep -vx -- "$$" \
    | xargs
}

PRETRAIN_PIDS=$(pretrain_pids)
if [[ -n "$PRETRAIN_PIDS" ]]; then
  log "sending SIGTERM to pretrain PIDs: $PRETRAIN_PIDS"
  # Word splitting on the PID list is intentional.
  for pid in $PRETRAIN_PIDS; do
    # A process may exit between listing and signalling; that is not an error.
    kill -TERM "$pid" 2>/dev/null || true
  done

  # Wait for graceful shutdown (gives the checkpoint-save patch time to run),
  # checking once per second and stopping early when nothing is left.
  for (( waited = 0; waited < GRACEFUL_SHUTDOWN_WAIT; waited++ )); do
    REMAINING=$(pretrain_pids)
    if [[ -z "$REMAINING" ]]; then
      break
    fi
    sleep 1
  done

  # Force-kill any stragglers.
  REMAINING=$(pretrain_pids)
  if [[ -n "$REMAINING" ]]; then
    log "force-killing stragglers: $REMAINING"
    for pid in $REMAINING; do
      kill -9 "$pid" 2>/dev/null || true
    done
    sleep 5
  fi
fi
# ---------------------------------------------------------------------------
# Stage 3: ensure SFT data exists
#
# data/sft/meta.json is used as the marker that SFT data is present. If it is
# missing, run the download script; exit 2 if that fails or the marker is
# still missing afterwards.
# ---------------------------------------------------------------------------
META_JSON="$REPO_ROOT/data/sft/meta.json"
if [[ ! -f "$META_JSON" ]]; then
  log "no SFT data found — running download_sft_data.py"
  # Prepend the WSL/CUDA library directories while keeping any existing
  # LD_LIBRARY_PATH, the same way the SFT launch does. The ${VAR:+...} form
  # is safe under `set -u` when the variable is unset.
  LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \
    "$PY" -u "$REPO_ROOT/scripts/download_sft_data.py" \
    > "$DATA_LOG" 2>&1
  DL_RC=$?
  # A zero exit status alone is not trusted: the marker file must exist too.
  if (( DL_RC != 0 )) || [[ ! -f "$META_JSON" ]]; then
    log "ERROR: SFT data download failed (rc=$DL_RC)"
    log " last 20 lines of $DATA_LOG:"
    tail -n 20 "$DATA_LOG" 2>/dev/null | sed 's/^/ /'
    exit 2
  fi
  log "SFT data ready"
else
  log "SFT data already present at $META_JSON"
fi
# ---------------------------------------------------------------------------
# Stage 4: launch SFT
#
# Starts scripts/sft.py in the background, detached from this shell, with all
# of its output redirected to LOG_FILE (the file is truncated on launch).
# ---------------------------------------------------------------------------
# Guard: if we somehow got here and SFT is now running, don't double-launch.
# The dot is escaped so it matches a literal '.', not any character.
if pgrep -f 'scripts/sft\.py' > /dev/null; then
  log "SFT is already running — skipping launch"
  exit 0
fi

log "launching SFT (log -> $LOG_FILE)"

# Prepend the WSL/CUDA library directories, keeping any existing value.
export LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# SFT settings. Each default applies only when the variable is unset or
# empty, so values from the caller's environment take precedence.
export HYDRA_SFT_TIME_BUDGET="${HYDRA_SFT_TIME_BUDGET:-10800}"
export HYDRA_BATCH_SIZE="${HYDRA_BATCH_SIZE:-4}"
export HYDRA_TOTAL_BATCH="${HYDRA_TOTAL_BATCH:-8192}"
export HYDRA_SFT_SEQ_LEN="${HYDRA_SFT_SEQ_LEN:-512}"
export HYDRA_SFT_LR_MULT="${HYDRA_SFT_LR_MULT:-0.10}"
export HYDRA_SFT_EVAL_INTERVAL="${HYDRA_SFT_EVAL_INTERVAL:-500}"
export HYDRA_SFT_CKPT_INTERVAL="${HYDRA_SFT_CKPT_INTERVAL:-2000}"
export HYDRA_DROPOUT="${HYDRA_DROPOUT:-0.1}"

nohup "$PY" -u "$REPO_ROOT/scripts/sft.py" \
  > "$LOG_FILE" 2>&1 &
SFT_PID=$!

# Give the process a moment, then confirm it is still alive, so an immediate
# crash is reported instead of a misleading "launched" message. This runs
# before `disown` so that `wait` can still collect the exit status.
SFT_STARTUP_GRACE=5
sleep "$SFT_STARTUP_GRACE"
if ! kill -0 "$SFT_PID" 2>/dev/null; then
  wait "$SFT_PID"
  SFT_RC=$?
  log "ERROR: SFT exited within ${SFT_STARTUP_GRACE}s of launch (rc=$SFT_RC)"
  log " last 20 lines of $LOG_FILE:"
  tail -n 20 "$LOG_FILE" 2>/dev/null | sed 's/^/ /'
  exit 3
fi

# Remove the job from this shell's job table so it outlives the orchestrator.
disown "$SFT_PID" 2>/dev/null || true
log "SFT launched as PID $SFT_PID (budget=${HYDRA_SFT_TIME_BUDGET}s)"
log "monitor with: tail -f $LOG_FILE"