feather-runtime / overlay /scripts /sft_orchestrator.sh
Jackoatmon's picture
Update Feather H200 runtime: Nemotron streaming and HTM force-CPU canary fixes
c2bf4b6 verified
#!/usr/bin/env bash
#
# SFT orchestrator: waits for pretrain (train.py) to either complete or
# reach the 8h budget, then kicks off SFT.
#
# Behavior:
# - Polls for `train.py` process every 60 s
# - Exits the wait loop on either:
# (a) no train.py process found (pretrain completed naturally), or
# (b) 8h elapsed since this script started
# - Sends SIGTERM first (graceful — triggers checkpoint-save patch if
# applied), waits 30 s, then SIGKILL as fallback
# - Invokes `scripts/download_sft_data.py` if shards don't exist
# - Launches `scripts/sft.py` in the background with tuned env vars
# - Redirects all output to `run_sft.log`
#
# Re-entrant: safe to invoke even if pretrain has already exited.
# Does NOT re-launch if SFT is already running.
#
# Usage (typical):
# cd /home/mikeb/work/feather
# nohup bash scripts/sft_orchestrator.sh > orchestrator.log 2>&1 &
# disown
# Abort on unset variables. `-e` is deliberately NOT enabled: every step
# below checks its own failure conditions explicitly.
set -u

# Resolve the repository root as the parent of the directory holding this
# script. CDPATH is cleared and cd's output discarded so that a CDPATH hit
# (possible when the script is invoked via a relative path) cannot echo an
# extra directory name into the captured value.
REPO_ROOT="$(CDPATH='' cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." > /dev/null 2>&1 && pwd)"
if [ -z "$REPO_ROOT" ]; then
  echo "[orchestrator] ERROR: cannot resolve repo root from ${BASH_SOURCE[0]}" >&2
  exit 1
fi
cd "$REPO_ROOT" || { echo "cannot cd to $REPO_ROOT" >&2; exit 1; }

# Python interpreter from the repo-local virtualenv; required by later stages.
PY="$REPO_ROOT/.venv/bin/python"
if [ ! -x "$PY" ]; then
  echo "[orchestrator] ERROR: python not found at $PY" >&2
  exit 1
fi

# Output locations.
LOG_FILE="$REPO_ROOT/run_sft.log"               # SFT stdout/stderr
DATA_LOG="$REPO_ROOT/run_sft_download.log"      # download_sft_data.py output

# Timing knobs (all in seconds).
MAX_WAIT_SECONDS=28800      # 8 hours: cap on how long to wait for pretrain
POLL_INTERVAL=60            # sleep between checks for a train.py process
GRACEFUL_SHUTDOWN_WAIT=30   # time allowed after SIGTERM before SIGKILL
# Write one line to stdout of the form
#   [orchestrator <UTC timestamp, ISO-8601 with trailing Z>] <message>
# where <message> is all arguments expanded via "$*".
log() {
  local stamp
  stamp=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
  printf '[orchestrator %s] %s\n' "$stamp" "$*"
}
# ---------------------------------------------------------------------------
# Stage 1: wait for pretrain
#
# Polls every POLL_INTERVAL seconds until either no train.py process owned by
# the current user remains, or MAX_WAIT_SECONDS have elapsed since this stage
# started. Exits 0 early if SFT is already running, and exits 3 if the
# process table cannot be inspected (so SFT is never started "blind").
# ---------------------------------------------------------------------------
log "starting; max wait = ${MAX_WAIT_SECONDS}s"

# Guard against double-launch. The dot is escaped so only the literal
# "scripts/sft.py" matches.
if pgrep -f 'scripts/sft\.py' > /dev/null; then
  log "SFT is already running — exiting orchestrator to avoid conflict"
  exit 0
fi

# Use the numeric uid rather than $USER: USER may be unset (containers,
# cron), which would abort the script under `set -u`.
WAIT_UID=$(id -u) || { log "ERROR: cannot determine current uid"; exit 3; }

T_START=$(date +%s)
while true; do
  NOW=$(date +%s)
  ELAPSED=$((NOW - T_START))
  if [ "$ELAPSED" -ge "$MAX_WAIT_SECONDS" ]; then
    log "reached 8h wait cap (${ELAPSED}s) — will kill pretrain"
    break
  fi

  # train.py processes owned by the current user. pgrep's own exit status is
  # captured (not a pipeline's) so "no match" (1) can be told apart from a
  # real failure (>1), which must not be mistaken for "pretrain exited".
  PRETRAIN_PIDS=$(pgrep -u "$WAIT_UID" -f 'train\.py')
  PGREP_RC=$?
  if [ "$PGREP_RC" -gt 1 ]; then
    log "ERROR: pgrep failed (rc=$PGREP_RC); cannot tell whether pretrain is running"
    exit 3
  fi

  # Strip the pid of this script in case pgrep matched something spurious,
  # and flatten the list onto one space-separated line.
  PRETRAIN_PIDS=$(printf '%s\n' "$PRETRAIN_PIDS" | sed "s/\b$$\b//g" | xargs)
  if [ -z "$PRETRAIN_PIDS" ]; then
    log "no train.py process found — pretrain already exited"
    break
  fi

  # Log a status every 10 polls (~10 min)
  if [ $((ELAPSED / POLL_INTERVAL % 10)) -eq 0 ]; then
    log "waiting... elapsed=${ELAPSED}s pretrain PIDs: $PRETRAIN_PIDS"
  fi
  sleep "$POLL_INTERVAL"
done
# ---------------------------------------------------------------------------
# Stage 2: kill any remaining pretrain processes
#
# Sends SIGTERM to every train.py process owned by the current user, waits up
# to GRACEFUL_SHUTDOWN_WAIT seconds for them to exit, then SIGKILLs whatever
# is left. Does nothing if no such process exists.
# ---------------------------------------------------------------------------

# Print the PIDs of the current user's processes whose command line matches
# train.py, space-separated on one line (empty output if there are none).
# Uses the numeric uid instead of $USER, which may be unset and would abort
# the script under `set -u`.
pretrain_pids() {
  pgrep -u "$(id -u)" -f 'train\.py' 2>/dev/null | xargs
}

PRETRAIN_PIDS=$(pretrain_pids)
if [ -n "$PRETRAIN_PIDS" ]; then
  log "sending SIGTERM to pretrain PIDs: $PRETRAIN_PIDS"
  for pid in $PRETRAIN_PIDS; do
    # A process may already be gone by the time we signal it; that is fine.
    kill -TERM "$pid" 2>/dev/null || true
  done

  # Wait for graceful shutdown (gives the checkpoint-save patch time to run),
  # checking once per second and leaving early once everything has exited.
  for ((waited = 0; waited < GRACEFUL_SHUTDOWN_WAIT; waited++)); do
    REMAINING=$(pretrain_pids)
    if [ -z "$REMAINING" ]; then break; fi
    sleep 1
  done

  # Force-kill any stragglers
  REMAINING=$(pretrain_pids)
  if [ -n "$REMAINING" ]; then
    log "force-killing stragglers: $REMAINING"
    for pid in $REMAINING; do
      kill -9 "$pid" 2>/dev/null || true
    done
    sleep 5
    # Surface the case where even SIGKILL did not clear the processes.
    REMAINING=$(pretrain_pids)
    if [ -n "$REMAINING" ]; then
      log "WARNING: pretrain PIDs still present after SIGKILL: $REMAINING"
    fi
  fi
fi
# ---------------------------------------------------------------------------
# Stage 3: ensure SFT data exists
#
# The presence of data/sft/meta.json is the marker for "data is ready". If it
# is missing, scripts/download_sft_data.py is run in the foreground with its
# output captured in $DATA_LOG. Exits 2 if the download fails or the marker
# is still missing afterwards.
# ---------------------------------------------------------------------------
META_JSON="$REPO_ROOT/data/sft/meta.json"
if [ -f "$META_JSON" ]; then
  log "SFT data already present at $META_JSON"
else
  log "no SFT data found — running download_sft_data.py"
  # Prepend the WSL/CUDA library dirs while keeping any inherited
  # LD_LIBRARY_PATH, the same way the SFT launch stage builds it.
  LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \
    "$PY" -u "$REPO_ROOT/scripts/download_sft_data.py" \
    > "$DATA_LOG" 2>&1
  DL_RC=$?
  # A zero exit code alone is not trusted: the marker file must exist too.
  if [ "$DL_RC" -ne 0 ] || [ ! -f "$META_JSON" ]; then
    log "ERROR: SFT data download failed (rc=$DL_RC)"
    log " last 20 lines of $DATA_LOG:"
    tail -n 20 "$DATA_LOG" 2>/dev/null | sed 's/^/ /'
    exit 2
  fi
  log "SFT data ready"
fi
# ---------------------------------------------------------------------------
# Stage 4: launch SFT
#
# Starts scripts/sft.py in the background under nohup, with stdout/stderr
# redirected to $LOG_FILE (the file is truncated on launch). Exits 0 without
# launching if an sft.py process already exists, and exits 3 if the new
# process is gone again shortly after launch.
# ---------------------------------------------------------------------------
# Guard: if we somehow got here and SFT is now running, don't double-launch.
# The dot is escaped so only the literal "scripts/sft.py" matches.
if pgrep -f 'scripts/sft\.py' > /dev/null; then
  log "SFT is already running — skipping launch"
  exit 0
fi

log "launching SFT (log -> $LOG_FILE)"

# Prepend the WSL/CUDA library dirs, keeping any inherited LD_LIBRARY_PATH.
export LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# Tunables passed to sft.py through the environment. A value already set by
# the caller wins; otherwise the default shown here is exported.
export HYDRA_SFT_TIME_BUDGET="${HYDRA_SFT_TIME_BUDGET:-10800}"
export HYDRA_BATCH_SIZE="${HYDRA_BATCH_SIZE:-4}"
export HYDRA_TOTAL_BATCH="${HYDRA_TOTAL_BATCH:-8192}"
export HYDRA_SFT_SEQ_LEN="${HYDRA_SFT_SEQ_LEN:-512}"
export HYDRA_SFT_LR_MULT="${HYDRA_SFT_LR_MULT:-0.10}"
export HYDRA_SFT_EVAL_INTERVAL="${HYDRA_SFT_EVAL_INTERVAL:-500}"
export HYDRA_SFT_CKPT_INTERVAL="${HYDRA_SFT_CKPT_INTERVAL:-2000}"
export HYDRA_DROPOUT="${HYDRA_DROPOUT:-0.1}"

nohup "$PY" -u "$REPO_ROOT/scripts/sft.py" \
  > "$LOG_FILE" 2>&1 &
SFT_PID=$!

# Brief liveness check so an immediate crash is reported as a failure rather
# than as a successful launch.
sleep 2
if ! kill -0 "$SFT_PID" 2>/dev/null; then
  log "ERROR: SFT (PID $SFT_PID) exited right after launch"
  log " last 20 lines of $LOG_FILE:"
  tail -n 20 "$LOG_FILE" 2>/dev/null | sed 's/^/ /'
  exit 3
fi

# Detach the job from this shell's job table; failure to do so is harmless.
disown "$SFT_PID" 2>/dev/null || true
log "SFT launched as PID $SFT_PID (budget=${HYDRA_SFT_TIME_BUDGET}s)"
log "monitor with: tail -f $LOG_FILE"