Spaces:

Jackoatmon
/

feather-runtime

Runtime error

App Files Files Community

feather-runtime / overlay /scripts /sft_orchestrator.sh

Jackoatmon

Update Feather H200 runtime: Nemotron streaming and HTM force-CPU canary fixes

c2bf4b6 verified 29 days ago

raw

history blame contribute delete

5.81 kB

	#!/usr/bin/env bash
	#
	# SFT orchestrator: waits for pretrain (train.py) to either complete or
	# reach the 8h budget, then kicks off SFT.
	#
	# Behavior:
	# - Polls for `train.py` process every 60 s
	# - Exits the wait loop on either:
	# (a) no train.py process found (pretrain completed naturally), or
	# (b) 8h elapsed since this script started
	# - Sends SIGTERM first (graceful — triggers checkpoint-save patch if
	# applied), waits 30 s, then SIGKILL as fallback
	# - Invokes `scripts/download_sft_data.py` if shards don't exist
	# - Launches `scripts/sft.py` in the background with tuned env vars
	# - Redirects all output to `run_sft.log`
	#
	# Re-entrant: safe to invoke even if pretrain has already exited.
	# Does NOT re-launch if SFT is already running.
	#
	# Usage (typical):
	# cd /home/mikeb/work/feather
	# nohup bash scripts/sft_orchestrator.sh > orchestrator.log 2>&1 &
	# disown

	set -u # error on unset vars, but don't -e (we handle failures explicitly)

	# Resolve the repository root as the parent of the directory that holds this
	# script, so the orchestrator behaves the same from any working directory.
	REPO_ROOT="$(cd "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)"
	# An empty value means the cd inside the command substitution failed. Reject
	# it here; otherwise it would only surface as a confusing downstream error.
	if [[ -z "$REPO_ROOT" ]]; then
	  echo "cannot resolve repo root from ${BASH_SOURCE[0]}" >&2
	  exit 1
	fi
	cd "$REPO_ROOT" || { echo "cannot cd to $REPO_ROOT" >&2; exit 1; }

	# Python interpreter of the repo-local virtualenv; every Python entry point
	# in this script is run through it.
	PY="$REPO_ROOT/.venv/bin/python"
	if [[ ! -x "$PY" ]]; then
	  echo "[orchestrator] ERROR: python not found at $PY" >&2
	  exit 1
	fi

	LOG_FILE="$REPO_ROOT/run_sft.log"           # stdout+stderr of scripts/sft.py
	DATA_LOG="$REPO_ROOT/run_sft_download.log"  # stdout+stderr of the data download
	MAX_WAIT_SECONDS=28800 # 8 hours
	POLL_INTERVAL=60          # seconds between checks for a running train.py
	GRACEFUL_SHUTDOWN_WAIT=30 # seconds to wait after SIGTERM before SIGKILL

	# Print one orchestrator log line on stdout: a fixed "[orchestrator <UTC
	# ISO-8601 timestamp>]" prefix followed by all arguments joined by spaces.
	log() {
	  local stamp
	  stamp="$(date -u '+%Y-%m-%dT%H:%M:%SZ')"
	  printf '[orchestrator %s] %s\n' "$stamp" "$*"
	}

	# ---------------------------------------------------------------------------
	# Stage 1: wait for pretrain
	# ---------------------------------------------------------------------------

	log "starting; max wait = ${MAX_WAIT_SECONDS}s"

	# Guard against double-launch. pgrep -f matches a regex against the full
	# command line, so the dot is escaped to match the literal file name only.
	if pgrep -f 'scripts/sft\.py' > /dev/null; then
	  log "SFT is already running — exiting orchestrator to avoid conflict"
	  exit 0
	fi

	T_START=$(date +%s)
	POLL_COUNT=0
	while true; do
	  NOW=$(date +%s)
	  ELAPSED=$((NOW - T_START))

	  if (( ELAPSED >= MAX_WAIT_SECONDS )); then
	    log "reached 8h wait cap (${ELAPSED}s) — will kill pretrain"
	    break
	  fi

	  # Collect train.py processes owned by the current user into a
	  # space-separated list, skipping this script's own PID in case the pattern
	  # ever matches it. The user is taken from `id -u` rather than $USER, which
	  # can be unset (cron, containers) and would abort the script under `set -u`.
	  PRETRAIN_PIDS=""
	  while IFS= read -r pid; do
	    if [[ -n "$pid" && "$pid" != "$$" ]]; then
	      PRETRAIN_PIDS+="${PRETRAIN_PIDS:+ }$pid"
	    fi
	  done < <(pgrep -u "$(id -u)" -f 'train\.py' 2>/dev/null)

	  if [[ -z "$PRETRAIN_PIDS" ]]; then
	    log "no train.py process found — pretrain already exited"
	    break
	  fi

	  # Log a status every 10 polls (~10 min). An explicit poll counter is used
	  # because ELAPSED drifts past exact multiples of POLL_INTERVAL over time.
	  if (( POLL_COUNT % 10 == 0 )); then
	    log "waiting... elapsed=${ELAPSED}s pretrain PIDs: $PRETRAIN_PIDS"
	  fi
	  POLL_COUNT=$((POLL_COUNT + 1))

	  sleep "$POLL_INTERVAL"
	done

	# ---------------------------------------------------------------------------
	# Stage 2: kill any remaining pretrain processes
	# ---------------------------------------------------------------------------

	# Print the PIDs of the current user's processes whose command line matches
	# train.py, each followed by a space. This script's own PID is skipped so
	# the orchestrator can never signal itself. The user comes from `id -u`
	# because $USER may be unset, which would abort the script under `set -u`.
	list_pretrain_pids() {
	  local pid
	  while IFS= read -r pid; do
	    if [[ -n "$pid" && "$pid" != "$$" ]]; then
	      printf '%s ' "$pid"
	    fi
	  done < <(pgrep -u "$(id -u)" -f 'train\.py' 2>/dev/null)
	}

	PRETRAIN_PIDS="$(list_pretrain_pids)"
	if [[ -n "$PRETRAIN_PIDS" ]]; then
	  log "sending SIGTERM to pretrain PIDs: $PRETRAIN_PIDS"
	  # Unquoted on purpose: the list is split into one PID per iteration.
	  for pid in $PRETRAIN_PIDS; do
	    # Best effort: the process may already have exited on its own.
	    kill -TERM "$pid" 2>/dev/null || true
	  done

	  # Wait for graceful shutdown (gives the checkpoint-save patch time to run),
	  # checking once per second and leaving early when nothing is left.
	  for (( waited = 0; waited < GRACEFUL_SHUTDOWN_WAIT; waited++ )); do
	    REMAINING="$(list_pretrain_pids)"
	    if [[ -z "$REMAINING" ]]; then break; fi
	    sleep 1
	  done

	  # Force-kill any stragglers
	  REMAINING="$(list_pretrain_pids)"
	  if [[ -n "$REMAINING" ]]; then
	    log "force-killing stragglers: $REMAINING"
	    for pid in $REMAINING; do
	      kill -KILL "$pid" 2>/dev/null || true
	    done
	    # Short pause after SIGKILL before the next stage starts.
	    sleep 5
	  fi
	fi

	# ---------------------------------------------------------------------------
	# Stage 3: ensure SFT data exists
	# ---------------------------------------------------------------------------

	# The presence of meta.json is used as the marker that SFT data is available.
	META_JSON="$REPO_ROOT/data/sft/meta.json"
	if [[ ! -f "$META_JSON" ]]; then
	  log "no SFT data found — running download_sft_data.py"
	  # Prepend the WSL/CUDA library directories for this one command while
	  # keeping any LD_LIBRARY_PATH the caller already set (same form as the
	  # SFT launch), instead of replacing it outright.
	  LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \
	    "$PY" -u "$REPO_ROOT/scripts/download_sft_data.py" \
	    > "$DATA_LOG" 2>&1
	  DL_RC=$?
	  # A zero exit status is not enough: the marker file must exist as well.
	  if (( DL_RC != 0 )) || [[ ! -f "$META_JSON" ]]; then
	    log "ERROR: SFT data download failed (rc=$DL_RC)"
	    log " last 20 lines of $DATA_LOG:"
	    tail -n 20 "$DATA_LOG" 2>/dev/null | sed 's/^/ /'
	    exit 2
	  fi
	  log "SFT data ready"
	else
	  log "SFT data already present at $META_JSON"
	fi

	# ---------------------------------------------------------------------------
	# Stage 4: launch SFT
	# ---------------------------------------------------------------------------

	# Guard: if we somehow got here and SFT is now running, don't double-launch.
	# pgrep -f matches a regex, so the dot is escaped to match the literal name.
	if pgrep -f 'scripts/sft\.py' > /dev/null; then
	  log "SFT is already running — skipping launch"
	  exit 0
	fi

	log "launching SFT (log -> $LOG_FILE)"

	# Each setting keeps a value already present in the environment and only
	# falls back to the default given here.
	export LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"
	export HYDRA_SFT_TIME_BUDGET="${HYDRA_SFT_TIME_BUDGET:-10800}"
	export HYDRA_BATCH_SIZE="${HYDRA_BATCH_SIZE:-4}"
	export HYDRA_TOTAL_BATCH="${HYDRA_TOTAL_BATCH:-8192}"
	export HYDRA_SFT_SEQ_LEN="${HYDRA_SFT_SEQ_LEN:-512}"
	export HYDRA_SFT_LR_MULT="${HYDRA_SFT_LR_MULT:-0.10}"
	export HYDRA_SFT_EVAL_INTERVAL="${HYDRA_SFT_EVAL_INTERVAL:-500}"
	export HYDRA_SFT_CKPT_INTERVAL="${HYDRA_SFT_CKPT_INTERVAL:-2000}"
	export HYDRA_DROPOUT="${HYDRA_DROPOUT:-0.1}"

	# Start SFT detached from the terminal; stdout and stderr both go to
	# LOG_FILE, which is truncated on every launch.
	nohup "$PY" -u "$REPO_ROOT/scripts/sft.py" \
	  > "$LOG_FILE" 2>&1 &
	SFT_PID=$!

	# Give the process a moment, then make sure it did not die during startup
	# (bad import, missing checkpoint, ...) so a failed launch is not reported
	# as a success. The check runs before disown because a disowned job can no
	# longer be waited on for its exit status.
	sleep 2
	if ! kill -0 "$SFT_PID" 2>/dev/null; then
	  wait "$SFT_PID"
	  SFT_RC=$?
	  if (( SFT_RC != 0 )); then
	    log "ERROR: SFT exited during startup (rc=$SFT_RC)"
	    log " last 20 lines of $LOG_FILE:"
	    tail -n 20 "$LOG_FILE" 2>/dev/null | sed 's/^/ /'
	    exit 3
	  fi
	  log "SFT (PID $SFT_PID) already finished with rc=0"
	  exit 0
	fi
	# Best effort: remove the job from the shell's job table.
	disown "$SFT_PID" 2>/dev/null || true

	log "SFT launched as PID $SFT_PID (budget=${HYDRA_SFT_TIME_BUDGET}s)"
	log "monitor with: tail -f $LOG_FILE"