#!/usr/bin/env bash
#
# SFT orchestrator: waits for pretrain (train.py) to either complete or
# reach the 8h budget, then kicks off SFT.
#
# Behavior:
#   - Polls for `train.py` process every 60 s
#   - Exits the wait loop on either:
#       (a) no train.py process found (pretrain completed naturally), or
#       (b) 8h elapsed since this script started
#   - Sends SIGTERM first (graceful — triggers checkpoint-save patch if
#     applied), waits 30 s, then SIGKILL as fallback
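#   - Invokes `scripts/download_sft_data.py` if `data/sft/meta.json` is missing
#   - Launches `scripts/sft.py` in the background with tuned env vars
#   - Redirects SFT output to `run_sft.log` (download output goes to
#     `run_sft_download.log`; the orchestrator's own messages go to stdout)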
#
# Re-entrant: safe to invoke even if pretrain has already exited.
# Does NOT re-launch if SFT is already running.
#
# Usage (typical):
#   cd /home/mikeb/work/feather
#   nohup bash scripts/sft_orchestrator.sh > orchestrator.log 2>&1 &
#   disown

# Treat unset variables as errors. `-e` is intentionally left off: each stage
# inspects the exit codes it cares about itself.
set -u

# Timing knobs, all in seconds.
POLL_INTERVAL=60
GRACEFUL_SHUTDOWN_WAIT=30
MAX_WAIT_SECONDS=28800   # 8 hours

# The repo root is the parent of the directory that holds this script; all
# later paths are built from it, and it becomes the working directory.
REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
if ! cd "$REPO_ROOT"; then
    echo "cannot cd to $REPO_ROOT" >&2
    exit 1
fi

# Interpreter from the repo-local virtualenv; refuse to continue without it.
PY="$REPO_ROOT/.venv/bin/python"
[ -x "$PY" ] || {
    echo "[orchestrator] ERROR: python not found at $PY" >&2
    exit 1
}

# Output files: SFT run log and data-download log.
LOG_FILE="$REPO_ROOT/run_sft.log"
DATA_LOG="$REPO_ROOT/run_sft_download.log"

# Write one log line to stdout: "[orchestrator <UTC timestamp>] <message>".
# Arguments: the message words; they are joined via "$*".
log() {
    local stamp
    stamp=$(date -u '+%Y-%m-%dT%H:%M:%SZ')
    printf '[orchestrator %s] %s\n' "$stamp" "$*"
}

# ---------------------------------------------------------------------------
# Stage 1: wait for pretrain
# ---------------------------------------------------------------------------

log "starting; max wait = ${MAX_WAIT_SECONDS}s"

# Guard against double-launch
if pgrep -f "scripts/sft.py" > /dev/null; then
    log "SFT is already running — exiting orchestrator to avoid conflict"
    exit 0
fi

# Resolve the current user with id(1) rather than $USER: $USER is not
# guaranteed to be set (cron, systemd, containers) and referencing it while
# unset would abort the whole script under `set -u`.
ORCH_UID=$(id -u) || {
    log "ERROR: cannot determine current uid"
    exit 1
}

T_START=$(date +%s)
POLL_COUNT=0
while true; do
    NOW=$(date +%s)
    ELAPSED=$((NOW - T_START))

    if (( ELAPSED >= MAX_WAIT_SECONDS )); then
        log "reached 8h wait cap (${ELAPSED}s) — will kill pretrain"
        break
    fi

    # List processes of the current user whose full command line matches
    # train.py. NOTE(review): this is a substring regex match, so anything
    # with "train.py" on its command line (e.g. an editor) also counts.
    PGREP_OUT=$(pgrep -u "$ORCH_UID" -f "train\.py" 2>/dev/null)
    PGREP_RC=$?
    # pgrep exits 0 on a match and 1 on no match; anything else (including
    # 127 when pgrep is not installed) means the lookup itself failed. Do not
    # mistake that for "pretrain has exited" and start SFT on top of it.
    if (( PGREP_RC > 1 )); then
        log "ERROR: pgrep failed (rc=$PGREP_RC) — cannot tell whether pretrain is running; aborting"
        exit 1
    fi

    # Drop this script's own PID in case the pattern matched it spuriously.
    PRETRAIN_PIDS=""
    for pid in $PGREP_OUT; do
        [ "$pid" = "$$" ] && continue
        PRETRAIN_PIDS="${PRETRAIN_PIDS:+$PRETRAIN_PIDS }$pid"
    done

    if [ -z "$PRETRAIN_PIDS" ]; then
        log "no train.py process found — pretrain already exited"
        break
    fi

    # Log a status every 10 polls (~10 min). Polls are counted explicitly
    # instead of being derived from elapsed time, so timing drift between
    # iterations cannot skip a report.
    if (( POLL_COUNT % 10 == 0 )); then
        log "waiting... elapsed=${ELAPSED}s  pretrain PIDs: $PRETRAIN_PIDS"
    fi
    POLL_COUNT=$((POLL_COUNT + 1))

    sleep "$POLL_INTERVAL"
done

# ---------------------------------------------------------------------------
# Stage 2: kill any remaining pretrain processes
# ---------------------------------------------------------------------------

# Print, on one line and space-separated, the PIDs of the current user's
# processes whose command line matches train.py, excluding this script's own
# PID so the orchestrator can never signal itself.
# The user is resolved with id(1) because $USER may be unset, which would
# abort the script under `set -u`.
# Outputs: PID list on stdout (empty line when nothing matches).
# Returns: 1 if the uid cannot be determined, 0 otherwise.
pretrain_pids() {
    local uid pid pids=""
    uid=$(id -u) || return 1
    while IFS= read -r pid; do
        [ "$pid" = "$$" ] && continue
        pids="${pids:+$pids }$pid"
    done < <(pgrep -u "$uid" -f "train\.py" 2>/dev/null)
    printf '%s\n' "$pids"
}

PRETRAIN_PIDS=$(pretrain_pids)
if [ -n "$PRETRAIN_PIDS" ]; then
    log "sending SIGTERM to pretrain PIDs: $PRETRAIN_PIDS"
    for pid in $PRETRAIN_PIDS; do
        # Best effort: the process may have exited since it was listed.
        kill -TERM "$pid" 2>/dev/null || true
    done

    # Wait for graceful shutdown (gives the checkpoint-save patch time to run):
    # re-check once per second, for at most GRACEFUL_SHUTDOWN_WAIT seconds.
    for (( waited = 0; waited < GRACEFUL_SHUTDOWN_WAIT; waited++ )); do
        [ -z "$(pretrain_pids)" ] && break
        sleep 1
    done

    # Force-kill any stragglers
    REMAINING=$(pretrain_pids)
    if [ -n "$REMAINING" ]; then
        log "force-killing stragglers: $REMAINING"
        for pid in $REMAINING; do
            kill -KILL "$pid" 2>/dev/null || true
        done
        # NOTE(review): presumably lets the killed processes release their
        # resources before the next stage — confirm the intent of this pause.
        sleep 5
    fi
fi

# ---------------------------------------------------------------------------
# Stage 3: ensure SFT data exists
# ---------------------------------------------------------------------------

# The presence of meta.json is the marker that the SFT data is in place.
META_JSON="$REPO_ROOT/data/sft/meta.json"
if [ -f "$META_JSON" ]; then
    log "SFT data already present at $META_JSON"
else
    log "no SFT data found — running download_sft_data.py"
    # Prepend the WSL/CUDA library dirs instead of replacing the caller's
    # LD_LIBRARY_PATH, matching how the SFT launch itself sets it up.
    # The downloader's stdout and stderr both go to $DATA_LOG.
    LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}" \
        "$PY" -u "$REPO_ROOT/scripts/download_sft_data.py" \
        > "$DATA_LOG" 2>&1
    DL_RC=$?
    # A zero exit code is not enough: the marker file must exist as well.
    if [ "$DL_RC" -ne 0 ] || [ ! -f "$META_JSON" ]; then
        log "ERROR: SFT data download failed (rc=$DL_RC)"
        log "  last 20 lines of $DATA_LOG:"
        tail -20 "$DATA_LOG" 2>/dev/null | sed 's/^/    /'
        exit 2
    fi
    log "SFT data ready"
fi

# ---------------------------------------------------------------------------
# Stage 4: launch SFT
# ---------------------------------------------------------------------------

# Last-moment re-check: another SFT may have started while the earlier stages
# were running, in which case there is nothing left to do.
pgrep -f "scripts/sft.py" > /dev/null && {
    log "SFT is already running — skipping launch"
    exit 0
}

log "launching SFT (log -> $LOG_FILE)"

# Put the WSL/CUDA library dirs first, keeping any existing entries after them.
export LD_LIBRARY_PATH="/usr/lib/wsl/lib:/usr/local/cuda/lib64${LD_LIBRARY_PATH:+:$LD_LIBRARY_PATH}"

# NAME=default pairs for the SFT run. A value already present (and non-empty)
# in the environment wins; otherwise the default listed here is exported.
for sft_default in \
    HYDRA_SFT_TIME_BUDGET=10800 \
    HYDRA_BATCH_SIZE=4 \
    HYDRA_TOTAL_BATCH=8192 \
    HYDRA_SFT_SEQ_LEN=512 \
    HYDRA_SFT_LR_MULT=0.10 \
    HYDRA_SFT_EVAL_INTERVAL=500 \
    HYDRA_SFT_CKPT_INTERVAL=2000 \
    HYDRA_DROPOUT=0.1
do
    sft_var=${sft_default%%=*}
    export "$sft_var=${!sft_var:-${sft_default#*=}}"
done
unset sft_default sft_var

# Start sft.py in the background, immune to hangups, with stdout and stderr
# both written to $LOG_FILE, then remove it from this shell's job table.
nohup "$PY" -u "$REPO_ROOT/scripts/sft.py" > "$LOG_FILE" 2>&1 &
SFT_PID=$!
disown "$SFT_PID" 2>/dev/null || true

log "SFT launched as PID $SFT_PID (budget=${HYDRA_SFT_TIME_BUDGET}s)"
log "monitor with: tail -f $LOG_FILE"