#!/usr/bin/env bash
#
# Training watchdog for the korean_3b_fp8_run1 run.
# Designed to be invoked periodically from cron: each invocation performs a
# one-shot health pass (process alive, log freshness, loss sanity,
# throughput, GPU utilization, disk space) and pushes a Telegram alert via
# scripts/telegram_notify.py for each failing check.
#
# -e: abort on unhandled command failure; -u: unset variables are errors;
# -o pipefail: a pipeline fails if any stage fails.
set -euo pipefail
| |
|
| | |
# --- Paths -------------------------------------------------------------
# WORKDIR may be overridden from the environment; defaults to the parent
# of this script's directory.
WORKDIR="${WORKDIR:-$(cd "$(dirname "$0")/.." && pwd)}"
CKPT_DIR="$WORKDIR/checkpoints/korean_3b_fp8_run1"    # run output directory
LOG_FILE="$CKPT_DIR/train.log"                        # training log parsed by the checks
PID_FILE="$CKPT_DIR/train.pid"                        # expected to hold the trainer PID
WATCHDOG_LOG="$CKPT_DIR/watchdog.log"                 # NOTE(review): not referenced below — presumably the cron redirect target; confirm
STATE_FILE="$CKPT_DIR/watchdog.state"                 # NOTE(review): not referenced in this file — confirm still needed
NOTIFY="python3 $WORKDIR/scripts/telegram_notify.py"  # expanded unquoted on purpose so it word-splits into command + argument
| |
|
| | |
# --- Alert thresholds --------------------------------------------------
LOSS_SPIKE_THRESHOLD="5.0"          # alert when loss >= this after warm-up (step > 500)
LOSS_NAN_PATTERN="nan|inf|NaN|Inf"  # regex marking a diverged loss value
STALL_SECONDS=900                   # alert when the log is untouched this long (15 min)
DISK_WARN_PCT=85                    # alert when checkpoint-partition usage >= this percent
GPU_UTIL_WARN_PCT=20                # alert when average GPU utilization falls below this
MIN_TOKPS=5000                      # alert when throughput drops below this (after step 100)
TOTAL_STEPS=57000                   # planned final step; distinguishes completion from crash
WAIT_COUNT_FILE="/tmp/frankenstallm-wait-count"  # consecutive checks spent waiting for training to start
                                                 # NOTE(review): predictable /tmp name — fine single-user; confirm
MAX_WAIT_COUNT=10                   # give up (and remove own cron entry) after this many waits
| |
|
| | |
# Current local time as "YYYY-MM-DD HH:MM:SS" (bash builtin, no fork).
ts() { printf '%(%Y-%m-%d %H:%M:%S)T\n' -1; }
| |
|
# Emit one timestamped line on stdout (presumably captured by the cron
# job's redirect — confirm against the crontab entry).
log_msg() {
  printf '[%s] %s\n' "$(ts)" "$*"
}
| |
|
# Log an alert locally, then best-effort push it to Telegram.
# $1 = severity label, $2 = HTML-formatted message body.
# Notification failures are swallowed (|| true) so a dead Telegram bot
# never aborts the watchdog pass.
send_alert() {
  local severity=$1
  local body=$2
  log_msg "ALERT[$severity]: $body"
  # NOTIFY stays unquoted by design: it is "python3 <script>" and must
  # word-split into command + argument.
  $NOTIFY "<b>[FRANKENSTALLM ALERT] $severity</b>

$body

<i>$(ts) | watchdog check</i>" || true
}
| |
|
| | |
# Verify the training process is alive.
# Returns 0 when the PID from $PID_FILE is running; 1 otherwise (alerting
# as appropriate). Side effect: after MAX_WAIT_COUNT checks with no PID
# file, it removes its own crontab entry.
check_process() {
  if [[ ! -f "$PID_FILE" ]]; then
    # No PID file yet — the run may simply not have started. Track how
    # many consecutive checks we have been waiting.
    local wait_count=0
    [[ -f "$WAIT_COUNT_FILE" ]] && wait_count=$(cat "$WAIT_COUNT_FILE" 2>/dev/null || echo 0)
    wait_count=$(( wait_count + 1 ))
    echo "$wait_count" > "$WAIT_COUNT_FILE"
    log_msg "Training not started yet (waiting ${wait_count}/${MAX_WAIT_COUNT})."

    if (( wait_count > MAX_WAIT_COUNT )); then
      # Korean alert: training never started; the watchdog de-registers
      # itself below. The "~waits*10 min" text assumes a 10-minute cron
      # cadence — TODO confirm against the actual crontab schedule.
      send_alert "WAIT_TIMEOUT" "νμ΅μ΄ <b>${wait_count}ν</b> μ²΄ν¬ λμ μμλμ§ μμμ΅λλ€ (~$((wait_count * 10))λΆ).

PID νμΌ μμ: <code>$PID_FILE</code>

Watchdog cronμ μλ ν΄μ ν©λλ€. νμ΅ μμ ν μ§μ μ¬λ±λ‘νμΈμ:
<code>crontab -e</code>"
      # Self-removal: drop every crontab line mentioning "training_watchdog".
      crontab -l 2>/dev/null | grep -v "training_watchdog" | crontab -
      rm -f "$WAIT_COUNT_FILE"
      log_msg "Watchdog cron entry removed after ${wait_count} waits."
    fi
    return 1
  fi

  # PID file exists: training has started — reset the waiting counter.
  rm -f "$WAIT_COUNT_FILE"

  local pid
  pid=$(cat "$PID_FILE" 2>/dev/null | tr -d '[:space:]')

  if [[ -z "$pid" ]]; then
    send_alert "PROCESS" "PID file is empty: $PID_FILE"
    return 1
  fi

  if ! kill -0 "$pid" 2>/dev/null; then
    # Process is gone: decide completion vs. crash from the last step
    # number seen in the log.
    # NOTE(review): string equality — if the final logged step ever
    # exceeded TOTAL_STEPS this would be reported as a crash; confirm.
    local last_step
    last_step=$(grep -oP 'step\s+\K[0-9]+' "$LOG_FILE" 2>/dev/null | tail -1)
    if [[ "$last_step" == "$TOTAL_STEPS" ]]; then
      log_msg "Training COMPLETED at step $TOTAL_STEPS β process exit is expected."
      send_alert "COMPLETE" "Training completed normally at step <code>$TOTAL_STEPS/$TOTAL_STEPS</code>."
    else
      send_alert "CRASH" "Training process (PID $pid) is NOT running.
Last logged step: <code>${last_step:-unknown}</code>/$TOTAL_STEPS

Check log: <code>tail -50 $LOG_FILE</code>"
    fi
    return 1
  fi

  log_msg "Process PID $pid is alive."
  return 0
}
| |
|
| | |
# Alert when the training log has not been modified for STALL_SECONDS.
# Returns 0 on fresh log, 1 on missing log or stall (alert sent).
check_stall() {
  [[ -f "$LOG_FILE" ]] || {
    send_alert "STALL" "Log file not found: $LOG_FILE"
    return 1
  }

  local mtime epoch_now idle_secs
  mtime=$(stat -c '%Y' "$LOG_FILE" 2>/dev/null || echo 0)   # GNU stat (Linux)
  epoch_now=$(date +%s)
  idle_secs=$(( epoch_now - mtime ))

  if (( idle_secs < STALL_SECONDS )); then
    log_msg "Log freshness OK: last update ${idle_secs}s ago."
    return 0
  fi

  local idle_mins=$(( idle_secs / 60 ))
  send_alert "STALL" "No log activity for <b>${idle_mins} minutes</b> (threshold: $(( STALL_SECONDS/60 ))min).
Log last modified: <code>$(date -d "@$mtime" '+%Y-%m-%d %H:%M:%S')</code>
Training may be hung or extremely slow."
  return 1
}
| |
|
| | |
# Parse the newest "step N ... loss X" line from the log and alert on
# NaN/Inf divergence or a loss spike past LOSS_SPIKE_THRESHOLD.
# Returns 0 when the loss is healthy or unparsable; 1 on NaN/spike.
check_loss() {
  [[ -f "$LOG_FILE" ]] || return 0

  local last_line
  last_line=$(grep -E 'step\s+[0-9]+.*loss' "$LOG_FILE" 2>/dev/null | tail -1)

  if [[ -z "$last_line" ]]; then
    log_msg "No step lines found in log yet."
    return 0
  fi

  # The char class also admits the letters of nan/inf so divergence
  # markers survive the extraction and are caught below.
  local loss step
  loss=$(echo "$last_line" | grep -oP 'loss\s+\K[0-9.eE+\-naifNIF]+' || echo "")
  step=$(echo "$last_line" | grep -oP 'step\s+\K[0-9]+' || echo "0")

  if [[ -z "$loss" ]]; then
    log_msg "Could not parse loss from: $last_line"
    return 0
  fi

  if echo "$loss" | grep -qiE "$LOSS_NAN_PATTERN"; then
    send_alert "LOSS_NAN" "Loss is <b>$loss</b> at step <code>$step</code>.
Training has diverged β NaN/Inf detected.

Last log line:
<code>${last_line}</code>"
    return 1
  fi

  # Skip warm-up (first 500 steps) where a high loss is expected.
  if (( step > 500 )); then
    # FIX: compare with awk instead of bc. bc is frequently absent on
    # minimal images, and the old `bc ... || echo 0` fallback silently
    # disabled spike detection whenever bc was missing.
    if awk -v l="$loss" -v t="$LOSS_SPIKE_THRESHOLD" 'BEGIN { exit !(l + 0 >= t + 0) }'; then
      send_alert "LOSS_SPIKE" "Loss spike detected: <b>$loss</b> at step <code>$step</code> (threshold: $LOSS_SPIKE_THRESHOLD).

Last log line:
<code>${last_line}</code>"
      return 1
    fi
  fi

  log_msg "Loss OK: $loss at step $step."
  return 0
}
| |
|
| | |
# Verify the most recent tok/s figure in the log is above MIN_TOKPS.
# Returns 0 when healthy or unparsable; 1 when throughput dropped.
check_throughput() {
  [[ -f "$LOG_FILE" ]] || return 0

  local line
  line=$(grep -E 'step\s+[0-9]+.*tok/s' "$LOG_FILE" 2>/dev/null | tail -1)
  [[ -n "$line" ]] || return 0

  # tok/s is logged with thousands separators (e.g. "12,345"); strip commas.
  local rate cur_step
  rate=$(grep -oP 'tok/s\s+\K[\d,]+' <<<"$line" | tr -d ',' || echo "")
  cur_step=$(grep -oP 'step\s+\K[0-9]+' <<<"$line" || echo "0")

  if [[ -z "$rate" ]]; then
    log_msg "Could not parse tok/s from last log line."
    return 0
  fi

  # Ignore the first 100 steps while the input pipeline warms up.
  if (( cur_step > 100 && rate < MIN_TOKPS )); then
    send_alert "THROUGHPUT" "Throughput dropped to <b>${rate} tok/s</b> at step <code>$cur_step</code> (min: ${MIN_TOKPS}).
GPU may be throttling, NCCL stalled, or a data worker is slow."
    return 1
  fi

  log_msg "Throughput OK: ${rate} tok/s at step $cur_step."
  return 0
}
| |
|
| | |
# Check average GPU utilization via nvidia-smi; alert when it is
# suspiciously low while the trainer is supposedly running.
# Returns 0 when healthy or unmeasurable; 1 on idle/low utilization.
check_gpu() {
  if ! command -v nvidia-smi &>/dev/null; then
    log_msg "nvidia-smi not available β skipping GPU check."
    return 0
  fi

  # Mean utilization across all visible GPUs, rounded to an integer.
  local avg_util
  avg_util=$(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null \
    | awk '{sum+=$1; count++} END {if(count>0) printf "%.0f", sum/count; else print 0}')

  if [[ -z "$avg_util" || "$avg_util" == "0" ]]; then
    # 0% is deliberately treated like "query failed": either way, 0%
    # with a live training process strongly suggests a hang.
    log_msg "GPU util query returned 0 or empty β possibly all idle."

    local pid
    pid=$(cat "$PID_FILE" 2>/dev/null | tr -d '[:space:]')
    if [[ -n "$pid" ]] && kill -0 "$pid" 2>/dev/null; then
      send_alert "GPU_IDLE" "All 8Γ B200 GPUs show <b>0% utilization</b> while training process is alive.
Possible NCCL hang or data pipeline stall."
      return 1
    fi
    return 0
  fi

  if (( avg_util < GPU_UTIL_WARN_PCT )); then
    # Include a per-GPU snapshot (first 8 GPUs) in the alert for triage.
    local gpu_details
    gpu_details=$(nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
      --format=csv,noheader 2>/dev/null | head -8 || echo "unavailable")
    send_alert "GPU_LOW" "Average GPU utilization: <b>${avg_util}%</b> (threshold: ${GPU_UTIL_WARN_PCT}%).

GPU details:
<code>${gpu_details}</code>"
    return 1
  fi

  log_msg "GPU utilization OK: ${avg_util}% average."
  return 0
}
| |
|
| | |
# Warn when the partition holding the checkpoints crosses DISK_WARN_PCT.
# Returns 0 when usage is OK or unmeasurable; 1 when over threshold.
check_disk() {
  local usage_pct
  # FIX: -P (POSIX/portable format) keeps every mount on a single line;
  # without it df wraps long device names onto a second line and the
  # NR==2 parse silently picks the wrong field.
  usage_pct=$(df -P "$CKPT_DIR" 2>/dev/null | awk 'NR==2 {gsub(/%/,"",$5); print $5}')

  if [[ -z "$usage_pct" ]]; then
    log_msg "Could not determine disk usage for $CKPT_DIR."
    return 0
  fi

  if (( usage_pct >= DISK_WARN_PCT )); then
    local avail
    avail=$(df -Ph "$CKPT_DIR" 2>/dev/null | awk 'NR==2 {print $4}')
    send_alert "DISK" "Disk usage at <b>${usage_pct}%</b> (threshold: ${DISK_WARN_PCT}%).
Available: <b>${avail}</b> on partition containing checkpoints.

Risk: checkpoint saves may fail. Consider deleting old checkpoints."
    return 1
  fi

  log_msg "Disk usage OK: ${usage_pct}% used."
  return 0
}
| |
|
| | |
# Run every health check in order, counting failures. Each check sends
# its own alert; main only summarizes the pass in the log.
main() {
  log_msg "=== Watchdog check START ==="

  local failed=0
  local check
  for check in check_process check_stall check_loss check_throughput check_gpu check_disk; do
    # Plain assignment avoids the `(( x++ ))`-returns-nonzero trap.
    "$check" || failed=$(( failed + 1 ))
  done

  if (( failed == 0 )); then
    log_msg "All checks passed β no alerts sent."
  else
    log_msg "Watchdog found $failed issue(s) β alerts sent."
  fi

  log_msg "=== Watchdog check END ==="
}
| |
|
# Entry point: perform a single watchdog pass.
main "$@"
| |
|