#!/usr/bin/env bash # ============================================================================= # monitor_3b.sh — 3B 학습 실시간 모니터링 + 이상 감지 + 자동 체크포인트 정리 # # Usage: # bash scripts/monitor_3b.sh # 기본 감시 # bash scripts/monitor_3b.sh --check-once # 1회 검사 # bash scripts/monitor_3b.sh --auto-cleanup # 자동 오래된 체크포인트 삭제 # # 3B 특화 사항: # - 체크포인트 27GB/개 → 디스크 감시 강화 # - NCCL hang 감지 + 자동 재시작 옵션 # - 예상 완료 시간 실시간 계산 # - 프로세스 중복 실행 방지 # ============================================================================= set -euo pipefail # ---- Configuration ---------------------------------------------------------- RUN_NAME="${RUN_NAME:-korean_3b_fp8_run1}" LOG_FILE="${1:-checkpoints/${RUN_NAME}/train.log}" CKPT_DIR="checkpoints/${RUN_NAME}" CHECK_INTERVAL=60 # 3B는 step 간격 더 김 → 60초 ZERO_LOSS_THRESHOLD=3 GNORM_WARN=10.0 GNORM_CRITICAL=50.0 LOSS_SPIKE_FACTOR=3.0 STALL_TIMEOUT=600 # 10분 (3B는 step 더 오래 걸림) DISK_WARN_PCT=85 DISK_CRITICAL_PCT=92 GPU_UTIL_WARN=50 MAX_CHECKPOINTS=15 # 최대 보관 체크포인트 수 (15 × 27GB = 405GB) CHECK_ONCE=false AUTO_CLEANUP=false AUTO_RESTART=false # Parse args for arg in "$@"; do case "$arg" in --check-once) CHECK_ONCE=true ;; --auto-cleanup) AUTO_CLEANUP=true ;; --auto-restart) AUTO_RESTART=true ;; esac done # Fix LOG_FILE if first arg was a flag if [[ "$LOG_FILE" == --* ]]; then LOG_FILE="checkpoints/${RUN_NAME}/train.log" fi # ---- Colors ----------------------------------------------------------------- RED='\033[0;31m'; YELLOW='\033[1;33m'; GREEN='\033[0;32m' CYAN='\033[0;36m'; MAGENTA='\033[0;35m'; NC='\033[0m' timestamp() { date '+%Y-%m-%d %H:%M:%S'; } alert() { local level="$1" msg="$2" case "$level" in CRITICAL) echo -e "${RED}🔴 [$(timestamp)] [CRITICAL] ${msg}${NC}" ;; WARNING) echo -e "${YELLOW}🟠 [$(timestamp)] [WARNING] ${msg}${NC}" ;; INFO) echo -e "${CYAN}🟡 [$(timestamp)] [INFO] ${msg}${NC}" ;; OK) echo -e "${GREEN}✅ [$(timestamp)] [OK] ${msg}${NC}" ;; esac } # ---- Parse metrics ---------------------------------------------------------- parse_metrics() { local n="${1:-20}" [[ -f "$LOG_FILE" ]] || return tail -n "$n" "$LOG_FILE" | grep "step.*loss.*gnorm" || true } extract_field() { echo "$1" | grep -oP "${2}\s+\K[0-9]+\.[0-9e+\-]+" | head -1 } extract_step() { echo "$1" | grep -oP "step\s+\K[0-9]+" | head -1 } # ---- Check: Loss = 0 ------------------------------------------------------- check_loss_zero() { local lines lines=$(parse_metrics "$ZERO_LOSS_THRESHOLD") [[ -z "$lines" ]] && return 0 local zero_count=0 while IFS= read -r line; do local loss=$(extract_field "$line" "loss") if [[ -n "$loss" ]] && (( $(echo "$loss < 0.001" | bc -l 2>/dev/null || echo 0) )); then ((zero_count++)) fi done <<< "$lines" if [[ $zero_count -ge $ZERO_LOSS_THRESHOLD ]]; then alert CRITICAL "Loss가 ${zero_count}회 연속 ~0! Labels 버그. 즉시 중단!" return 1 fi } # ---- Check: Loss spike ----------------------------------------------------- check_loss_spike() { local lines=$(parse_metrics 20) [[ -z "$lines" ]] && return 0 local losses=() while IFS= read -r line; do local loss=$(extract_field "$line" "loss") [[ -n "$loss" ]] && losses+=("$loss") done <<< "$lines" local count=${#losses[@]} [[ $count -lt 5 ]] && return 0 local last="${losses[$((count-1))]}" local sum=0 for ((i=0; i/dev/null || echo "$sum") done local avg=$(echo "$sum / ($count - 1)" | bc -l 2>/dev/null || echo "0") if [[ "$avg" != "0" ]]; then local ratio=$(echo "$last / $avg" | bc -l 2>/dev/null || echo "1") if (( $(echo "$ratio > $LOSS_SPIKE_FACTOR" | bc -l 2>/dev/null || echo 0) )); then alert WARNING "Loss spike! 현재=${last}, 평균=${avg}, 비율=${ratio}x" fi fi } # ---- Check: Gradient norm --------------------------------------------------- check_gnorm() { local lines=$(parse_metrics 5) [[ -z "$lines" ]] && return 0 local gnorm=$(extract_field "$(echo "$lines" | tail -1)" "gnorm") [[ -z "$gnorm" ]] && return 0 if (( $(echo "$gnorm > $GNORM_CRITICAL" | bc -l 2>/dev/null || echo 0) )); then alert CRITICAL "GNorm=${gnorm} > ${GNORM_CRITICAL}! 발산 직전." elif (( $(echo "$gnorm > $GNORM_WARN" | bc -l 2>/dev/null || echo 0) )); then alert WARNING "GNorm=${gnorm} 불안정." fi } # ---- Check: Stall / NCCL hang ---------------------------------------------- check_stall() { [[ ! -f "$LOG_FILE" ]] && return 0 local last_mod=$(stat -c %Y "$LOG_FILE" 2>/dev/null || echo 0) local now=$(date +%s) local diff=$((now - last_mod)) if [[ $diff -gt $STALL_TIMEOUT ]]; then alert CRITICAL "로그 ${diff}초 ($(( diff/60 ))분) 멈춤! NCCL hang 가능성." # NCCL hang 자동 재시작 if $AUTO_RESTART; then alert WARNING "자동 재시작 시도..." local pid=$(pgrep -f "pretrain.py.*korean_3b" | head -1 || true) if [[ -n "$pid" ]]; then kill -9 "$pid" 2>/dev/null || true sleep 5 alert INFO "이전 프로세스 종료. launch_3b_pretrain.sh 재실행 필요." fi fi fi } # ---- Check: Disk (3B 강화) -------------------------------------------------- check_disk() { local usage=$(df /PROJECT 2>/dev/null | awk 'NR==2 {print $5}' | tr -d '%') if [[ -n "$usage" && "$usage" -gt "$DISK_CRITICAL_PCT" ]]; then alert CRITICAL "디스크 ${usage}% > ${DISK_CRITICAL_PCT}%! 즉시 정리 필요!" $AUTO_CLEANUP && cleanup_old_checkpoints elif [[ -n "$usage" && "$usage" -gt "$DISK_WARN_PCT" ]]; then alert WARNING "디스크 ${usage}% > ${DISK_WARN_PCT}%. 체크포인트 정리 권장." fi } # ---- Check: GPU utilization ------------------------------------------------- check_gpu() { command -v nvidia-smi &>/dev/null || return 0 local low=0 total=0 while IFS= read -r util; do ((total++)) [[ "$util" -lt "$GPU_UTIL_WARN" ]] && ((low++)) done < <(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null) [[ $total -gt 0 && $low -gt 0 ]] && alert INFO "${low}/${total} GPU util < ${GPU_UTIL_WARN}%" } # ---- Check: 체크포인트 무결성 ----------------------------------------------- check_checkpoint_integrity() { local latest=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V | tail -1 || true) [[ -z "$latest" ]] && return 0 # 최소 파일 존재 확인 if [[ ! -f "${latest}/model.pt" ]] && [[ ! -f "${latest}/model.safetensors" ]]; then alert WARNING "최근 체크포인트에 모델 파일 없음: ${latest}" fi # 크기 확인 (3B model.pt는 최소 2GB) local size=$(du -sb "${latest}" 2>/dev/null | awk '{print $1}') if [[ -n "$size" && "$size" -lt 2000000000 ]]; then alert WARNING "체크포인트 크기 비정상 (${size} bytes < 2GB): ${latest}" fi } # ---- Cleanup: 오래된 체크포인트 자동 삭제 ------------------------------------ cleanup_old_checkpoints() { local ckpts=($(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | sort -V)) local count=${#ckpts[@]} if [[ $count -le $MAX_CHECKPOINTS ]]; then alert OK "체크포인트 ${count}개 ≤ ${MAX_CHECKPOINTS}. 정리 불필요." return fi # 이정표 체크포인트 보존 (매 10K step) local deletable=() local preserved=() for ckpt in "${ckpts[@]}"; do local step_num=$(basename "$ckpt" | grep -oP '\d+' || echo "0") if (( step_num % 10000 == 0 && step_num > 0 )); then preserved+=("$ckpt") else deletable+=("$ckpt") fi done # 최근 MAX_CHECKPOINTS개는 무조건 보존 local n_deletable=${#deletable[@]} local total_keep=$(( ${#preserved[@]} + MAX_CHECKPOINTS )) local to_delete=$(( count - total_keep )) [[ $to_delete -le 0 ]] && { alert OK "정리 불필요 (이정표 ${#preserved[@]}개 + 최근 ${MAX_CHECKPOINTS}개 보존)."; return; } alert INFO "${count}개 체크포인트 → ${to_delete}개 삭제 (이정표 ${#preserved[@]}개 영구 보존)" local deleted=0 for ckpt in "${deletable[@]}"; do [[ $deleted -ge $to_delete ]] && break local ckpt_size=$(du -sh "$ckpt" 2>/dev/null | awk '{print $1}') echo " 삭제: $ckpt (${ckpt_size})" rm -rf "$ckpt" ((deleted++)) done alert OK "체크포인트 정리 완료. (${deleted}개 삭제)" } # ---- ETA 계산 --------------------------------------------------------------- estimate_eta() { [[ ! -f "$LOG_FILE" ]] && return # 최근 step 번호 + 시간 local lines=$(parse_metrics 50) [[ -z "$lines" ]] && return local last_line=$(echo "$lines" | tail -1) local first_line=$(echo "$lines" | head -1) local cur_step=$(extract_step "$last_line") local max_steps=$(grep -oP "max_steps.*?(\d+)" "${CKPT_DIR}/train.log" 2>/dev/null | head -1 | grep -oP '\d+$' || echo "57000") [[ -z "$cur_step" || "$cur_step" == "0" ]] && return # step/sec from log timestamps (approximate) local remaining=$((max_steps - cur_step)) if [[ $remaining -le 0 ]]; then echo -e "${MAGENTA}📊 진행: ${cur_step}/${max_steps} (완료!)${NC}" return fi # 파일 수정 시간 기반 rough ETA local first_time=$(head -20 "$LOG_FILE" 2>/dev/null | grep -oP '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}' | head -1 || true) if [[ -n "$first_time" ]]; then local start_epoch=$(date -d "$first_time" +%s 2>/dev/null || echo 0) local now=$(date +%s) if [[ $start_epoch -gt 0 && $cur_step -gt 0 ]]; then local elapsed=$((now - start_epoch)) local sec_per_step=$(echo "$elapsed / $cur_step" | bc -l 2>/dev/null || echo "0") local eta_sec=$(echo "$remaining * $sec_per_step" | bc 2>/dev/null | cut -d. -f1 || echo "0") local eta_hours=$(echo "$eta_sec / 3600" | bc 2>/dev/null || echo "?") local pct=$(echo "scale=1; $cur_step * 100 / $max_steps" | bc 2>/dev/null || echo "?") echo -e "${MAGENTA}📊 진행: ${cur_step}/${max_steps} (${pct}%) | 남은 시간: ~${eta_hours}h | ${sec_per_step}s/step${NC}" fi else echo -e "${MAGENTA}📊 진행: ${cur_step}/${max_steps}${NC}" fi } # ---- Status summary --------------------------------------------------------- print_status() { local lines=$(parse_metrics 1) [[ -n "$lines" ]] && echo -e "${GREEN}최근:${NC} $lines" estimate_eta if command -v nvidia-smi &>/dev/null; then echo -e "${CYAN}GPU:${NC}" nvidia-smi --query-gpu=index,memory.used,memory.total,utilization.gpu,temperature.gpu \ --format=csv,noheader 2>/dev/null | head -8 fi local ckpt_count=$(ls -d "${CKPT_DIR}"/checkpoint-* 2>/dev/null | wc -l) local ckpt_size=$(du -sh "${CKPT_DIR}" 2>/dev/null | awk '{print $1}') echo -e "${CYAN}체크포인트:${NC} ${ckpt_count}개 (${ckpt_size})" local disk=$(df -h /PROJECT 2>/dev/null | awk 'NR==2 {print $3"/"$2" ("$5")"}') echo -e "${CYAN}디스크:${NC} ${disk}" } # ---- Main ------------------------------------------------------------------- echo "==================================================================" echo " 3B Training Monitor" echo " Run: ${RUN_NAME}" echo " Log: ${LOG_FILE}" echo " Interval: ${CHECK_INTERVAL}s" echo " Auto-cleanup: ${AUTO_CLEANUP} | Auto-restart: ${AUTO_RESTART}" echo " Ctrl+C to stop" echo "==================================================================" run_all_checks() { check_loss_zero || true check_loss_spike || true check_gnorm || true check_stall || true check_disk || true check_gpu || true check_checkpoint_integrity || true echo "---" print_status echo "" } if $CHECK_ONCE; then run_all_checks exit 0 fi while true; do run_all_checks sleep "$CHECK_INTERVAL" done