#!/usr/bin/env bash
#
# Training monitor: periodically inspects the training log, the checkpoint
# directory, disk usage and GPU utilisation, and prints colour-coded alerts.
#
# Usage:
#   <script> [LOG_FILE] [--check-once] [--auto-cleanup] [--auto-restart]
#
#   LOG_FILE        Training log to watch
#                   (default: checkpoints/$RUN_NAME/train.log).
#   --check-once    Run every check a single time, then exit.
#   --auto-cleanup  Prune old checkpoints when disk usage is critical.
#   --auto-restart  Terminate the training process when the log stalls.
#
# Environment:
#   RUN_NAME and every tunable below can be overridden from the environment.

set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
RUN_NAME="${RUN_NAME:-korean_3b_fp8_run1}"
CKPT_DIR="checkpoints/${RUN_NAME}"
LOG_FILE="${CKPT_DIR}/train.log"
CHECK_INTERVAL="${CHECK_INTERVAL:-60}"           # seconds between check rounds
ZERO_LOSS_THRESHOLD="${ZERO_LOSS_THRESHOLD:-3}"  # recent ~0 losses that raise CRITICAL
GNORM_WARN="${GNORM_WARN:-10.0}"                 # gradient-norm warning threshold
GNORM_CRITICAL="${GNORM_CRITICAL:-50.0}"         # gradient-norm critical threshold
LOSS_SPIKE_FACTOR="${LOSS_SPIKE_FACTOR:-3.0}"    # last loss / recent average ratio
STALL_TIMEOUT="${STALL_TIMEOUT:-600}"            # seconds without a log write
DISK_WARN_PCT="${DISK_WARN_PCT:-85}"             # disk usage warning (percent)
DISK_CRITICAL_PCT="${DISK_CRITICAL_PCT:-92}"     # disk usage critical (percent)
GPU_UTIL_WARN="${GPU_UTIL_WARN:-50}"             # GPU utilisation floor (percent)
MAX_CHECKPOINTS="${MAX_CHECKPOINTS:-15}"         # non-milestone checkpoints kept by cleanup
CHECK_ONCE=false
AUTO_CLEANUP=false
AUTO_RESTART=false

#######################################
# Parse command-line arguments into the global flags and LOG_FILE.
# The first non-empty argument that is not an option becomes LOG_FILE,
# wherever it appears on the command line.
# Globals:   CHECK_ONCE, AUTO_CLEANUP, AUTO_RESTART, LOG_FILE (written)
# Arguments: the script's positional parameters
# Outputs:   a warning on stderr for each unknown option
#######################################
parse_args() {
  local arg
  local log_file_set=false
  for arg in "$@"; do
    case "$arg" in
      --check-once) CHECK_ONCE=true ;;
      --auto-cleanup) AUTO_CLEANUP=true ;;
      --auto-restart) AUTO_RESTART=true ;;
      --*) printf 'warning: ignoring unknown option: %s\n' "$arg" >&2 ;;
      '') ;;  # an empty argument keeps the default log path
      *)
        if [[ "$log_file_set" == false ]]; then
          LOG_FILE="$arg"
          log_file_set=true
        fi
        ;;
    esac
  done
}

parse_args "$@"
|
|
| |
# ANSI colour escape sequences. They are stored unexpanded (literal
# backslash sequences) and are interpreted when printed.
RED='\033[0;31m'
YELLOW='\033[1;33m'
GREEN='\033[0;32m'
CYAN='\033[0;36m'
MAGENTA='\033[0;35m'
NC='\033[0m'  # reset to the terminal default
|
|
#######################################
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
# Outputs: the formatted time on stdout
#######################################
timestamp() {
  date '+%Y-%m-%d %H:%M:%S'
}
|
|
#######################################
# Print one colour-coded, timestamped alert line.
# Globals:   RED, YELLOW, CYAN, GREEN, NC (read)
# Arguments: $1 - level: CRITICAL | WARNING | INFO | OK
#            $2 - message text
# Outputs:   the formatted line on stdout
#######################################
alert() {
  local level="${1:-}" msg="${2:-}"
  local color icon
  case "$level" in
    CRITICAL) color="$RED";    icon='🔴' ;;
    WARNING)  color="$YELLOW"; icon='🟠' ;;
    INFO)     color="$CYAN";   icon='🟡' ;;
    OK)       color="$GREEN";  icon='✅' ;;
    # An unrecognised level is still printed (uncoloured) instead of being
    # dropped, so a typo at a call site cannot hide an alert.
    *)        color='';        icon='⚪' ;;
  esac
  # %b expands the escape sequences in the colour codes only; the message is
  # printed verbatim with %s so backslashes in it are not interpreted.
  printf '%b%s [%s] [%s] %s%b\n' \
    "$color" "$icon" "$(timestamp)" "$level" "$msg" "$NC"
}
|
|
| |
#######################################
# Print the most recent metric lines of the training log.
# A metric line is one matching the pattern "step.*loss.*gnorm".
# Globals:   LOG_FILE (read)
# Arguments: $1 - maximum number of metric lines to print (default 20)
# Outputs:   up to $1 metric lines on stdout, oldest first
# Returns:   0, also when the log is missing or holds no metric line
#######################################
parse_metrics() {
  local n="${1:-20}"
  [[ -f "$LOG_FILE" ]] || return 0
  # Filter first, then take the tail: this yields the last N *metric* lines
  # even when other log output is interleaved between them.
  # grep exits 1 when nothing matches, which is a normal outcome here.
  grep -- 'step.*loss.*gnorm' "$LOG_FILE" | tail -n "$n" || true
}
|
|
#######################################
# Extract the first numeric value that follows a field name in a log line.
# Accepts integers, decimals and scientific notation (e.g. 3, 2.31, 1.5e-05).
# Arguments: $1 - log line
#            $2 - field name, used as a regex prefix (e.g. "loss")
# Outputs:   the value on stdout, or nothing when the field is absent
# Returns:   0 in both cases
#######################################
extract_field() {
  local line="${1:-}" name="${2:-}"
  # \K discards the field name and whitespace from the reported match.
  # grep exits 1 on no match; that is reported as empty output, not failure.
  grep -oP -- "${name}\s+\K[0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?" <<< "$line" \
    | head -n 1 || true
}
|
|
#######################################
# Extract the step counter (the digits following "step") from a log line.
# Arguments: $1 - log line
# Outputs:   the first step number on stdout, or nothing when absent
# Returns:   0 in both cases
#######################################
extract_step() {
  local line="${1:-}"
  # grep exits 1 on no match; that is reported as empty output, not failure.
  grep -oP 'step\s+\K[0-9]+' <<< "$line" | head -n 1 || true
}
|
|
| |
#######################################
# Raise CRITICAL when the most recent metric lines all report a loss ~0.
# Globals:   ZERO_LOSS_THRESHOLD (read)
# Returns:   1 when the alert fired, 0 otherwise
#######################################
check_loss_zero() {
  local lines
  lines=$(parse_metrics "$ZERO_LOSS_THRESHOLD") || true
  [[ -n "$lines" ]] || return 0

  local line loss
  local zero_count=0
  while IFS= read -r line; do
    loss=$(extract_field "$line" "loss") || true
    [[ -n "$loss" ]] || continue
    # awk understands scientific notation (e.g. 1.0e-07), which is how a
    # near-zero loss may be logged.
    if awk -v v="$loss" 'BEGIN { exit !(v + 0 < 0.001) }'; then
      zero_count=$((zero_count + 1))
    fi
  done <<< "$lines"

  if (( zero_count >= ZERO_LOSS_THRESHOLD )); then
    alert CRITICAL "Loss가 ${zero_count}회 연속 ~0! Labels 버그. 즉시 중단!"
    return 1
  fi
  return 0
}
|
|
| |
#######################################
# Warn when the latest loss is far above the average of the losses that
# precede it in the recent metric window.
# Globals:   LOSS_SPIKE_FACTOR (read)
# Returns:   0
#######################################
check_loss_spike() {
  local lines
  lines=$(parse_metrics 20) || true
  [[ -n "$lines" ]] || return 0

  local line loss
  local -a losses=()
  while IFS= read -r line; do
    loss=$(extract_field "$line" "loss") || true
    if [[ -n "$loss" ]]; then
      losses+=("$loss")
    fi
  done <<< "$lines"

  # Too few samples make the average meaningless.
  (( ${#losses[@]} >= 5 )) || return 0

  # One awk pass averages every value except the last and prints
  # "<last> <avg> <ratio>" only when the last one is a spike.
  local stats
  stats=$(printf '%s\n' "${losses[@]}" \
    | awk -v factor="$LOSS_SPIKE_FACTOR" '
        { raw = $1; val[NR] = $1 + 0 }
        END {
          if (NR < 2) exit
          for (k = 1; k < NR; k++) sum += val[k]
          avg = sum / (NR - 1)
          if (avg > 0 && val[NR] / avg > factor + 0)
            printf "%s %.4f %.2f\n", raw, avg, val[NR] / avg
        }') || true
  [[ -n "$stats" ]] || return 0

  local last avg ratio
  read -r last avg ratio <<< "$stats"
  alert WARNING "Loss spike! 현재=${last}, 평균=${avg}, 비율=${ratio}x"
  return 0
}
|
|
| |
#######################################
# Alert when the gradient norm of the latest metric line exceeds the
# warning or critical threshold.
# Globals:   GNORM_WARN, GNORM_CRITICAL (read)
# Returns:   0
#######################################
check_gnorm() {
  local lines
  lines=$(parse_metrics 5) || true
  [[ -n "$lines" ]] || return 0

  local gnorm
  # ${lines##*$'\n'} keeps only the last line of the window.
  gnorm=$(extract_field "${lines##*$'\n'}" "gnorm") || true
  [[ -n "$gnorm" ]] || return 0

  # awk compares as floating point and accepts scientific notation.
  local severity
  severity=$(awk -v g="$gnorm" -v warn="$GNORM_WARN" -v crit="$GNORM_CRITICAL" '
    BEGIN {
      if (g + 0 > crit + 0) print "CRITICAL"
      else if (g + 0 > warn + 0) print "WARNING"
    }') || true

  case "$severity" in
    CRITICAL) alert CRITICAL "GNorm=${gnorm} > ${GNORM_CRITICAL}! 발산 직전." ;;
    WARNING) alert WARNING "GNorm=${gnorm} 불안정." ;;
  esac
  return 0
}
|
|
| |
#######################################
# Raise CRITICAL when the log has not been written for longer than
# STALL_TIMEOUT seconds. With auto-restart enabled, the first process
# matching the training command line is also stopped.
# Globals:   LOG_FILE, STALL_TIMEOUT, AUTO_RESTART (read)
#            TRAIN_PROC_PATTERN (read, optional) - pgrep -f pattern,
#            default "pretrain.py.*korean_3b"
# Returns:   0
#######################################
check_stall() {
  [[ -f "$LOG_FILE" ]] || return 0

  local last_mod now diff
  # If the mtime cannot be read, skip the check: treating it as 0 would
  # report a stall as long as the whole Unix epoch.
  last_mod=$(stat -c %Y "$LOG_FILE" 2>/dev/null) || return 0
  [[ "$last_mod" =~ ^[0-9]+$ ]] || return 0
  now=$(date +%s)
  diff=$((now - last_mod))
  (( diff > STALL_TIMEOUT )) || return 0

  alert CRITICAL "로그 ${diff}초 ($(( diff/60 ))분) 멈춤! NCCL hang 가능성."

  [[ "$AUTO_RESTART" == true ]] || return 0
  alert WARNING "자동 재시작 시도..."

  local pattern="${TRAIN_PROC_PATTERN:-pretrain.py.*korean_3b}"
  local pid
  pid=$(pgrep -f -- "$pattern" | head -n 1) || true
  [[ -n "$pid" ]] || return 0

  # Ask politely first; escalate to SIGKILL only if the process is still
  # alive after a 5 second grace period.
  kill -TERM "$pid" 2>/dev/null || true
  local waited=0
  while (( waited < 5 )) && kill -0 "$pid" 2>/dev/null; do
    sleep 1
    waited=$((waited + 1))
  done
  if kill -0 "$pid" 2>/dev/null; then
    kill -KILL "$pid" 2>/dev/null || true
  fi
  alert INFO "이전 프로세스 종료. launch_3b_pretrain.sh 재실행 필요."
  return 0
}
|
|
| |
#######################################
# Alert on high disk usage of the project filesystem; at the critical level
# optionally prune old checkpoints.
# Globals:   DISK_WARN_PCT, DISK_CRITICAL_PCT, AUTO_CLEANUP (read)
#            DISK_PATH (read, optional) - path to inspect, default /PROJECT
# Returns:   0
#######################################
check_disk() {
  local path="${DISK_PATH:-/PROJECT}"
  local usage
  # -P selects the POSIX output format, which keeps every filesystem on a
  # single line even when the device name is long; row 2 is then reliable.
  usage=$(df -P "$path" 2>/dev/null \
    | awk 'NR == 2 { sub(/%/, "", $5); print $5 }') || true
  # Missing path or unparsable output: nothing to report.
  [[ "$usage" =~ ^[0-9]+$ ]] || return 0

  if (( 10#$usage > DISK_CRITICAL_PCT )); then
    alert CRITICAL "디스크 ${usage}% > ${DISK_CRITICAL_PCT}%! 즉시 정리 필요!"
    if [[ "$AUTO_CLEANUP" == true ]]; then
      # Best effort: a failed cleanup must not stop the monitor.
      cleanup_old_checkpoints || true
    fi
  elif (( 10#$usage > DISK_WARN_PCT )); then
    alert WARNING "디스크 ${usage}% > ${DISK_WARN_PCT}%. 체크포인트 정리 권장."
  fi
  return 0
}
|
|
| |
#######################################
# Report how many GPUs run below the utilisation floor.
# Does nothing when nvidia-smi is not available.
# Globals:   GPU_UTIL_WARN (read)
# Returns:   0
#######################################
check_gpu() {
  command -v nvidia-smi &>/dev/null || return 0

  local util
  local low=0 total=0
  while IFS= read -r util; do
    util="${util//[[:space:]]/}"
    [[ -n "$util" ]] || continue
    total=$((total + 1))
    # A non-numeric reading (e.g. "[N/A]") counts as a GPU but is never
    # classified as low, and must not reach the arithmetic comparison.
    if [[ "$util" =~ ^[0-9]+$ ]] && (( 10#$util < GPU_UTIL_WARN )); then
      low=$((low + 1))
    fi
  done < <(nvidia-smi --query-gpu=utilization.gpu \
    --format=csv,noheader,nounits 2>/dev/null)

  if (( total > 0 && low > 0 )); then
    alert INFO "${low}/${total} GPU util < ${GPU_UTIL_WARN}%"
  fi
  return 0
}
|
|
| |
#######################################
# Sanity-check the newest checkpoint directory: it must contain a model
# file and be at least 2 GB in size.
# Globals:   CKPT_DIR (read)
# Returns:   0
#######################################
check_checkpoint_integrity() {
  # Collect checkpoint *directories* only; stray files that merely share the
  # prefix must not be mistaken for the latest checkpoint.
  local path
  local -a dirs=()
  for path in "${CKPT_DIR}"/checkpoint-*; do
    if [[ -d "$path" ]]; then
      dirs+=("$path")
    fi
  done
  (( ${#dirs[@]} > 0 )) || return 0

  # Version sort orders checkpoint-100 after checkpoint-20.
  local latest
  latest=$(printf '%s\n' "${dirs[@]}" | sort -V | tail -n 1) || true
  [[ -n "$latest" ]] || return 0

  if [[ ! -f "${latest}/model.pt" && ! -f "${latest}/model.safetensors" ]]; then
    alert WARNING "최근 체크포인트에 모델 파일 없음: ${latest}"
  fi

  local size
  size=$(du -sb "$latest" 2>/dev/null | awk '{print $1}') || true
  if [[ "$size" =~ ^[0-9]+$ ]] && (( size < 2000000000 )); then
    alert WARNING "체크포인트 크기 비정상 (${size} bytes < 2GB): ${latest}"
  fi
  return 0
}
|
|
| |
#######################################
# Delete the oldest checkpoints until at most MAX_CHECKPOINTS non-milestone
# checkpoints remain. Milestones (step is a positive multiple of 10000) are
# never deleted.
# Globals:   CKPT_DIR, MAX_CHECKPOINTS (read)
# Outputs:   one line per deleted checkpoint on stdout, plus alerts
# Returns:   0
#######################################
cleanup_old_checkpoints() {
  # Glob instead of parsing `ls`, so paths with spaces stay intact.
  local path
  local -a found=() ckpts=()
  for path in "${CKPT_DIR}"/checkpoint-*; do
    if [[ -e "$path" ]]; then
      found+=("$path")
    fi
  done
  if (( ${#found[@]} > 0 )); then
    # Version sort = oldest step first.
    while IFS= read -r path; do
      ckpts+=("$path")
    done < <(printf '%s\n' "${found[@]}" | sort -V)
  fi

  local count=${#ckpts[@]}
  if (( count <= MAX_CHECKPOINTS )); then
    alert OK "체크포인트 ${count}개 ≤ ${MAX_CHECKPOINTS}. 정리 불필요."
    return 0
  fi

  local ckpt name step_num
  local -a deletable=() preserved=()
  for ckpt in "${ckpts[@]}"; do
    name="${ckpt##*/}"
    step_num=0
    if [[ "$name" =~ [0-9]+ ]]; then
      # 10# forces base 10: a zero-padded step such as 0010000 would
      # otherwise be read as octal.
      step_num=$((10#${BASH_REMATCH[0]}))
    fi
    if (( step_num > 0 && step_num % 10000 == 0 )); then
      preserved+=("$ckpt")
    else
      deletable+=("$ckpt")
    fi
  done

  local to_delete=$(( ${#deletable[@]} - MAX_CHECKPOINTS ))
  if (( to_delete <= 0 )); then
    alert OK "정리 불필요 (이정표 ${#preserved[@]}개 + 최근 ${MAX_CHECKPOINTS}개 보존)."
    return 0
  fi
  alert INFO "${count}개 체크포인트 → ${to_delete}개 삭제 (이정표 ${#preserved[@]}개 영구 보존)"

  local deleted=0 ckpt_size
  for ckpt in "${deletable[@]}"; do
    (( deleted < to_delete )) || break
    ckpt_size=$(du -sh "$ckpt" 2>/dev/null | awk '{print $1}') || true
    echo " 삭제: $ckpt (${ckpt_size})"
    # ${ckpt:?} aborts instead of expanding to an empty path.
    rm -rf -- "${ckpt:?}"
    deleted=$((deleted + 1))
  done
  alert OK "체크포인트 정리 완료. (${deleted}개 삭제)"
  return 0
}
|
|
| |
#######################################
# Print training progress and, when the log starts with a timestamp, an
# estimated time to completion.
# The total step count comes from, in order: $MAX_STEPS, the first
# "max_steps ... <number>" in the monitored log, the same pattern in
# ${CKPT_DIR}/train.log, and finally the built-in default 57000.
# Globals:   LOG_FILE, CKPT_DIR, MAGENTA, NC (read); MAX_STEPS (read, optional)
# Outputs:   one progress line on stdout, or nothing without metrics
# Returns:   0
#######################################
estimate_eta() {
  [[ -f "$LOG_FILE" ]] || return 0

  local lines
  lines=$(parse_metrics 50) || true
  [[ -n "$lines" ]] || return 0

  local cur_step
  cur_step=$(extract_step "${lines##*$'\n'}") || true
  [[ "$cur_step" =~ ^[0-9]+$ ]] || return 0
  cur_step=$((10#$cur_step))  # base 10 even when the step is zero-padded
  (( cur_step > 0 )) || return 0

  local max_steps="${MAX_STEPS:-}" src
  if [[ ! "$max_steps" =~ ^[0-9]+$ ]]; then
    for src in "$LOG_FILE" "${CKPT_DIR}/train.log"; do
      max_steps=$(grep -oP 'max_steps.*?\K\d+' "$src" 2>/dev/null | head -n 1) || true
      if [[ "$max_steps" =~ ^[0-9]+$ ]]; then
        break
      fi
    done
  fi
  [[ "$max_steps" =~ ^[0-9]+$ ]] || max_steps=57000
  max_steps=$((10#$max_steps))

  local remaining=$((max_steps - cur_step))
  if (( remaining <= 0 )); then
    echo -e "${MAGENTA}📊 진행: ${cur_step}/${max_steps} (완료!)${NC}"
    return 0
  fi

  # The run's start time is taken from the first timestamp found among the
  # first 20 lines of the log.
  local first_time start_epoch=0
  first_time=$(head -n 20 "$LOG_FILE" 2>/dev/null \
    | grep -oP '^\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}' | head -n 1) || true
  if [[ -n "$first_time" ]]; then
    start_epoch=$(date -d "$first_time" +%s 2>/dev/null) || start_epoch=0
  fi
  if [[ ! "$start_epoch" =~ ^[0-9]+$ ]] || (( start_epoch == 0 )); then
    # No usable start time: report progress without an estimate.
    echo -e "${MAGENTA}📊 진행: ${cur_step}/${max_steps}${NC}"
    return 0
  fi

  local elapsed=$(( $(date +%s) - start_epoch ))
  local summary pct eta_hours sec_per_step
  summary=$(awk -v e="$elapsed" -v c="$cur_step" -v r="$remaining" -v m="$max_steps" '
    BEGIN {
      sps = e / c
      printf "%.1f %.1f %.2f\n", c * 100 / m, r * sps / 3600, sps
    }') || true
  read -r pct eta_hours sec_per_step <<< "$summary"
  echo -e "${MAGENTA}📊 진행: ${cur_step}/${max_steps} (${pct}%) | 남은 시간: ~${eta_hours}h | ${sec_per_step}s/step${NC}"
  return 0
}
|
|
| |
#######################################
# Print a status summary: latest metric line, progress/ETA, per-GPU stats,
# checkpoint count and size, and disk usage.
# Globals:   CKPT_DIR, GREEN, CYAN, NC (read)
#            DISK_PATH (read, optional) - path to inspect, default /PROJECT
# Outputs:   the summary on stdout
# Returns:   0
#######################################
print_status() {
  local lines
  lines=$(parse_metrics 1) || true
  if [[ -n "$lines" ]]; then
    # %s prints the log line verbatim (no backslash interpretation).
    printf '%b최근:%b %s\n' "$GREEN" "$NC" "$lines"
  fi

  estimate_eta || true

  if command -v nvidia-smi &>/dev/null; then
    echo -e "${CYAN}GPU:${NC}"
    # A failing nvidia-smi must not abort the monitor under
    # errexit + pipefail, hence the explicit guard.
    nvidia-smi --query-gpu=index,memory.used,memory.total,utilization.gpu,temperature.gpu \
      --format=csv,noheader 2>/dev/null | head -n 8 || true
  fi

  # Count checkpoints with a glob rather than by parsing `ls`.
  local path
  local ckpt_count=0
  for path in "${CKPT_DIR}"/checkpoint-*; do
    if [[ -e "$path" ]]; then
      ckpt_count=$((ckpt_count + 1))
    fi
  done
  local ckpt_size
  ckpt_size=$(du -sh "${CKPT_DIR}" 2>/dev/null | awk '{print $1}') || true
  echo -e "${CYAN}체크포인트:${NC} ${ckpt_count}개 (${ckpt_size})"

  # -P keeps each filesystem on one line so row 2 is always the data row.
  local disk
  disk=$(df -hP "${DISK_PATH:-/PROJECT}" 2>/dev/null \
    | awk 'NR == 2 { print $3 "/" $2 " (" $5 ")" }') || true
  echo -e "${CYAN}디스크:${NC} ${disk}"
  return 0
}
|
|
| |
# Startup banner showing the effective configuration.
printf '%s\n' \
  "==================================================================" \
  " 3B Training Monitor" \
  " Run: ${RUN_NAME}" \
  " Log: ${LOG_FILE}" \
  " Interval: ${CHECK_INTERVAL}s" \
  " Auto-cleanup: ${AUTO_CLEANUP} | Auto-restart: ${AUTO_RESTART}" \
  " Ctrl+C to stop" \
  "=================================================================="
|
|
#######################################
# Run one monitoring round: every health check, then the status summary.
# Each step is guarded so that a failure in one of them (including a check
# that returns non-zero to signal an alert) never stops the round or the
# monitor itself under `set -e`.
# Outputs:   alerts and the status summary on stdout
# Returns:   0
#######################################
run_all_checks() {
  check_loss_zero || true
  check_loss_spike || true
  check_gnorm || true
  check_stall || true
  check_disk || true
  check_gpu || true
  check_checkpoint_integrity || true
  echo "---"
  print_status || true
  echo ""
}
|
|
# Entry point. In single-shot mode run one round and exit; otherwise repeat
# a round every CHECK_INTERVAL seconds until the script is interrupted.
if [[ "$CHECK_ONCE" == true ]]; then
  run_all_checks
  exit 0
fi

while true; do
  run_all_checks
  sleep "$CHECK_INTERVAL"
done
|
|