#!/usr/bin/env bash
#
# SFT training monitor.
#
# Repeatedly inspects a training log and the host, printing colored alerts for:
#   - loss collapsing to ~0 on the most recent logged steps
#   - a loss spike relative to the recent average
#   - a large gradient norm (gnorm)
#   - a log file that has stopped being updated
#   - high disk usage
#   - low GPU utilization
#
# Usage:
#   <script> [--check-once] [LOG_FILE]
#
#   --check-once   run all checks a single time and exit
#   LOG_FILE       training log to watch
#                  (default: checkpoints/korean_1b_sft/train.log)

# Abort on unhandled errors, unset variables and failed pipeline stages.
set -euo pipefail

# ---------------------------------------------------------------------------
# Configuration
# ---------------------------------------------------------------------------
readonly DEFAULT_LOG_FILE="checkpoints/korean_1b_sft/train.log"

readonly CHECK_INTERVAL=30      # seconds to sleep between monitoring rounds
readonly ZERO_LOSS_THRESHOLD=3  # recent metric lines inspected / ~0 losses needed
readonly GNORM_WARN=10.0        # gnorm above this -> WARNING
readonly GNORM_CRITICAL=50.0    # gnorm above this -> CRITICAL
readonly LOSS_SPIKE_FACTOR=3.0  # latest loss / recent average above this -> WARNING
readonly STALL_TIMEOUT=300      # seconds without a log update -> CRITICAL
readonly DISK_WARN_PCT=80       # disk usage percent above this -> WARNING
readonly GPU_UTIL_WARN=50       # per-GPU utilization percent below this -> INFO

LOG_FILE="$DEFAULT_LOG_FILE"
CHECK_ONCE=false

#######################################
# Parses the command line.
# The --check-once flag is accepted in any position; the first other non-empty
# argument is taken as the log file path, later ones are ignored.
# Globals:   DEFAULT_LOG_FILE (read), LOG_FILE, CHECK_ONCE (written)
# Arguments: the script's arguments
# Returns:   0
#######################################
parse_args() {
  local arg
  local have_log=false

  LOG_FILE="$DEFAULT_LOG_FILE"
  CHECK_ONCE=false

  for arg in "$@"; do
    if [[ "$arg" == "--check-once" ]]; then
      CHECK_ONCE=true
    elif [[ "$have_log" == false && -n "$arg" ]]; then
      LOG_FILE="$arg"
      have_log=true
    fi
  done
  return 0
}

parse_args "$@"

# ANSI color escape sequences. They are stored as literal backslash sequences,
# so the consumer must expand them (e.g. `echo -e` or printf's %b).
readonly RED='\033[0;31m'
readonly YELLOW='\033[1;33m'
readonly GREEN='\033[0;32m'
readonly CYAN='\033[0;36m'
readonly NC='\033[0m'  # reset ("no color")

# Prints the current local time as "YYYY-MM-DD HH:MM:SS".
timestamp() {
  date '+%Y-%m-%d %H:%M:%S'
}

#######################################
# Prints one colored, timestamped alert line to stdout.
# Globals:   RED, YELLOW, CYAN, GREEN, NC (read)
# Arguments: $1 - level: CRITICAL | WARNING | INFO | OK
#            $2 - message text (printed verbatim, backslashes untouched)
# Outputs:   "<icon> [<timestamp>] [<level>] <message>" in the level's color;
#            an unknown level is printed without icon or color.
# Returns:   0
#######################################
alert() {
  local level="$1" msg="$2"
  local color="" icon=""

  # NOTE(review): the icons were restored from a mis-decoded copy of this
  # file; the WARNING/INFO glyphs are a best guess -- confirm.
  case "$level" in
    CRITICAL) color="$RED";    icon="🔴 " ;;
    WARNING)  color="$YELLOW"; icon="🟠 " ;;
    INFO)     color="$CYAN";   icon="🟡 " ;;
    OK)       color="$GREEN";  icon="✅ " ;;
    *)        ;;
  esac

  # %b expands the color escapes only; the message itself goes through %s.
  printf '%b%s[%s] [%s] %s%b\n' \
    "$color" "$icon" "$(timestamp)" "$level" "$msg" "${color:+$NC}"
}

#######################################
# Prints the most recent metric lines of the training log.
# A metric line is any line matching the regex "step.*loss.*gnorm".
# Globals:   LOG_FILE (read)
# Arguments: $1 - maximum number of metric lines to print (default 20)
# Outputs:   up to $1 matching lines, oldest first; nothing if the log file
#            does not exist or contains no metric line.
# Returns:   0
#######################################
parse_metrics() {
  local n="${1:-20}"

  [[ -f "$LOG_FILE" ]] || return 0

  # Filter first and truncate afterwards, so that non-metric lines interleaved
  # in the log do not reduce the number of metric lines returned.
  # grep exits 1 when nothing matches; that is not an error here.
  grep -- "step.*loss.*gnorm" "$LOG_FILE" | tail -n "$n" || true
}

#######################################
# Extracts the numeric value that follows a field name in a log line.
# Accepts "name 1.23", "name=1.23" and "name: 1.23"; the number may be an
# integer, a decimal, and may carry an exponent (e.g. 1.5e-05).
# Arguments: $1 - log line
#            $2 - field name (used as a regular expression fragment)
# Outputs:   the first matching value, or nothing when there is no match
# Returns:   0
#######################################
extract_field() {
  local line="$1" field="$2"
  local sep='([[:space:]]*[:=][[:space:]]*|[[:space:]]+)'
  local num='([-+]?([0-9]+(\.[0-9]+)?|\.[0-9]+)([eE][-+]?[0-9]+)?)'
  local re="${field}${sep}${num}"

  # BASH_REMATCH[1] is the separator, BASH_REMATCH[2] the whole number.
  if [[ "$line" =~ $re ]]; then
    printf '%s\n' "${BASH_REMATCH[2]}"
  fi
  return 0
}

| | check_loss_zero() { |
| | local lines |
| | lines=$(parse_metrics "$ZERO_LOSS_THRESHOLD") |
| | if [[ -z "$lines" ]]; then return; fi |
| |
|
| | local zero_count=0 |
| | while IFS= read -r line; do |
| | local loss |
| | loss=$(extract_field "$line" "loss") |
| | if [[ -n "$loss" ]]; then |
| | |
| | if (( $(echo "$loss < 0.001" | bc -l 2>/dev/null || echo 0) )); then |
| | ((zero_count++)) |
| | fi |
| | fi |
| | done <<< "$lines" |
| |
|
| | if [[ $zero_count -ge $ZERO_LOSS_THRESHOLD ]]; then |
| | alert CRITICAL "Loss๊ฐ ${zero_count}ํ ์ฐ์ ~0! Labels ๋ฒ๊ทธ ๊ฐ๋ฅ์ฑ. ์ฆ์ ํ์ต ์ค๋จ!" |
| | return 1 |
| | fi |
| | return 0 |
| | } |
| |
|
#######################################
# Raises a WARNING when the latest loss exceeds LOSS_SPIKE_FACTOR times the
# average of the losses logged before it (within the last 20 metric lines).
# Needs at least 5 loss values; does nothing when the average is 0.
# Globals:   LOSS_SPIKE_FACTOR (read)
# Outputs:   one alert line when triggered
# Returns:   0
#######################################
check_loss_spike() {
  local lines line loss stats avg ratio
  local -a losses=()

  lines=$(parse_metrics 20) || true
  [[ -n "$lines" ]] || return 0

  while IFS= read -r line; do
    loss=$(extract_field "$line" "loss") || true
    if [[ -n "$loss" ]]; then
      losses+=("$loss")
    fi
  done <<< "$lines"

  local count=${#losses[@]}
  (( count >= 5 )) || return 0
  local last_loss="${losses[$((count - 1))]}"

  # One awk pass does the floating-point work (awk understands exponent
  # notation). It prints "<avg> <ratio>" only when a spike is detected.
  stats=$(printf '%s\n' "${losses[@]}" | awk -v factor="$LOSS_SPIKE_FACTOR" '
    { v[NR] = $1 + 0 }
    END {
      if (NR < 2) exit
      for (i = 1; i < NR; i++) sum += v[i]
      avg = sum / (NR - 1)
      if (avg == 0) exit
      ratio = v[NR] / avg
      if (ratio > factor + 0) printf "%.6g %.2f\n", avg, ratio
    }') || stats=""

  if [[ -n "$stats" ]]; then
    read -r avg ratio <<< "$stats"
    # "Loss spike detected! current=..., average=..., ratio=...x"
    alert WARNING "Loss spike 감지! 현재=${last_loss}, 평균=${avg}, 비율=${ratio}x"
  fi
  return 0
}

#######################################
# Compares the gradient norm of the latest metric line with the warning and
# critical thresholds.
# Globals:   GNORM_WARN, GNORM_CRITICAL (read)
# Outputs:   a CRITICAL alert above GNORM_CRITICAL, else a WARNING alert
#            above GNORM_WARN, else nothing
# Returns:   0
#######################################
check_gnorm() {
  local lines gnorm

  lines=$(parse_metrics 5) || true
  [[ -n "$lines" ]] || return 0

  # Only the last metric line matters; strip everything up to the final newline.
  gnorm=$(extract_field "${lines##*$'\n'}" "gnorm") || true
  [[ -n "$gnorm" ]] || return 0

  # awk handles exponent notation (e.g. 1.2e+02) in the comparison.
  if awk -v g="$gnorm" -v t="$GNORM_CRITICAL" 'BEGIN { exit !(g + 0 > t + 0) }'; then
    # "About to diverge. Consider stopping training."
    # NOTE(review): wording restored from a mis-decoded copy -- confirm.
    alert CRITICAL "GNorm=${gnorm} > ${GNORM_CRITICAL}! 발산 직전. 학습 중단 고려."
  elif awk -v g="$gnorm" -v t="$GNORM_WARN" 'BEGIN { exit !(g + 0 > t + 0) }'; then
    # "Sign of instability."
    alert WARNING "GNorm=${gnorm} > ${GNORM_WARN}. 불안정 징후."
  fi
  return 0
}

#######################################
# Raises a CRITICAL alert when the log file has not been modified for more
# than STALL_TIMEOUT seconds; reports a missing log file as INFO.
# Globals:   LOG_FILE, STALL_TIMEOUT (read)
# Outputs:   at most one alert line
# Returns:   0
#######################################
check_stall() {
  if [[ ! -f "$LOG_FILE" ]]; then
    # "Log file not found"
    alert INFO "로그 파일 없음: ${LOG_FILE}"
    return 0
  fi

  local last_modified now diff

  # GNU stat first, BSD stat as a fallback. If neither yields a number the
  # check is skipped: treating the mtime as 0 would report a bogus stall of
  # decades.
  if ! last_modified=$(stat -c %Y "$LOG_FILE" 2>/dev/null) \
    && ! last_modified=$(stat -f %m "$LOG_FILE" 2>/dev/null); then
    return 0
  fi
  [[ "$last_modified" =~ ^[0-9]+$ ]] || return 0

  now=$(date +%s)
  diff=$((now - last_modified))

  if (( diff > STALL_TIMEOUT )); then
    # "Log not updated for N seconds (M minutes)! Possible hang."
    alert CRITICAL "로그가 ${diff}초 ($((diff / 60))분) 동안 업데이트 없음! Hang 가능성."
  fi
  return 0
}

#######################################
# Raises a WARNING when the filesystem holding the given path is fuller than
# DISK_WARN_PCT percent.
# Globals:   DISK_WARN_PCT (read), DISK_CHECK_PATH (read, optional)
# Arguments: $1 - path to check (default: $DISK_CHECK_PATH, else /PROJECT)
# Outputs:   one alert line when triggered; nothing if df cannot report on
#            the path
# Returns:   0
#######################################
check_disk() {
  local path="${1:-${DISK_CHECK_PATH:-/PROJECT}}"
  local usage

  # -P keeps each filesystem on a single line so the column index is stable.
  # A failing df (e.g. path does not exist) simply disables the check.
  usage=$(df -P "$path" 2>/dev/null \
    | awk 'NR == 2 { sub(/%/, "", $5); print $5 }') || usage=""

  if [[ "$usage" =~ ^[0-9]+$ ]] && (( usage > DISK_WARN_PCT )); then
    # "Disk usage N% > M%. Checkpoint cleanup needed."
    alert WARNING "디스크 사용률 ${usage}% > ${DISK_WARN_PCT}%. 체크포인트 정리 필요."
  fi
  return 0
}

#######################################
# Emits an INFO alert when at least one GPU reports a utilization below
# GPU_UTIL_WARN percent. Does nothing when nvidia-smi is not available.
# Globals:   GPU_UTIL_WARN (read)
# Outputs:   one alert line when triggered
# Returns:   0
#######################################
check_gpu() {
  command -v nvidia-smi &> /dev/null || return 0

  local util
  local low_util=0
  local total_gpus=0

  while IFS= read -r util; do
    util=${util//[[:space:]]/}
    # Skip values that are not plain integers (e.g. "[N/A]").
    [[ "$util" =~ ^[0-9]+$ ]] || continue
    # Plain assignments: ((n++)) has exit status 1 when n was 0, which would
    # abort the script under `set -e`.
    total_gpus=$((total_gpus + 1))
    if (( 10#$util < GPU_UTIL_WARN )); then
      low_util=$((low_util + 1))
    fi
  done < <(nvidia-smi --query-gpu=utilization.gpu \
    --format=csv,noheader,nounits 2>/dev/null)

  if (( total_gpus > 0 && low_util > 0 )); then
    # "... Data loading bottleneck?"
    alert INFO "${low_util}/${total_gpus} GPU utilization < ${GPU_UTIL_WARN}%. 데이터 로딩 병목?"
  fi
  return 0
}

#######################################
# Prints a short status summary: the latest metric line, per-GPU memory and
# utilization (when nvidia-smi is available) and disk usage.
# Every section is best effort: a failing external tool leaves its section
# empty instead of terminating the script under `set -e` / pipefail.
# Globals:   GREEN, CYAN, NC (read), DISK_CHECK_PATH (read, optional;
#            defaults to /PROJECT)
# Outputs:   the summary on stdout
# Returns:   0
#######################################
print_status() {
  local lines disk

  lines=$(parse_metrics 1) || true
  if [[ -n "$lines" ]]; then
    # Label: "Latest log". The log line is printed verbatim.
    printf '%b최근 로그:%b %s\n' "$GREEN" "$NC" "$lines"
  fi

  if command -v nvidia-smi &> /dev/null; then
    # Label: "GPU memory"
    printf '%bGPU 메모리:%b\n' "$CYAN" "$NC"
    nvidia-smi --query-gpu=index,memory.used,memory.total,utilization.gpu \
      --format=csv,noheader 2>/dev/null | head -n 8 || true
  fi

  # Prints "used/size (use%)" for the monitored filesystem.
  disk=$(df -hP "${DISK_CHECK_PATH:-/PROJECT}" 2>/dev/null \
    | awk 'NR == 2 { print "사용: " $3 "/" $2 " (" $5 ")" }') || disk=""
  # Label: "Disk"
  printf '%b디스크:%b %s\n' "$CYAN" "$NC" "$disk"
  return 0
}

#######################################
# Prints the start-up banner.
# Globals:   LOG_FILE, CHECK_INTERVAL (read), CHECK_ONCE (read, optional)
# Outputs:   the banner on stdout; the Ctrl+C hint is omitted when
#            CHECK_ONCE is "true" because that mode exits by itself
# Returns:   0
#######################################
print_banner() {
  local rule="=================================================================="

  printf '%s\n' \
    "$rule" \
    " SFT Training Monitor" \
    " Log file: ${LOG_FILE}" \
    " Check interval: ${CHECK_INTERVAL}s"
  if [[ "${CHECK_ONCE:-false}" != true ]]; then
    printf '%s\n' " Press Ctrl+C to stop"
  fi
  printf '%s\n' "$rule"
}

print_banner

#######################################
# Runs one monitoring round: every check, a separator line, the status
# summary and a trailing blank line.
# Each step is best effort; a step returning non-zero never stops the round
# or the script.
# Outputs:   whatever the checks and print_status print
# Returns:   0
#######################################
run_all_checks() {
  local check

  for check in \
    check_loss_zero \
    check_loss_spike \
    check_gnorm \
    check_stall \
    check_disk \
    check_gpu; do
    "$check" || true
  done

  echo "---"
  print_status || true
  echo ""
}

#######################################
# Entry point: runs a single monitoring round when CHECK_ONCE is "true",
# otherwise repeats rounds every CHECK_INTERVAL seconds until interrupted.
# Globals:   CHECK_ONCE, CHECK_INTERVAL (read)
# Returns:   exits 0 after the single round; returns 1 if sleep fails
#######################################
main() {
  if [[ "${CHECK_ONCE:-false}" == true ]]; then
    run_all_checks
    exit 0
  fi

  while true; do
    run_all_checks
    # Stop instead of spinning in a tight loop when sleep cannot run
    # (e.g. an invalid CHECK_INTERVAL).
    sleep "$CHECK_INTERVAL" || return 1
  done
}

main "$@"