frankenstallm / source /scripts /monitor_training.sh
pathcosmos's picture
Upload folder using huggingface_hub (#17)
48ecd01
#!/usr/bin/env bash
# =============================================================================
# monitor_training.sh — real-time SFT training monitor with anomaly detection
#
# Usage:
#   bash scripts/monitor_training.sh                      # default log path
#   bash scripts/monitor_training.sh /path/to/train.log   # custom log path
#   bash scripts/monitor_training.sh --check-once         # run the checks once, then exit
#
# ๊ฐ์‹œ ํ•ญ๋ชฉ:
# ๐Ÿ”ด loss = 0.0000 (3 step ์—ฐ์†) โ†’ Labels ๋ฒ„๊ทธ
# ๐Ÿ”ด gnorm > 50.0 โ†’ ๋ฐœ์‚ฐ ์ง์ „
# ๐Ÿ”ด ๋กœ๊ทธ 5๋ถ„ ์ด์ƒ ๋ฉˆ์ถค โ†’ Hang
# ๐ŸŸ  loss spike (3ร— ์ด๋™ํ‰๊ท ) โ†’ Bad batch / LR
# ๐ŸŸ  gnorm > 10.0 โ†’ ๋ถˆ์•ˆ์ •
# ๐ŸŸ  ๋””์Šคํฌ > 80% โ†’ ์ •๋ฆฌ ํ•„์š”
# ๐ŸŸก GPU util < 50% โ†’ ๋ณ‘๋ชฉ
# =============================================================================
set -euo pipefail

# ---- Configuration ----------------------------------------------------------
DEFAULT_LOG_FILE="checkpoints/korean_1b_sft/train.log"
LOG_FILE="$DEFAULT_LOG_FILE"
CHECK_INTERVAL=30        # polling interval, in seconds
ZERO_LOSS_THRESHOLD=3    # alert when loss is ~0 for this many consecutive steps
GNORM_WARN=10.0
GNORM_CRITICAL=50.0
LOSS_SPIKE_FACTOR=3.0    # spike when the latest loss exceeds N x the recent average
STALL_TIMEOUT=300        # seconds (5 min) without a log update before flagging a stall
DISK_WARN_PCT=80
GPU_UTIL_WARN=50
CHECK_ONCE=false

# Parse the command line into LOG_FILE and CHECK_ONCE.
#   --check-once   may appear in any position (previously only honoured as $1,
#                  so "script train.log --check-once" silently looped forever)
#   <path>         the first non-empty, non-flag argument is the log file;
#                  later positional arguments are ignored
# Each call starts again from the defaults, so it is safe to call repeatedly.
parse_cli_args() {
  local arg
  local path_seen=false
  LOG_FILE="$DEFAULT_LOG_FILE"
  CHECK_ONCE=false
  for arg in "$@"; do
    case "$arg" in
      --check-once)
        CHECK_ONCE=true
        ;;
      "")
        # An empty argument keeps the default path, matching ${1:-default}.
        ;;
      *)
        if [[ "$path_seen" == false ]]; then
          LOG_FILE="$arg"
          path_seen=true
        fi
        ;;
    esac
  done
  return 0
}

parse_cli_args "$@"
# ---- Colors -----------------------------------------------------------------
# ANSI SGR colour codes, stored as literal backslash sequences; they only turn
# into real escape characters when printed with escape interpretation enabled.
NC='\033[0m'           # reset all attributes
CYAN='\033[0;36m'
GREEN='\033[0;32m'
YELLOW='\033[1;33m'    # bold yellow
RED='\033[0;31m'
# ---- Helper -----------------------------------------------------------------
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
timestamp() {
  local fmt='+%Y-%m-%d %H:%M:%S'
  date "$fmt"
}
# Print one colourised, timestamped alert line to stdout.
#   $1 - severity: CRITICAL, WARNING, INFO or OK
#   $2 - message text; printed verbatim, so backslashes in it (for example in
#        a log path) are NOT treated as escape sequences
# Reads the colour globals RED, YELLOW, CYAN, GREEN and NC.
# An unrecognised severity is still printed (uncoloured) rather than dropped.
alert() {
  local level="$1" msg="$2"
  local prefix
  case "$level" in
    CRITICAL) prefix="${RED}๐Ÿ”ด" ;;
    WARNING)  prefix="${YELLOW}๐ŸŸ " ;;
    INFO)     prefix="${CYAN}๐ŸŸก" ;;
    OK)       prefix="${GREEN}โœ…" ;;
    *)
      printf '[%s] [%s] %s\n' "$(timestamp)" "$level" "$msg"
      return 0
      ;;
  esac
  # %b expands the colour escapes only; the message goes through %s untouched.
  printf '%b [%s] [%s] %s%b\n' "$prefix" "$(timestamp)" "$level" "$msg" "$NC"
}
# ---- Parse last N log lines -------------------------------------------------
# Print the last N metric lines of $LOG_FILE (N = $1, default 20).
# Expected log format:
#   [timestamp] [INFO] step XXXX | loss X.XXXX | lr X.XXe-XX | gnorm X.XXX | ...
# Metric lines are selected BEFORE taking the tail, so unrelated lines
# (checkpoint saves, eval output, warnings) interleaved in the log do not
# shrink the sample the checks see.
# Prints nothing useful and returns 0 when the file is missing or has no
# metric lines.
parse_metrics() {
  local n="${1:-20}"
  if [[ ! -f "$LOG_FILE" ]]; then
    echo ""
    return 0
  fi
  # grep exits 1 on "no match"; that is not an error for the monitor.
  grep -- "step.*loss.*gnorm" "$LOG_FILE" | tail -n "$n" || true
}
# Print the numeric value that follows a named field in one log line.
#   $1 - log line
#   $2 - field name (loss, gnorm, lr)
# The name must not be preceded by a letter, digit or underscore, so asking
# for "loss" does not pick up "val_loss". Accepts integers, decimals and
# scientific notation (3, 0.0000, 2.00e-05). Prints nothing when the field is
# absent; always returns 0.
extract_field() {
  local line="$1" field="$2"
  local re="(^|[^[:alnum:]_])${field}[[:space:]]+([0-9]+(\.[0-9]+)?([eE][+-]?[0-9]+)?)"
  if [[ "$line" =~ $re ]]; then
    printf '%s\n' "${BASH_REMATCH[2]}"
  fi
  return 0
}
# ---- Check functions --------------------------------------------------------
# Raise a CRITICAL alert when every one of the last ZERO_LOSS_THRESHOLD metric
# lines reports a loss below 0.001.
# Returns 1 when the alert fires, 0 otherwise (including "no data yet").
# The comparison is done in awk rather than bc: awk is always available and
# understands scientific notation (1.0e-05), whereas a missing bc or an
# exponent-form value silently turned this check into a no-op.
check_loss_zero() {
  local lines
  lines=$(parse_metrics "$ZERO_LOSS_THRESHOLD") || lines=""
  if [[ -z "$lines" ]]; then return 0; fi

  local zero_count=0
  local line loss
  while IFS= read -r line; do
    loss=$(extract_field "$line" "loss") || true
    [[ -n "$loss" ]] || continue
    if awk -v v="$loss" 'BEGIN { exit !(v + 0 < 0.001) }'; then
      # Plain assignment: ((zero_count++)) returns status 1 on the first hit.
      zero_count=$((zero_count + 1))
    fi
  done <<< "$lines"

  if (( zero_count >= ZERO_LOSS_THRESHOLD )); then
    alert CRITICAL "Loss๊ฐ€ ${zero_count}ํšŒ ์—ฐ์† ~0! Labels ๋ฒ„๊ทธ ๊ฐ€๋Šฅ์„ฑ. ์ฆ‰์‹œ ํ•™์Šต ์ค‘๋‹จ!"
    return 1
  fi
  return 0
}
# Raise a WARNING when the most recent loss is more than LOSS_SPIKE_FACTOR
# times the mean of the losses that precede it in the last 20 metric lines.
# Needs at least 5 loss samples; always returns 0.
# All arithmetic happens in a single awk pass instead of one bc process per
# sample, so the check keeps working without bc installed and with losses
# printed in scientific notation.
check_loss_spike() {
  local lines
  lines=$(parse_metrics 20) || lines=""
  if [[ -z "$lines" ]]; then return 0; fi

  local -a losses=()
  local line loss
  while IFS= read -r line; do
    loss=$(extract_field "$line" "loss") || true
    if [[ -n "$loss" ]]; then
      losses+=("$loss")
    fi
  done <<< "$lines"

  local count=${#losses[@]}
  if (( count < 5 )); then return 0; fi

  local last_loss="${losses[count - 1]}"

  # awk prints "<avg> <ratio>" only when the newest sample is a spike; it
  # prints nothing when the preceding average is zero or the ratio is fine.
  local stats
  stats=$(printf '%s\n' "${losses[@]}" | awk -v factor="$LOSS_SPIKE_FACTOR" '
    { v[NR] = $1 + 0 }
    END {
      if (NR < 2) { exit }
      for (i = 1; i < NR; i++) { sum += v[i] }
      avg = sum / (NR - 1)
      if (avg <= 0) { exit }
      ratio = v[NR] / avg
      if (ratio > factor + 0) { printf "%.4f %.2f\n", avg, ratio }
    }') || stats=""

  if [[ -n "$stats" ]]; then
    local avg ratio
    read -r avg ratio <<< "$stats"
    alert WARNING "Loss spike ๊ฐ์ง€! ํ˜„์žฌ=${last_loss}, ํ‰๊ท =${avg}, ๋น„์œจ=${ratio}x"
  fi
  return 0
}
# Inspect the gradient norm on the newest metric line and alert:
#   CRITICAL when gnorm > GNORM_CRITICAL, WARNING when gnorm > GNORM_WARN.
# Always returns 0.
# The thresholds are compared in awk, which parses exponent-form values such
# as 1.2e+02; bc rejects those, so very large norms used to go unreported.
check_gnorm() {
  local lines
  lines=$(parse_metrics 5) || lines=""
  if [[ -z "$lines" ]]; then return 0; fi

  # Keep only the text after the final newline, i.e. the newest metric line.
  local last_line="${lines##*$'\n'}"
  local gnorm
  gnorm=$(extract_field "$last_line" "gnorm") || true
  if [[ -z "$gnorm" ]]; then return 0; fi

  local severity
  severity=$(awk -v g="$gnorm" -v crit="$GNORM_CRITICAL" -v warn="$GNORM_WARN" '
    BEGIN {
      if (g + 0 > crit + 0) { print "CRITICAL" } else if (g + 0 > warn + 0) { print "WARNING" }
    }') || severity=""

  case "$severity" in
    CRITICAL)
      alert CRITICAL "GNorm=${gnorm} > ${GNORM_CRITICAL}! ๋ฐœ์‚ฐ ์ง์ „. ํ•™์Šต ์ค‘๋‹จ ๊ณ ๋ ค."
      ;;
    WARNING)
      alert WARNING "GNorm=${gnorm} > ${GNORM_WARN}. ๋ถˆ์•ˆ์ • ์ง•ํ›„."
      ;;
  esac
  return 0
}
# Raise a CRITICAL alert when $LOG_FILE has not been modified for more than
# STALL_TIMEOUT seconds. Emits INFO when the file does not exist or when its
# modification time cannot be read. Always returns 0.
check_stall() {
  if [[ ! -f "$LOG_FILE" ]]; then
    alert INFO "๋กœ๊ทธ ํŒŒ์ผ ์—†์Œ: ${LOG_FILE}"
    return 0
  fi

  # GNU stat first, then the BSD/macOS spelling. The old "|| echo 0" fallback
  # made a failed stat look like a log idle since 1970 and raised a false
  # CRITICAL; an unknown mtime now skips the check instead.
  local last_modified
  last_modified=$(stat -c %Y -- "$LOG_FILE" 2>/dev/null) \
    || last_modified=$(stat -f %m -- "$LOG_FILE" 2>/dev/null) \
    || last_modified=""
  if [[ ! "$last_modified" =~ ^[0-9]+$ ]]; then
    alert INFO "Cannot read modification time of ${LOG_FILE}; stall check skipped."
    return 0
  fi

  local now
  now=$(date +%s)
  local diff=$((now - last_modified))
  if (( diff > STALL_TIMEOUT )); then
    alert CRITICAL "๋กœ๊ทธ๊ฐ€ ${diff}์ดˆ ($(( diff/60 ))๋ถ„) ๋™์•ˆ ์—…๋ฐ์ดํŠธ ์—†์Œ! Hang ๊ฐ€๋Šฅ์„ฑ."
  fi
  return 0
}
# Raise a WARNING when the filesystem holding the monitored path is more than
# DISK_WARN_PCT percent full. Always returns 0.
#   $1 - optional path to inspect; defaults to $DISK_PATH, then to /PROJECT
# Uses "df -P" so each filesystem is reported on exactly one line, and checks
# that the parsed value is an integer before comparing; an unknown path or an
# unparsable value is silently skipped.
check_disk() {
  local target="${1:-${DISK_PATH:-/PROJECT}}"
  local usage
  usage=$(df -P "$target" 2>/dev/null \
    | awk 'NR==2 { sub(/%/, "", $5); print $5 }') || usage=""
  if [[ "$usage" =~ ^[0-9]+$ ]] && (( 10#$usage > DISK_WARN_PCT )); then
    alert WARNING "๋””์Šคํฌ ์‚ฌ์šฉ๋ฅ  ${usage}% > ${DISK_WARN_PCT}%. ์ฒดํฌํฌ์ธํŠธ ์ •๋ฆฌ ํ•„์š”."
  fi
  return 0
}
# Emit an INFO alert when at least one GPU reports utilization below
# GPU_UTIL_WARN percent. Does nothing when nvidia-smi is not available.
# Always returns 0.
# Rows that are not plain integers (for example "[N/A]") are skipped, so they
# neither raise an arithmetic error nor inflate the GPU count.
check_gpu() {
  if ! command -v nvidia-smi &>/dev/null; then return 0; fi

  local low_util=0
  local total_gpus=0
  local util
  while IFS= read -r util; do
    util="${util//[[:space:]]/}"
    [[ "$util" =~ ^[0-9]+$ ]] || continue
    total_gpus=$((total_gpus + 1))
    if (( 10#$util < GPU_UTIL_WARN )); then
      low_util=$((low_util + 1))
    fi
  done < <(nvidia-smi --query-gpu=utilization.gpu --format=csv,noheader,nounits 2>/dev/null)

  if (( total_gpus > 0 && low_util > 0 )); then
    alert INFO "${low_util}/${total_gpus} GPU utilization < ${GPU_UTIL_WARN}%. ๋ฐ์ดํ„ฐ ๋กœ๋”ฉ ๋ณ‘๋ชฉ?"
  fi
  return 0
}
# ---- Status summary ---------------------------------------------------------
# Print a short status summary: the newest metric line, per-GPU memory and
# utilization (when nvidia-smi exists), and disk usage of $DISK_PATH
# (default /PROJECT).
# Every probe is best-effort. The script runs under "set -euo pipefail" and
# this function is called without "|| true", so an unguarded failure of df
# (e.g. /PROJECT missing) or nvidia-smi used to terminate the whole monitor.
print_status() {
  local lines
  lines=$(parse_metrics 1) || lines=""
  if [[ -n "$lines" ]]; then
    # The log line goes through %s so its content is printed verbatim.
    printf '%b %s\n' "${GREEN}์ตœ๊ทผ ๋กœ๊ทธ:${NC}" "$lines"
  fi

  if command -v nvidia-smi &>/dev/null; then
    echo -e "${CYAN}GPU ๋ฉ”๋ชจ๋ฆฌ:${NC}"
    nvidia-smi --query-gpu=index,memory.used,memory.total,utilization.gpu \
      --format=csv,noheader 2>/dev/null | head -8 || true
  fi

  local disk_path="${DISK_PATH:-/PROJECT}"
  local disk
  disk=$(df -h "$disk_path" 2>/dev/null \
    | awk 'NR==2 {print "์‚ฌ์šฉ: "$3"/"$2" ("$5")"}') || disk=""
  echo -e "${CYAN}๋””์Šคํฌ:${NC} ${disk:-N/A}"
  return 0
}
# ---- Main loop --------------------------------------------------------------
# Startup banner: shows which log file is watched and how often it is polled.
cat <<EOF
==================================================================
 SFT Training Monitor
 Log file: ${LOG_FILE}
 Check interval: ${CHECK_INTERVAL}s
 Press Ctrl+C to stop
==================================================================
EOF
# Run every check once, in a fixed order, then print the status summary.
# A check's non-zero return status is ignored so the remaining checks still
# run; the output ends with a "---" separator, the summary and a blank line.
run_all_checks() {
  local check
  for check in \
    check_loss_zero \
    check_loss_spike \
    check_gnorm \
    check_stall \
    check_disk \
    check_gpu; do
    "$check" || true
  done
  echo "---"
  print_status
  echo ""
}
# CHECK_ONCE holds the command name "true" or "false", so it is executed
# directly as the loop condition.
#   false -> poll forever: run the checks, sleep CHECK_INTERVAL seconds, repeat
#   true  -> skip the loop, run the checks a single time and exit 0
until $CHECK_ONCE; do
  run_all_checks
  sleep "$CHECK_INTERVAL"
done
run_all_checks
exit 0