#!/usr/bin/env bash
#
# Hourly training-status reporter.
#
# Reads the newest "step ... loss" line from the training log, derives
# progress and an ETA, samples GPU / disk / process state, and hands an
# HTML-formatted summary to scripts/telegram_notify.py.
#
# Environment:
#   WORKDIR  Project root. Defaults to the parent of this script's directory.

set -euo pipefail

# --- Paths -------------------------------------------------------------
# BASH_SOURCE (rather than $0) keeps the default correct when the file is
# sourced; `--` protects against directory names that start with a dash.
WORKDIR="${WORKDIR:-$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")/.." && pwd)}"
CKPT_DIR="$WORKDIR/checkpoints/korean_3b_fp8_run1"
LOG_FILE="$CKPT_DIR/train.log"
PID_FILE="$CKPT_DIR/train.pid"
# Not referenced elsewhere in this script; presumably the redirect target
# used by the scheduler that runs it -- TODO confirm.
HOURLY_LOG="$CKPT_DIR/hourly_status.log"
# Kept as a plain string because callers expand it unquoted ($NOTIFY "...").
# A WORKDIR containing whitespace is therefore split into several words.
NOTIFY="python3 $WORKDIR/scripts/telegram_notify.py"

# --- Run constants -----------------------------------------------------
TOTAL_STEPS=57000
TOTAL_TOKENS_B=114

# ---- Logging helpers --------------------------------------------------
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
ts() {
  date '+%Y-%m-%d %H:%M:%S'
}
# Write all arguments to stdout as a single line prefixed with "[<ts>] ".
# Arguments: $@ - message words (joined with a space)
log() {
  printf '[%s] %s\n' "$(ts)" "$*"
}

# ---- Generic log lookup -----------------------------------------------
# Print the last match of a PCRE pattern found anywhere in $LOG_FILE.
# Globals:   LOG_FILE (read)
# Arguments: $1 - pattern handed to `grep -oP`
# Outputs:   the last match on stdout, or an empty line when nothing
#            matches or the file cannot be read
# Returns:   always 0
parse_last() {
  local pattern="$1"
  # -e / -- keep a pattern or path that begins with "-" from being parsed
  # as a grep option.
  grep -oP -e "$pattern" -- "$LOG_FILE" 2>/dev/null | tail -n 1 || echo ""
}

# ---- Training-log parsing ---------------------------------------------
# Print the first PCRE match of a pattern inside one line, or a fallback.
# Arguments: $1 - text to search, $2 - `grep -oP` pattern, $3 - fallback
_parse_log_field() {
  local line="$1" pattern="$2" fallback="$3" value
  # head -n 1: a field that occurs twice on the line must not yield a
  # multi-line value (it would break later arithmetic on step / tok/s).
  value=$(grep -oP -e "$pattern" <<<"$line" | head -n 1) || value=""
  printf '%s\n' "${value:-$fallback}"
}

# Extract the metrics of the most recent "step <n> ... loss" line.
# Globals:   LOG_FILE (read); writes LAST_LINE, CURRENT_STEP, CURRENT_LOSS,
#            CURRENT_LR, CURRENT_GNORM, CURRENT_TOKPS, CURRENT_MEM,
#            CURRENT_EPOCH, LOG_TS
# Outputs:   "NO_LOG" or "NO_STEPS" on stdout when parsing is impossible
# Returns:   0 when a step line was found, 1 otherwise
parse_log() {
  if [[ ! -f "$LOG_FILE" ]]; then
    echo "NO_LOG"
    return 1
  fi

  # grep exits non-zero when nothing matches; under pipefail that must
  # degrade to an empty LAST_LINE instead of aborting the script.
  LAST_LINE=$(grep -E -e 'step\s+[0-9]+.*loss' -- "$LOG_FILE" 2>/dev/null \
    | tail -n 1) || LAST_LINE=""
  if [[ -z "$LAST_LINE" ]]; then
    echo "NO_STEPS"
    return 1
  fi

  CURRENT_STEP=$(_parse_log_field "$LAST_LINE" 'step\s+\K[0-9]+' "0")
  CURRENT_LOSS=$(_parse_log_field "$LAST_LINE" 'loss\s+\K[0-9.]+' "N/A")
  CURRENT_LR=$(_parse_log_field "$LAST_LINE" 'lr\s+\K[0-9.e+-]+' "N/A")
  CURRENT_GNORM=$(_parse_log_field "$LAST_LINE" 'gnorm\s+\K[0-9.]+' "N/A")
  CURRENT_MEM=$(_parse_log_field "$LAST_LINE" 'mem\s+\K[0-9.]+GB' "N/A")
  CURRENT_EPOCH=$(_parse_log_field "$LAST_LINE" 'epoch\s+\K[0-9]+' "0")

  # tok/s is logged with thousands separators ("36,500"); drop them so the
  # value can be used in integer arithmetic.
  CURRENT_TOKPS=$(_parse_log_field "$LAST_LINE" 'tok/s\s+\K[\d,]+' "0")
  CURRENT_TOKPS="${CURRENT_TOKPS//,/}"
  CURRENT_TOKPS="${CURRENT_TOKPS:-0}"

  # Timestamp taken from a leading "[YYYY-MM-DD HH:MM:SS" on the line.
  LOG_TS=$(_parse_log_field "$LAST_LINE" \
    '\[\K[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}' "unknown")

  return 0
}

# ---- Progress / ETA ---------------------------------------------------
# Derive progress, processed tokens and an ETA from step and throughput.
# Globals:   TOTAL_STEPS (read); writes PROGRESS_PCT, STEPS_LEFT,
#            TOKENS_PROCESSED_B, ETA_HOURS, ETA_DATETIME
# Arguments: $1 - current step (non-negative integer)
#            $2 - throughput in tokens/second (integer; 0 = unknown)
# Notes:     Uses integer arithmetic only, truncating exactly like the
#            previous `bc scale=N` calls did, but without requiring bc and
#            with a leading zero for values below 1 ("0.1", not ".1").
compute_eta() {
  local step="$1"
  local tokps="$2"
  # NOTE(review): TOTAL_STEPS x 1048576 is about 59.8B tokens, which does not
  # match TOTAL_TOKENS_B=114 declared at the top of the file -- confirm the
  # tokens-per-step constant.
  local tokens_per_step=1048576

  # Anything that is not a plain non-negative integer counts as 0; the 10#
  # prefix stops zero-padded values from being read as octal.
  [[ "$step" =~ ^[0-9]+$ ]] || step=0
  [[ "$tokps" =~ ^[0-9]+$ ]] || tokps=0
  step=$(( 10#$step ))
  tokps=$(( 10#$tokps ))

  # Percentage with one decimal, computed in tenths of a percent.
  local pct_tenths=$(( step * 1000 / TOTAL_STEPS ))
  printf -v PROGRESS_PCT '%d.%d' $(( pct_tenths / 10 )) $(( pct_tenths % 10 ))

  # Never report negative work once the run passes TOTAL_STEPS.
  STEPS_LEFT=$(( TOTAL_STEPS - step ))
  (( STEPS_LEFT >= 0 )) || STEPS_LEFT=0

  # Billions of tokens with two decimals, computed in hundredths.
  local tok_hundredths=$(( step * tokens_per_step / 10000000 ))
  printf -v TOKENS_PROCESSED_B '%d.%02d' \
    $(( tok_hundredths / 100 )) $(( tok_hundredths % 100 ))

  if (( tokps > 0 )); then
    local secs_left=$(( STEPS_LEFT * tokens_per_step / tokps ))
    local hour_tenths=$(( secs_left / 360 ))
    printf -v ETA_HOURS '%d.%d' $(( hour_tenths / 10 )) $(( hour_tenths % 10 ))

    # `date -d @epoch` is a GNU extension; fall back to N/A elsewhere.
    local eta_epoch=$(( $(date +%s) + secs_left ))
    ETA_DATETIME=$(date -d "@$eta_epoch" '+%m/%d %H:%M' 2>/dev/null) \
      || ETA_DATETIME="N/A"
  else
    ETA_HOURS="N/A"
    ETA_DATETIME="N/A"
  fi
}

# ---- GPU state --------------------------------------------------------
# Summarise utilisation and memory of all GPUs reported by nvidia-smi.
# Globals:   writes GPU_SUMMARY  ("G0:90% 40960MiB | G1:...", or an error
#                                text when no data is available),
#                   GPU_AVG_UTIL  ("85%" or "N/A"),
#                   GPU_TOTAL_MEM ("<used> / <total> GiB" or "N/A")
# Returns:   always 0 -- missing tooling is reported through the globals
get_gpu_summary() {
  GPU_SUMMARY="nvidia-smi not available"
  GPU_AVG_UTIL="N/A"
  GPU_TOTAL_MEM="N/A"
  if ! command -v nvidia-smi &>/dev/null; then
    return 0
  fi

  # One CSV row per GPU: index, utilisation %, memory used, memory total.
  local raw
  raw=$(nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
    --format=csv,noheader,nounits 2>/dev/null || true)

  # Single awk pass producing three lines: average utilisation, memory
  # totals (input treated as MiB, hence / 1024 for GiB), per-GPU summary.
  # Exits non-zero when no row has all four fields, which also avoids a
  # division by zero.
  local parsed=""
  if [[ -n "$raw" ]]; then
    parsed=$(awk -F', *' '
      NF >= 4 {
        util += $2; used += $3; total += $4; n++
        cells = cells (n > 1 ? " | " : "") sprintf("G%s:%s%% %sMiB", $1, $2, $3)
      }
      END {
        if (n == 0) exit 1
        printf "%.0f%%\n%.1f / %.1f GiB\n%s\n", util / n, used / 1024, total / 1024, cells
      }' <<<"$raw") || parsed=""
  fi

  if [[ -z "$parsed" ]]; then
    GPU_SUMMARY="GPU query failed"
    return 0
  fi

  {
    IFS= read -r GPU_AVG_UTIL
    IFS= read -r GPU_TOTAL_MEM
    IFS= read -r GPU_SUMMARY
  } <<<"$parsed"
}

# ---- Disk / checkpoints -----------------------------------------------
# Collect disk usage for the checkpoint directory and checkpoint stats.
# Globals:   CKPT_DIR (read); writes
#            DISK_INFO  "<used> used / <size> total (<pct>)" or "N/A"
#            CKPT_COUNT number of "$CKPT_DIR"/checkpoint-* entries
#            LAST_CKPT  basename of the most recently modified one, or "none"
get_disk_info() {
  # -P keeps each filesystem on one output line so NR==2 is always the
  # complete data row, even with long device names.
  DISK_INFO=$(df -hP -- "$CKPT_DIR" 2>/dev/null \
    | awk 'NR==2 {printf "%s used / %s total (%s)", $3, $2, $5}') || DISK_INFO=""
  DISK_INFO="${DISK_INFO:-N/A}"

  # Walk the glob instead of parsing `ls`: the old `ls | wc -l || echo 0`
  # produced "0<newline>0" under pipefail when no checkpoint existed.
  local entry newest=""
  CKPT_COUNT=0
  for entry in "$CKPT_DIR"/checkpoint-*; do
    # An unmatched glob stays literal; skip it.
    [[ -e "$entry" ]] || continue
    CKPT_COUNT=$(( CKPT_COUNT + 1 ))
    if [[ -z "$newest" || "$entry" -nt "$newest" ]]; then
      newest="$entry"
    fi
  done

  LAST_CKPT="${newest:+${newest##*/}}"
  LAST_CKPT="${LAST_CKPT:-none}"
}

# ---- Training process -------------------------------------------------
# Report whether the process recorded in $PID_FILE is alive.
# Globals:   PID_FILE (read); writes PROC_STATUS, one of
#            "RUNNING (PID <n>)", "STOPPED (PID <text>)", "NO PID FILE"
get_process_status() {
  PROC_STATUS="UNKNOWN"
  if [[ ! -f "$PID_FILE" ]]; then
    PROC_STATUS="NO PID FILE"
    return 0
  fi

  local pid
  pid=$(tr -d '[:space:]' 2>/dev/null <"$PID_FILE") || pid=""

  # Only a positive integer is handed to kill: "0" or a negative number
  # would address a process group and could wrongly report RUNNING.
  # kill -0 sends no signal; it only checks that the PID can be signalled,
  # so a live process owned by another user also shows as STOPPED.
  if [[ "$pid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$pid" 2>/dev/null; then
    PROC_STATUS="RUNNING (PID $pid)"
  else
    PROC_STATUS="STOPPED (PID $pid)"
  fi
}

# ---- Report assembly --------------------------------------------------
# Assemble the HTML status message and pass it to the $NOTIFY command.
# Globals:   reads CURRENT_STEP, CURRENT_LOSS, CURRENT_TOKPS, CURRENT_LR,
#            CURRENT_GNORM, CURRENT_MEM, CURRENT_EPOCH, LOG_TS, PROC_STATUS,
#            PROGRESS_PCT, TOKENS_PROCESSED_B, STEPS_LEFT, ETA_HOURS,
#            ETA_DATETIME, GPU_AVG_UTIL, GPU_TOTAL_MEM, LAST_CKPT,
#            CKPT_COUNT, DISK_INFO, TOTAL_STEPS, TOTAL_TOKENS_B, NOTIFY;
#            writes PROGRESS_BAR
# Returns:   0 when the notifier succeeded, 1 otherwise
build_and_send() {
  local step="$CURRENT_STEP"
  local loss="$CURRENT_LOSS"
  local tokps="$CURRENT_TOKPS"

  local status_icon="❌"
  if [[ "$PROC_STATUS" == RUNNING* ]]; then
    status_icon="✅"
  fi

  # 20-cell bar, one cell per 5%. Only the integer part of the percentage
  # matters because 5 is an integer divisor. Built with counted loops:
  # `printf '%0.s█' $(seq 1 0)` still prints one glyph, which made the old
  # bar 21 cells wide at 0% and at 100%.
  local pct_int="${PROGRESS_PCT%%.*}"
  [[ "$pct_int" =~ ^[0-9]+$ ]] || pct_int=0
  local bar_filled=$(( 10#$pct_int * 20 / 100 ))
  (( bar_filled <= 20 )) || bar_filled=20
  local bar_empty=$(( 20 - bar_filled ))
  local i
  PROGRESS_BAR=""
  for (( i = 0; i < bar_filled; i++ )); do PROGRESS_BAR+="█"; done
  for (( i = 0; i < bar_empty; i++ )); do PROGRESS_BAR+="░"; done

  local msg
  msg="$(cat <<EOF
<b>FRANKENSTALLM 3B — Hourly Status</b>
<i>$(ts)</i>

$status_icon <b>Process:</b> $PROC_STATUS

<b>Progress</b>
Step: <code>$step / $TOTAL_STEPS</code> ($PROGRESS_PCT%)
<code>$PROGRESS_BAR</code>
Tokens: <code>${TOKENS_PROCESSED_B}B / ${TOTAL_TOKENS_B}B</code>
Epoch: <code>$CURRENT_EPOCH</code>
Last log: <code>$LOG_TS</code>

<b>Training Metrics</b>
Loss: <code>$loss</code>
LR: <code>$CURRENT_LR</code>
Gnorm: <code>$CURRENT_GNORM</code>
Tok/s: <code>$tokps</code>
Mem: <code>$CURRENT_MEM</code>

<b>ETA</b>
Steps left: <code>$STEPS_LEFT</code>
Remaining: <code>~$ETA_HOURS h</code>
Est. done: <code>$ETA_DATETIME</code>

<b>GPU</b>
Avg util: <code>$GPU_AVG_UTIL</code>
Total mem: <code>$GPU_TOTAL_MEM</code>

<b>Checkpoints</b>
Last saved: <code>$LAST_CKPT</code>
Total: <code>$CKPT_COUNT</code> checkpoints

<b>Disk</b>
<code>$DISK_INFO</code>
EOF
)"

  log "Sending hourly status report (step $step)..."
  # NOTIFY holds "<interpreter> <script>" in one string, so it is expanded
  # unquoted on purpose to split into command and argument.
  # shellcheck disable=SC2086
  $NOTIFY "$msg" || {
    log "ERROR: Failed to send Telegram message."
    return 1
  }
  log "Status report sent."
}

# ---- Orchestration ----------------------------------------------------
| | main() { |
| | log "=== Hourly status START ===" |
| |
|
| | parse_log || { |
| | log "Cannot parse log β sending minimal status." |
| | $NOTIFY "<b>FRANKENSTALLM 3B</b> β Status check at $(ts) |
| | |
| | <b>WARNING:</b> Cannot read training log at: |
| | <code>$LOG_FILE</code> |
| | |
| | Process status: $(cat "$PID_FILE" 2>/dev/null && echo "(PID found)" || echo "(no PID file)")" || true |
| | return 0 |
| | } |
| |
|
| | compute_eta "$CURRENT_STEP" "$CURRENT_TOKPS" |
| | get_gpu_summary |
| | get_disk_info |
| | get_process_status |
| | build_and_send |
| |
|
| | log "=== Hourly status END ===" |
| | } |

# ---- Entry point ------------------------------------------------------
# Forward the script's command-line arguments to main unchanged.
main "$@"