#!/usr/bin/env bash
#
# Hourly status reporter for the korean_3b_fp8_run1 training run.
#
# Parses the most recent step line of the training log, gathers GPU, disk,
# checkpoint and process information, and passes an HTML-formatted summary
# to scripts/telegram_notify.py.
#
# Environment:
#   WORKDIR  Project root; defaults to the parent directory of this script.

set -euo pipefail

# Run size used for the progress and ETA figures.
TOTAL_STEPS=57000  # step count that corresponds to 100% progress
TOTAL_TOKENS_B=114 # token budget; printed with a "B" suffix in the report

# Filesystem layout; every path below hangs off WORKDIR.
WORKDIR="${WORKDIR:-$(cd "$(dirname "$0")/.." && pwd)}"
CKPT_DIR="${WORKDIR}/checkpoints/korean_3b_fp8_run1"
LOG_FILE="${CKPT_DIR}/train.log"
PID_FILE="${CKPT_DIR}/train.pid"
HOURLY_LOG="${CKPT_DIR}/hourly_status.log"

# Notifier command line. It is stored as a single string and expanded
# unquoted where it is invoked, so it is word-split into command + script path.
NOTIFY="python3 ${WORKDIR}/scripts/telegram_notify.py"
# ---------------------------------------------------------------------------
# ts
#
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
ts() {
  # %F is %Y-%m-%d and %T is %H:%M:%S.
  date +'%F %T'
}
# log MESSAGE...
#
# Write "[<timestamp>] <message>" to stdout; the timestamp comes from ts and
# all arguments are joined into one message.
log() {
  printf '[%s] %s\n' "$(ts)" "$*"
}
# ---------------------------------------------------------------------------
# parse_last PATTERN
#
# Print the last match of the PCRE PATTERN found in $LOG_FILE.
#
# Globals:   LOG_FILE (read)
# Arguments: $1 - Perl-compatible regex handed to `grep -oP`
# Outputs:   the last match on stdout. When grep finds nothing or cannot
#            read the file, the pipeline fails under `pipefail` and a bare
#            newline is printed, so callers capture an empty string.
parse_last() {
  local pattern="$1"
  # `--` ends option parsing so a pattern that starts with "-" is treated as
  # a regex and not as grep options.
  grep -oP -- "$pattern" "$LOG_FILE" 2>/dev/null | tail -n 1 || printf '\n'
}
# ---------------------------------------------------------------------------
# _parse_field TEXT PATTERN FALLBACK
#
# Print the first match of the PCRE PATTERN inside TEXT, or FALLBACK when
# there is no match. Only the first match is kept because `grep -o` prints
# every match on the line (e.g. both "loss" and "val_loss" values).
_parse_field() {
  local text="$1" pattern="$2" fallback="$3"
  local hit
  if hit=$(grep -oP -- "$pattern" <<<"$text" 2>/dev/null) && [[ -n "$hit" ]]; then
    printf '%s\n' "${hit%%$'\n'*}"
  else
    printf '%s\n' "$fallback"
  fi
}

# ---------------------------------------------------------------------------
# parse_log
#
# Extract the training metrics from the last "step N ... loss" line of
# $LOG_FILE.
#
# Globals read:    LOG_FILE
# Globals written: LAST_LINE, CURRENT_STEP, CURRENT_LOSS, CURRENT_LR,
#                  CURRENT_GNORM, CURRENT_TOKPS, CURRENT_MEM, CURRENT_EPOCH,
#                  LOG_TS
# Outputs: "NO_LOG" when the log file is missing, "NO_STEPS" when it holds
#          no step line (both on stdout).
# Returns: 0 when a step line was parsed, 1 otherwise.
parse_log() {
  if [[ ! -f "$LOG_FILE" ]]; then
    echo "NO_LOG"
    return 1
  fi

  # Newest line that carries both a step number and a loss value.
  LAST_LINE=$(grep -E 'step\s+[0-9]+.*loss' "$LOG_FILE" 2>/dev/null | tail -n 1) || LAST_LINE=""
  if [[ -z "$LAST_LINE" ]]; then
    echo "NO_STEPS"
    return 1
  fi

  # \K drops the label from the match so only the value is captured.
  CURRENT_STEP=$(_parse_field "$LAST_LINE" 'step\s+\K[0-9]+' "0")
  CURRENT_LOSS=$(_parse_field "$LAST_LINE" 'loss\s+\K[0-9.]+' "N/A")
  CURRENT_LR=$(_parse_field "$LAST_LINE" 'lr\s+\K[0-9.e+-]+' "N/A")
  CURRENT_GNORM=$(_parse_field "$LAST_LINE" 'gnorm\s+\K[0-9.]+' "N/A")
  CURRENT_MEM=$(_parse_field "$LAST_LINE" 'mem\s+\K[0-9.]+GB' "N/A")
  CURRENT_EPOCH=$(_parse_field "$LAST_LINE" 'epoch\s+\K[0-9]+' "0")

  # Throughput may carry thousands separators ("1,234,567"); strip them so
  # the value can be used in integer arithmetic.
  CURRENT_TOKPS=$(_parse_field "$LAST_LINE" 'tok/s\s+\K[\d,]+' "0")
  CURRENT_TOKPS=${CURRENT_TOKPS//,/}
  CURRENT_TOKPS=${CURRENT_TOKPS:-0}

  # Timestamp written in square brackets on the log line.
  LOG_TS=$(_parse_field "$LAST_LINE" \
    '\[\K[0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2}' "unknown")

  return 0
}
# ---------------------------------------------------------------------------
# compute_eta STEP TOKPS
#
# Derive progress and ETA figures from the current step and throughput,
# using integer arithmetic only (no bc), so values below 1 keep their
# leading zero ("0.1" rather than ".1").
#
# Globals read:    TOTAL_STEPS
#                  TOKENS_PER_STEP (optional, default 1048576)
# Globals written: PROGRESS_PCT        percent done, one decimal, truncated
#                  STEPS_LEFT          remaining steps, never negative
#                  TOKENS_PROCESSED_B  tokens so far / 1e9, two decimals
#                  ETA_HOURS           remaining hours, one decimal, or "N/A"
#                  ETA_DATETIME        "MM/DD HH:MM" completion time, or "N/A"
# Arguments: $1 - current step; non-numeric input is treated as 0
#            $2 - tokens per second; non-numeric or 0 yields "N/A" ETA values
#
# NOTE(review): with the default 1048576 tokens per step, TOTAL_STEPS=57000
# amounts to about 59.8B tokens, which does not match TOTAL_TOKENS_B=114.
# Confirm the real tokens-per-step value and set TOKENS_PER_STEP if needed.
compute_eta() {
  local step="$1"
  local tokps="$2"
  local tokens_per_step="${TOKENS_PER_STEP:-1048576}"

  # Values such as "N/A" or "" would otherwise abort the arithmetic below.
  [[ "$step" =~ ^[0-9]+$ ]] || step=0
  [[ "$tokps" =~ ^[0-9]+$ ]] || tokps=0
  [[ "$tokens_per_step" =~ ^[0-9]+$ ]] || tokens_per_step=1048576
  # 10# forces base 10 so a leading zero is not read as octal.
  step=$((10#$step))
  tokps=$((10#$tokps))
  tokens_per_step=$((10#$tokens_per_step))

  # Progress in tenths of a percent.
  local pct_tenths=0
  if ((TOTAL_STEPS > 0)); then
    pct_tenths=$((step * 1000 / TOTAL_STEPS))
  fi
  printf -v PROGRESS_PCT '%d.%d' "$((pct_tenths / 10))" "$((pct_tenths % 10))"

  # Clamp so a run that overshoots TOTAL_STEPS does not report a negative
  # remainder or an ETA in the past.
  STEPS_LEFT=$((TOTAL_STEPS - step))
  if ((STEPS_LEFT < 0)); then
    STEPS_LEFT=0
  fi

  # Tokens processed, in hundredths of a billion (1e9 / 100 = 1e7).
  local done_hundredths=$((step * tokens_per_step / 10000000))
  printf -v TOKENS_PROCESSED_B '%d.%02d' \
    "$((done_hundredths / 100))" "$((done_hundredths % 100))"

  if ((tokps > 0)); then
    local secs_left=$((STEPS_LEFT * tokens_per_step / tokps))
    # secs % 3600 / 360 is the tenths-of-an-hour digit.
    printf -v ETA_HOURS '%d.%d' \
      "$((secs_left / 3600))" "$((secs_left % 3600 / 360))"

    local now
    now=$(date +%s)
    # `date -d @EPOCH` is GNU-specific; fall back to "N/A" where unsupported.
    ETA_DATETIME=$(date -d "@$((now + secs_left))" '+%m/%d %H:%M' 2>/dev/null) \
      || ETA_DATETIME="N/A"
  else
    ETA_HOURS="N/A"
    ETA_DATETIME="N/A"
  fi
}
# ---------------------------------------------------------------------------
# get_gpu_summary
#
# Summarise GPU utilisation and memory as reported by nvidia-smi.
#
# Globals written:
#   GPU_AVG_UTIL   mean of the utilisation column, e.g. "60%", or "N/A"
#   GPU_TOTAL_MEM  "<used> / <total> GiB" summed over all rows, or "N/A"
#   GPU_SUMMARY    "G<idx>:<util>% <used>MiB" per row joined by " | ", or a
#                  short failure description
get_gpu_summary() {
  # Start from the failure values; they are overwritten on success.
  GPU_AVG_UTIL="N/A"
  GPU_TOTAL_MEM="N/A"

  if ! command -v nvidia-smi &>/dev/null; then
    GPU_SUMMARY="nvidia-smi not available"
    return 0
  fi

  # One CSV row per GPU: index, utilisation, memory used, memory total.
  local rows
  rows=$(nvidia-smi \
    --query-gpu=index,utilization.gpu,memory.used,memory.total \
    --format=csv,noheader,nounits 2>/dev/null || true)

  if [[ -z "$rows" ]]; then
    GPU_SUMMARY="GPU query failed"
    return 0
  fi

  # A single awk pass emits the three results, one per line, in the order
  # they are read back below.
  {
    IFS= read -r GPU_AVG_UTIL
    IFS= read -r GPU_TOTAL_MEM
    IFS= read -r GPU_SUMMARY
  } < <(awk -F', ' '
    {
      util += $2
      used += $3
      total += $4
      n++
      summary = summary sprintf("%sG%s:%s%% %sMiB", (NR > 1 ? " | " : ""), $1, $2, $3)
    }
    END {
      printf "%.0f%%\n", util / n
      printf "%.1f / %.1f GiB\n", used / 1024, total / 1024
      print summary
    }' <<<"$rows")
}
# ---------------------------------------------------------------------------
# get_disk_info
#
# Collect disk usage for the checkpoint directory and count its checkpoints.
#
# Globals read:    CKPT_DIR
# Globals written: DISK_INFO   "<used> used / <size> total (<use%>)" or "N/A"
#                  CKPT_COUNT  number of "$CKPT_DIR"/checkpoint-* entries
#                  LAST_CKPT   base name of the most recently modified
#                              checkpoint-* entry, or "none"
get_disk_info() {
  # -P keeps each filesystem on one line so row 2 is always the data row.
  DISK_INFO=$(df -hP "$CKPT_DIR" 2>/dev/null \
    | awk 'NR==2 {printf "%s used / %s total (%s)", $3, $2, $5}') || DISK_INFO=""
  DISK_INFO=${DISK_INFO:-N/A}

  # Walk the glob directly instead of parsing `ls`: the count stays a single
  # number when nothing matches, and names containing spaces stay intact.
  local entry newest=""
  CKPT_COUNT=0
  for entry in "$CKPT_DIR"/checkpoint-*; do
    # An unmatched glob is left as the literal pattern; skip it.
    [[ -e "$entry" ]] || continue
    CKPT_COUNT=$((CKPT_COUNT + 1))
    if [[ -z "$newest" || "$entry" -nt "$newest" ]]; then
      newest=$entry
    fi
  done

  if [[ -n "$newest" ]]; then
    LAST_CKPT=${newest##*/}
  else
    LAST_CKPT="none"
  fi
}
# ---------------------------------------------------------------------------
# get_process_status
#
# Report whether the process recorded in $PID_FILE is alive.
#
# Globals read:    PID_FILE
# Globals written: PROC_STATUS, one of
#                    "RUNNING (PID <pid>)"  - kill -0 on the PID succeeded
#                    "STOPPED (PID <pid>)"  - PID invalid or not signalable
#                    "NO PID FILE"          - $PID_FILE does not exist
get_process_status() {
  PROC_STATUS="UNKNOWN"

  if [[ ! -f "$PID_FILE" ]]; then
    PROC_STATUS="NO PID FILE"
    return 0
  fi

  # Read the file with all whitespace removed. An unreadable file yields an
  # empty PID instead of aborting the script under `set -e`.
  local pid
  pid=$({ tr -d '[:space:]' <"$PID_FILE"; } 2>/dev/null) || pid=""

  # Only a positive integer may reach kill: 0 or a negative value would make
  # `kill -0` probe a process group and could falsely report RUNNING.
  if [[ "$pid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$pid" 2>/dev/null; then
    PROC_STATUS="RUNNING (PID $pid)"
  else
    PROC_STATUS="STOPPED (PID $pid)"
  fi
}
# ---------------------------------------------------------------------------
# build_and_send
#
# Assemble the HTML status message from the globals filled in by the
# collector functions and pass it to the notifier command.
#
# Globals read:    CURRENT_STEP CURRENT_LOSS CURRENT_TOKPS CURRENT_LR
#                  CURRENT_GNORM CURRENT_MEM CURRENT_EPOCH LOG_TS PROC_STATUS
#                  PROGRESS_PCT TOKENS_PROCESSED_B STEPS_LEFT ETA_HOURS
#                  ETA_DATETIME GPU_AVG_UTIL GPU_TOTAL_MEM LAST_CKPT
#                  CKPT_COUNT DISK_INFO TOTAL_STEPS TOTAL_TOKENS_B NOTIFY
# Globals written: PROGRESS_BAR - 20-cell bar, or "[<pct>%]" when
#                  PROGRESS_PCT is not a number
# Returns: 0 when the notifier succeeded, 1 when it failed.
build_and_send() {
  local step="$CURRENT_STEP"
  local loss="$CURRENT_LOSS"
  local tokps="$CURRENT_TOKPS"

  # Check mark only while the training process is reported as running.
  local status_icon="❌"
  if [[ "$PROC_STATUS" == RUNNING* ]]; then
    status_icon="✅"
  fi

  # Progress bar with a fixed width. Each cell is appended explicitly, so
  # 0% and 100% also produce exactly bar_width cells.
  # NOTE: PROGRESS_BAR is not part of the message below; it is only exported
  # as a global.
  local -r bar_width=20
  if [[ "$PROGRESS_PCT" =~ ^([0-9]*)(\.([0-9])[0-9]*)?$ ]]; then
    local whole="${BASH_REMATCH[1]:-0}"
    local tenth="${BASH_REMATCH[3]:-0}"
    # Work in tenths of a percent: 1000 tenths correspond to a full bar.
    local filled=$(((10#$whole * 10 + tenth) * bar_width / 1000))
    if ((filled > bar_width)); then
      filled=$bar_width
    fi
    local i
    PROGRESS_BAR=""
    for ((i = 0; i < bar_width; i++)); do
      if ((i < filled)); then
        PROGRESS_BAR+="█"
      else
        PROGRESS_BAR+="░"
      fi
    done
  else
    PROGRESS_BAR="[$PROGRESS_PCT%]"
  fi

  local msg
  msg="$(cat <<EOF
<b>FRANKENSTALLM 3B β Hourly Status</b>
<i>$(ts)</i>

$status_icon <b>Process:</b> $PROC_STATUS

<b>Progress</b>
Step: <code>$step / $TOTAL_STEPS</code> ($PROGRESS_PCT%)
Tokens: <code>${TOKENS_PROCESSED_B}B / ${TOTAL_TOKENS_B}B</code>
Epoch: <code>$CURRENT_EPOCH</code>
Last log: <code>$LOG_TS</code>

<b>Training Metrics</b>
Loss: <code>$loss</code>
LR: <code>$CURRENT_LR</code>
Gnorm: <code>$CURRENT_GNORM</code>
Tok/s: <code>$tokps</code>
Mem: <code>$CURRENT_MEM</code>

<b>ETA</b>
Steps left: <code>$STEPS_LEFT</code>
Remaining: <code>~$ETA_HOURS h</code>
Est. done: <code>$ETA_DATETIME</code>

<b>GPU</b>
Avg util: <code>$GPU_AVG_UTIL</code>
Total mem: <code>$GPU_TOTAL_MEM</code>

<b>Checkpoints</b>
Last saved: <code>$LAST_CKPT</code>
Total: <code>$CKPT_COUNT</code> checkpoints

<b>Disk</b>
<code>$DISK_INFO</code>
EOF
)"

  log "Sending hourly status report (step $step)..."
  # NOTIFY holds the command and the script path in one string, so it is
  # left unquoted on purpose to be word-split.
  # shellcheck disable=SC2086
  if ! $NOTIFY "$msg"; then
    log "ERROR: Failed to send Telegram message."
    return 1
  fi
  log "Status report sent."
}
# ---------------------------------------------------------------------------
# main
#
# Produce one hourly report. When the training log cannot be parsed, a short
# warning is sent instead (a notifier failure is ignored in that case) and
# the function still returns 0. Otherwise the collectors run in order and
# the full report is built and sent.
main() {
  log "=== Hourly status START ==="

  if ! parse_log; then
    log "Cannot parse log β sending minimal status."
    # NOTIFY is one "command script" string and is word-split on purpose.
    # The trailing substitution prints the PID file content followed by
    # "(PID found)", or "(no PID file)" when the file cannot be read.
    # shellcheck disable=SC2086
    $NOTIFY "<b>FRANKENSTALLM 3B</b> β Status check at $(ts)

<b>WARNING:</b> Cannot read training log at:
<code>$LOG_FILE</code>

Process status: $(if cat "$PID_FILE" 2>/dev/null; then echo "(PID found)"; else echo "(no PID file)"; fi)" || true
    return 0
  fi

  # The collectors fill the globals that build_and_send reads.
  compute_eta "$CURRENT_STEP" "$CURRENT_TOKPS"
  get_gpu_summary
  get_disk_info
  get_process_status
  build_and_send

  log "=== Hourly status END ==="
}
# ---------------------------------------------------------------------------
# Entry point: run the report, forwarding the script's arguments to main.
main "$@"