File size: 9,306 Bytes
48ecd01
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
#!/usr/bin/env bash
# =============================================================================
# hourly_status.sh — FRANKENSTALLM 3B Hourly Training Status Report (Telegram)
# Run: every hour via cron
# Sends a rich formatted message with progress, loss, ETA, GPU/disk summary.
# =============================================================================
# Strict mode: abort on errors, on unset variables and on failures anywhere
# inside a pipeline.
set -o errexit -o nounset -o pipefail

# ─── Paths ───────────────────────────────────────────────────────────────────
# WORKDIR may be supplied through the environment; when it is unset or empty it
# falls back to the parent of the directory that holds this script.
if [[ -z "${WORKDIR:-}" ]]; then
    WORKDIR="$(cd "$(dirname "$0")/.." && pwd)"
fi
CKPT_DIR="${WORKDIR}/checkpoints/korean_3b_fp8_run1"   # checkpoint directory of the run
LOG_FILE="${CKPT_DIR}/train.log"                       # training log parsed for metrics
PID_FILE="${CKPT_DIR}/train.pid"                       # PID of the training process
HOURLY_LOG="${CKPT_DIR}/hourly_status.log"
# Command string used to deliver messages; callers expand it unquoted so that
# it splits into "python3" and the script path.
NOTIFY="python3 ${WORKDIR}/scripts/telegram_notify.py"

# Training targets shown in the report.
TOTAL_STEPS=57000
TOTAL_TOKENS_B=114   # billion tokens target (57K steps × batch)

# ─── Helpers ─────────────────────────────────────────────────────────────────
# Print the current local time as "YYYY-MM-DD HH:MM:SS".
ts() {
    date +'%Y-%m-%d %H:%M:%S'
}
# Write all arguments to stdout as a single line prefixed with "[<timestamp>] ".
# The timestamp comes from ts.
log() {
    printf '[%s] %s\n' "$(ts)" "$*"
}

# Safely get last matching value from log
# Print the last substring of $LOG_FILE that matches the PCRE in $1.
# grep errors (for example a missing log file) are silenced; when the pipeline
# reports failure an empty line is printed instead.
parse_last() {
    local regex="$1"
    if ! grep -oP "$regex" "$LOG_FILE" 2>/dev/null | tail -n 1; then
        echo ""
    fi
}

# ─── Parse training log ──────────────────────────────────────────────────────
# Print the first capture group of the extended regex $2 as found in the text
# $1; print the fallback $3 when the regex does not match.
_parse_log_field() {
    local text="$1" regex="$2" fallback="$3"
    if [[ "$text" =~ $regex ]]; then
        printf '%s\n' "${BASH_REMATCH[1]}"
    else
        printf '%s\n' "$fallback"
    fi
}

# Extract the metrics of the most recent training step from $LOG_FILE.
#
# Globals read:
#   LOG_FILE
# Globals written:
#   LAST_LINE, CURRENT_STEP, CURRENT_LOSS, CURRENT_LR, CURRENT_GNORM,
#   CURRENT_TOKPS, CURRENT_MEM, CURRENT_EPOCH, LOG_TS
# Outputs:
#   "NO_LOG" on stdout when the log file does not exist,
#   "NO_STEPS" on stdout when it contains no "step <n> ... loss" line.
# Returns:
#   0 when a step line was parsed, 1 otherwise.
#
# Every field is taken from its FIRST occurrence on the line, so each value is
# always a single token, and the fallbacks do not depend on `pipefail`.
parse_log() {
    if [[ ! -f "$LOG_FILE" ]]; then
        echo "NO_LOG"
        return 1
    fi

    # Most recent line that reports a step together with a loss.
    LAST_LINE=$(grep -E 'step\s+[0-9]+.*loss' "$LOG_FILE" 2>/dev/null | tail -n 1) || LAST_LINE=""
    if [[ -z "$LAST_LINE" ]]; then
        echo "NO_STEPS"
        return 1
    fi

    CURRENT_STEP=$(_parse_log_field "$LAST_LINE" 'step[[:space:]]+([0-9]+)' "0")
    CURRENT_LOSS=$(_parse_log_field "$LAST_LINE" 'loss[[:space:]]+([0-9.]+)' "N/A")
    CURRENT_LR=$(_parse_log_field "$LAST_LINE" 'lr[[:space:]]+([0-9.e+-]+)' "N/A")
    CURRENT_GNORM=$(_parse_log_field "$LAST_LINE" 'gnorm[[:space:]]+([0-9.]+)' "N/A")
    CURRENT_MEM=$(_parse_log_field "$LAST_LINE" 'mem[[:space:]]+([0-9.]+GB)' "N/A")
    CURRENT_EPOCH=$(_parse_log_field "$LAST_LINE" 'epoch[[:space:]]+([0-9]+)' "0")

    # Throughput is logged with thousands separators ("36,000"); strip them so
    # the value can be used in integer arithmetic, and never leave it empty.
    local raw_tokps
    raw_tokps=$(_parse_log_field "$LAST_LINE" 'tok/s[[:space:]]+([0-9,]+)' "0")
    CURRENT_TOKPS="${raw_tokps//,/}"
    CURRENT_TOKPS="${CURRENT_TOKPS:-0}"

    # Timestamp of the log line itself: "[YYYY-MM-DD HH:MM:SS".
    LOG_TS=$(_parse_log_field "$LAST_LINE" \
        '\[([0-9]{4}-[0-9]{2}-[0-9]{2} [0-9]{2}:[0-9]{2}:[0-9]{2})' "unknown")

    return 0
}

# ─── Calculate progress & ETA ────────────────────────────────────────────────
# Derive progress, token and ETA figures for the status report.
#
# Arguments:
#   $1 - current training step (non-negative integer)
#   $2 - current throughput in tokens/second (integer); 0, empty or any
#        non-numeric value means "unknown" and yields N/A for the ETA fields
# Globals read:
#   TOTAL_STEPS      - planned number of training steps
#   TOKENS_PER_STEP  - optional override for the tokens consumed per step
#                      (default 1048576)
# Globals written:
#   PROGRESS_PCT       - percent complete, one decimal, truncated ("12.3")
#   STEPS_LEFT         - remaining steps, never negative
#   TOKENS_PROCESSED_B - tokens processed so far in billions, two decimals
#   ETA_HOURS          - remaining hours, one decimal, or "N/A"
#   ETA_DATETIME       - estimated completion as "MM/DD HH:MM", or "N/A"
#
# All arithmetic is 64-bit shell integer math, so the function does not need
# `bc` and always prints a leading zero ("0.5", not ".5").
compute_eta() {
    local step="$1"
    local tokps="$2"
    local total="${TOTAL_STEPS:-0}"

    # Default: bs=4 × accum=8 × 8 GPUs = 256 sequences × 4096 tokens
    #          = 1,048,576 tokens per step.
    # NOTE(review): 57000 steps × 1,048,576 ≈ 59.8B tokens, which does not
    # agree with the 114B target in TOTAL_TOKENS_B (that would need ~2M
    # tokens/step). Confirm the real value and set TOKENS_PER_STEP if needed.
    local tokens_per_step="${TOKENS_PER_STEP:-1048576}"

    # Anything that is not a plain non-negative integer is treated as 0 (or
    # the default) instead of triggering an arithmetic error.
    [[ "$step" =~ ^[0-9]+$ ]] || step=0
    [[ "$tokps" =~ ^[0-9]+$ ]] || tokps=0
    [[ "$total" =~ ^[0-9]+$ ]] || total=0
    [[ "$tokens_per_step" =~ ^[0-9]+$ ]] || tokens_per_step=1048576
    # Force base 10 so values with leading zeros are not read as octal.
    step=$(( 10#$step ))
    tokps=$(( 10#$tokps ))
    total=$(( 10#$total ))
    tokens_per_step=$(( 10#$tokens_per_step ))

    # Progress in tenths of a percent.
    local pct_tenths=0
    if (( total > 0 )); then
        pct_tenths=$(( step * 1000 / total ))
    fi
    printf -v PROGRESS_PCT '%d.%d' $(( pct_tenths / 10 )) $(( pct_tenths % 10 ))

    # Steps remaining, clamped once the target has been passed.
    STEPS_LEFT=$(( total - step ))
    if (( STEPS_LEFT < 0 )); then
        STEPS_LEFT=0
    fi

    # Tokens processed so far, in hundredths of a billion.
    local done_hundredths=$(( step * tokens_per_step / 10000000 ))
    printf -v TOKENS_PROCESSED_B '%d.%02d' \
        $(( done_hundredths / 100 )) $(( done_hundredths % 100 ))

    # ETA from the current throughput.
    if (( tokps > 0 )); then
        local secs_left=$(( STEPS_LEFT * tokens_per_step / tokps ))
        local eta_tenths=$(( secs_left / 360 ))   # tenths of an hour
        printf -v ETA_HOURS '%d.%d' $(( eta_tenths / 10 )) $(( eta_tenths % 10 ))
        # `date -d @<epoch>` is a GNU extension; fall back to N/A elsewhere.
        ETA_DATETIME=$(date -d "@$(( $(date +%s) + secs_left ))" '+%m/%d %H:%M' 2>/dev/null) \
            || ETA_DATETIME="N/A"
    else
        ETA_HOURS="N/A"
        ETA_DATETIME="N/A"
    fi
}

# ─── GPU summary ─────────────────────────────────────────────────────────────
# Summarise GPU utilisation and memory as reported by nvidia-smi.
#
# Globals written:
#   GPU_SUMMARY   - per-GPU one-liner "G0:95% 40000MiB | G1:94% 40100MiB",
#                   or an explanatory message when no data is available
#   GPU_AVG_UTIL  - mean utilisation across GPUs ("95%"), or "N/A"
#   GPU_TOTAL_MEM - "<used> / <total> GiB" summed over all GPUs, or "N/A"
get_gpu_summary() {
    # Start from the "no data" values; the success path overwrites them.
    GPU_AVG_UTIL="N/A"
    GPU_TOTAL_MEM="N/A"

    if ! command -v nvidia-smi &>/dev/null; then
        GPU_SUMMARY="nvidia-smi not available"
        return
    fi

    # One CSV row per GPU: index, utilisation %, memory used, memory total.
    local gpu_rows
    gpu_rows=$(nvidia-smi --query-gpu=index,utilization.gpu,memory.used,memory.total \
        --format=csv,noheader,nounits 2>/dev/null || echo "")

    if [[ -z "$gpu_rows" ]]; then
        GPU_SUMMARY="GPU query failed"
        return
    fi

    # Mean of the utilisation column.
    GPU_AVG_UTIL=$(awk -F', ' \
        '{ util_total += $2; gpus++ } END { printf "%.0f%%", util_total / gpus }' \
        <<< "$gpu_rows")

    # Memory columns are divided by 1024 for the GiB display.
    GPU_TOTAL_MEM=$(awk -F', ' \
        '{ in_use += $3; capacity += $4 } END { printf "%.1f / %.1f GiB", in_use / 1024, capacity / 1024 }' \
        <<< "$gpu_rows")

    # Per-GPU entries joined with " | ".
    GPU_SUMMARY=$(awk -F', ' \
        '{ printf "%sG%s:%s%% %sMiB", (NR > 1 ? " | " : ""), $1, $2, $3 }' \
        <<< "$gpu_rows")
}

# ─── Disk usage ──────────────────────────────────────────────────────────────
# Collect disk usage of the checkpoint volume and checkpoint statistics.
#
# Globals read:
#   CKPT_DIR
# Globals written:
#   DISK_INFO  - "<used> used / <size> total (<use%>)", or "N/A" when df fails
#   CKPT_COUNT - number of "$CKPT_DIR"/checkpoint-* entries (0 when none)
#   LAST_CKPT  - basename of the most recently modified checkpoint, or "none"
get_disk_info() {
    # -P keeps every filesystem on a single output line, so the awk field
    # positions stay valid even when the device name is long.
    DISK_INFO=$(df -hP "$CKPT_DIR" 2>/dev/null \
        | awk 'NR==2 {printf "%s used / %s total (%s)", $3, $2, $5}') || DISK_INFO=""
    [[ -n "$DISK_INFO" ]] || DISK_INFO="N/A"

    # Walk the glob directly instead of parsing `ls`: names with whitespace
    # are handled, and an empty directory yields exactly "0" / "none" (the
    # previous `ls | wc -l || echo 0` produced "0<newline>0" under pipefail).
    local entry newest="" count=0
    for entry in "$CKPT_DIR"/checkpoint-*; do
        # Without a match the pattern stays literal; skip it.
        [[ -e "$entry" ]] || continue
        count=$(( count + 1 ))
        if [[ -z "$newest" || "$entry" -nt "$newest" ]]; then
            newest="$entry"
        fi
    done

    CKPT_COUNT="$count"
    if [[ -n "$newest" ]]; then
        LAST_CKPT="${newest##*/}"
    else
        LAST_CKPT="none"
    fi
}

# ─── Process status ──────────────────────────────────────────────────────────
# Determine whether the training process recorded in $PID_FILE is alive.
#
# Globals read:
#   PID_FILE
# Globals written:
#   PROC_STATUS - "RUNNING (PID <pid>)", "STOPPED (PID <pid>)" or "NO PID FILE"
get_process_status() {
    PROC_STATUS="UNKNOWN"

    if [[ ! -f "$PID_FILE" ]]; then
        PROC_STATUS="NO PID FILE"
        return 0
    fi

    # Strip all whitespace; an unreadable file yields an empty PID instead of
    # aborting the script under `set -e`.
    local pid
    { pid=$(tr -d '[:space:]' < "$PID_FILE"); } 2>/dev/null || pid=""

    # Only probe strictly positive integers: `kill -0 0` targets the caller's
    # own process group and would always report success.
    # NOTE(review): kill -0 also fails when the process exists but belongs to
    # another user, which is then reported as STOPPED.
    if [[ "$pid" =~ ^[1-9][0-9]*$ ]] && kill -0 "$pid" 2>/dev/null; then
        PROC_STATUS="RUNNING (PID $pid)"
    else
        PROC_STATUS="STOPPED (PID $pid)"
    fi
}

# ─── Build & send message ────────────────────────────────────────────────────
# Compose the hourly HTML status message and deliver it through $NOTIFY.
#
# Globals read:
#   CURRENT_STEP, CURRENT_LOSS, CURRENT_TOKPS, CURRENT_EPOCH, CURRENT_LR,
#   CURRENT_GNORM, CURRENT_MEM, LOG_TS, PROC_STATUS, PROGRESS_PCT,
#   TOTAL_STEPS, TOKENS_PROCESSED_B, TOTAL_TOKENS_B, STEPS_LEFT, ETA_HOURS,
#   ETA_DATETIME, GPU_AVG_UTIL, GPU_TOTAL_MEM, LAST_CKPT, CKPT_COUNT,
#   DISK_INFO, NOTIFY
# Globals written:
#   PROGRESS_BAR - 20-character bar of filled/empty blocks
# Returns:
#   0 when the message was handed to the notifier, 1 when the notifier failed.
build_and_send() {
    local step="$CURRENT_STEP"
    local loss="$CURRENT_LOSS"
    local tokps="$CURRENT_TOKPS"

    # Status icon
    local status_icon
    if [[ "$PROC_STATUS" == RUNNING* ]]; then
        status_icon="✅"   # green check
    else
        status_icon="❌"   # red X
    fi

    # Progress bar (always exactly 20 cells). One cell stands for 5%, so the
    # integer part of the percentage is enough to pick the number of filled
    # cells. Anything unparsable counts as 0%.
    local pct_whole="${PROGRESS_PCT%%.*}"
    [[ "$pct_whole" =~ ^[0-9]+$ ]] || pct_whole=0
    local bar_filled=$(( 10#$pct_whole / 5 ))
    if (( bar_filled > 20 )); then
        bar_filled=20
    fi
    # Built cell by cell: `printf '%0.s█' $(seq 1 0)` would still print one
    # block because printf emits its format once even without arguments.
    local bar="" cell
    for (( cell = 0; cell < 20; cell++ )); do
        if (( cell < bar_filled )); then
            bar+="█"
        else
            bar+="░"
        fi
    done
    PROGRESS_BAR="$bar"

    local msg
    msg="$(cat <<EOF
<b>FRANKENSTALLM 3B — Hourly Status</b>
<i>$(ts)</i>

$status_icon <b>Process:</b> $PROC_STATUS

<b>Progress</b>
Step: <code>$step / $TOTAL_STEPS</code>  ($PROGRESS_PCT%)
<code>$PROGRESS_BAR</code>
Tokens: <code>${TOKENS_PROCESSED_B}B / ${TOTAL_TOKENS_B}B</code>
Epoch: <code>$CURRENT_EPOCH</code>
Last log: <code>$LOG_TS</code>

<b>Training Metrics</b>
Loss:   <code>$loss</code>
LR:     <code>$CURRENT_LR</code>
Gnorm:  <code>$CURRENT_GNORM</code>
Tok/s:  <code>$tokps</code>
Mem:    <code>$CURRENT_MEM</code>

<b>ETA</b>
Steps left: <code>$STEPS_LEFT</code>
Remaining:  <code>~$ETA_HOURS h</code>
Est. done:  <code>$ETA_DATETIME</code>

<b>GPU</b>
Avg util: <code>$GPU_AVG_UTIL</code>
Total mem: <code>$GPU_TOTAL_MEM</code>

<b>Checkpoints</b>
Last saved: <code>$LAST_CKPT</code>
Total: <code>$CKPT_COUNT</code> checkpoints

<b>Disk</b>
<code>$DISK_INFO</code>
EOF
)"

    log "Sending hourly status report (step $step)..."
    # NOTIFY holds a whole command line ("python3 <script>"), so it is expanded
    # unquoted on purpose to split it into command and arguments.
    # shellcheck disable=SC2086
    $NOTIFY "$msg" || {
        log "ERROR: Failed to send Telegram message."
        return 1
    }
    log "Status report sent."
}

# ─── Main ────────────────────────────────────────────────────────────────────
# Entry point: parse the training log, gather system information and send the
# hourly report. When the log cannot be parsed a short warning is sent instead.
#
# Globals read:
#   LOG_FILE, NOTIFY, plus everything written by the helper functions
# Returns:
#   0 when the log could not be parsed (the warning is best-effort), otherwise
#   the status of build_and_send.
main() {
    log "=== Hourly status START ==="

    if ! parse_log; then
        log "Cannot parse log — sending minimal status."

        # Say what is actually wrong: a missing log file and a log that has no
        # step lines yet are different situations.
        local problem
        if [[ -f "$LOG_FILE" ]]; then
            problem="No step/loss lines found yet in training log:"
        else
            problem="Cannot read training log at:"
        fi

        # Same liveness check as the full report, so both messages agree.
        get_process_status || PROC_STATUS="UNKNOWN"

        # NOTIFY holds a whole command line; expanded unquoted on purpose.
        # Delivery of the warning is best-effort, hence `|| true`.
        # shellcheck disable=SC2086
        $NOTIFY "<b>FRANKENSTALLM 3B</b> — Status check at $(ts)

<b>WARNING:</b> $problem
<code>$LOG_FILE</code>

Process status: $PROC_STATUS" || true
        return 0
    fi

    compute_eta "$CURRENT_STEP" "$CURRENT_TOKPS"
    get_gpu_summary
    get_disk_info
    get_process_status
    build_and_send

    log "=== Hourly status END ==="
}

# Run the report, forwarding all command-line arguments to main.
main "$@"