# Training-run watcher: shell options, argument parsing and validation.
#
# Arguments:
#   $1 - pid of the process to watch
#   $2 - path of the training log that is scanned for "Step <n>:" lines
#   $3 - directory whose immediate sub-directories are listed as checkpoints
#   $4 - status log that this script appends to (parent dir is created)
#   $5 - optional polling interval handed to sleep (default: 60)
#
# -u makes unset variables an error and pipefail makes a pipeline fail when
# any stage fails. -e is not enabled, so a failing command does not abort
# the script by itself; critical steps below are checked explicitly.
set -uo pipefail

if [[ $# -lt 4 ]]; then
  echo "usage: $0 <pid> <train_log> <checkpoint_dir> <status_log> [interval_s]" >&2
  exit 1
fi

PID="$1"
TRAIN_LOG="$2"
CHECKPOINT_DIR="$3"
STATUS_LOG="$4"
INTERVAL_S="${5:-60}"

# kill interprets 0 and negative pids as "process group" / "every process",
# for which the liveness probe would succeed forever. Only accept a plain
# positive integer.
pid_re='^0*[1-9][0-9]*$'
if [[ ! "$PID" =~ $pid_re ]]; then
  echo "error: <pid> must be a positive integer, got '${PID}'" >&2
  exit 1
fi

# Accept what sleep understands in practice: an integer or decimal number with
# an optional s/m/h/d suffix. Anything else would make every sleep call fail
# and turn the polling loop into a busy loop.
interval_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)[smhd]?$'
if [[ ! "$INTERVAL_S" =~ $interval_re ]]; then
  echo "error: [interval_s] must be a non-negative number, got '${INTERVAL_S}'" >&2
  exit 1
fi

# Without the parent directory every later append to the status log fails,
# so stop right away instead of running a watcher that records nothing.
if ! mkdir -p -- "$(dirname -- "$STATUS_LOG")"; then
  echo "error: cannot create directory for status log '${STATUS_LOG}'" >&2
  exit 1
fi
#######################################
# Print the most recent "Step <n>:" entry found in the training log.
# Globals:   TRAIN_LOG (read)
# Arguments: none
# Outputs:   the last match, from "Step" to the end of its line, on stdout;
#            nothing when the log is unreadable or contains no match
# Returns:   0 always
#######################################
latest_step_line() {
  # The log may not have been created yet; that is not an error, and it
  # should not produce a grep complaint on stderr at every poll.
  [[ -r "$TRAIN_LOG" ]] || return 0

  # -a: treat the log as text even when it contains NUL bytes, otherwise grep
  #     classifies the file as binary and does not print the matching text.
  # --: keep a pattern/path starting with '-' from being parsed as an option.
  # grep exits non-zero when nothing matches; `|| true` keeps the status at 0.
  grep -aoE -- 'Step [0-9]+:.*' "$TRAIN_LOG" | tail -n 1 || true
}
#######################################
# Print one line of GPU statistics obtained from nvidia-smi.
# Globals:   none
# Arguments: none
# Outputs:   the first line nvidia-smi prints for the query
#            utilization.gpu, utilization.memory, memory.used, memory.free
#            (CSV, no header, no units); nothing when nvidia-smi is not
#            available or prints nothing
# Returns:   0 always
#######################################
latest_gpu_line() {
  # On a host without nvidia-smi the shell would print "command not found"
  # at every poll; print nothing instead so the caller uses its placeholder.
  command -v nvidia-smi >/dev/null 2>&1 || return 0

  # head keeps only the first output line; `|| true` hides a non-zero
  # pipeline status (nvidia-smi failure, or the pipe closing early).
  nvidia-smi \
    --query-gpu=utilization.gpu,utilization.memory,memory.used,memory.free \
    --format=csv,noheader,nounits \
    | head -n 1 || true
}
#######################################
# Print the names of the five highest-sorting checkpoint directories.
# Globals:   CHECKPOINT_DIR (read)
# Arguments: none
# Outputs:   up to five names of immediate sub-directories of CHECKPOINT_DIR,
#            in ascending natural order, joined by commas on one line;
#            an empty line when there are none or the directory is missing
# Returns:   status of the pipeline (callers treat a failure as "no output")
#######################################
latest_ckpts() {
  # find's stderr is dropped on purpose: the checkpoint directory may not
  # exist until the first checkpoint has been written.
  #
  # sort -V orders embedded numbers by value, so "step_9" sorts before
  # "step_10" and "step_100". A plain numeric sort treats names with a
  # non-numeric prefix as equal and falls back to byte order, which would
  # select the wrong "latest" entries. Purely numeric names keep their
  # numeric order. LC_ALL=C makes the ordering locale-independent.
  find "$CHECKPOINT_DIR" -maxdepth 1 -mindepth 1 -type d -printf '%f\n' 2>/dev/null \
    | LC_ALL=C sort -V \
    | tail -n 5 \
    | paste -sd ',' -
}
# Main watch loop: append one status line per polling interval to STATUS_LOG
# for as long as the watched process can be signalled, then record the end of
# the watch together with the last 40 lines of the training log.

printf 'watch start %s pid=%s\n' "$(date -u +%FT%TZ)" "$PID" >> "$STATUS_LOG"

# kill -0 sends no signal; it only reports whether PID can be signalled.
# NOTE(review): it also fails for a live process owned by another user, in
# which case the watch ends immediately - confirm that this is acceptable.
while kill -0 "$PID" 2>/dev/null; do
  # Each probe is best-effort: a failing probe yields an empty string that is
  # replaced by a placeholder in the status line below.
  STEP_LINE="$(latest_step_line || true)"
  GPU_LINE="$(latest_gpu_line || true)"
  CKPTS="$(latest_ckpts || true)"

  printf '%s | %s | gpu=%s | ckpts=%s\n' \
    "$(date -u +%FT%TZ)" \
    "${STEP_LINE:-no_step_yet}" \
    "${GPU_LINE:-na}" \
    "${CKPTS:-none}" >> "$STATUS_LOG"

  # If sleep rejects the interval (e.g. a non-numeric value) the loop would
  # spin without pause and flood the status log. Fall back to a fixed
  # 60-second pause in that case instead of continuing immediately.
  sleep "$INTERVAL_S" || sleep 60 || true
done

printf 'watch end %s pid=%s\n' "$(date -u +%FT%TZ)" "$PID" >> "$STATUS_LOG"

# Keep the tail of the training log next to the status history; a missing
# log is tolerated.
tail -n 40 -- "$TRAIN_LOG" >> "$STATUS_LOG" || true