#!/usr/bin/env bash
#
# Watch a running process and periodically append a one-line status record
# to a status log until that process exits.
#
# Usage:
#   <script> <pid> <train_log> <checkpoint_dir> <status_log> [interval_s]
#
# Arguments:
#   pid             PID polled with `kill -0`; the watch ends once it fails.
#   train_log       File scanned for the most recent "Step <N>:..." text.
#   checkpoint_dir  Directory whose immediate sub-directories are listed.
#   status_log      File that status lines are appended to (its parent
#                   directory is created if missing).
#   interval_s      Delay between polls, passed to `sleep` (default: 60).
#
# Each status line has the form:
#   <UTC timestamp> | <step text> | gpu=<nvidia-smi csv> | ckpts=<a,b,c>
# When the watched process is gone, a "watch end" marker and the last 40
# lines of train_log are appended to status_log.
#
# `-e` is intentionally not set: every probe is best-effort and a failing
# probe must not stop the watch.
set -uo pipefail

# Print a message to stderr and exit with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

if [[ $# -lt 4 ]]; then
  echo "usage: $0 <pid> <train_log> <checkpoint_dir> <status_log> [interval_s]" >&2
  exit 1
fi

PID="$1"
TRAIN_LOG="$2"
CHECKPOINT_DIR="$3"
STATUS_LOG="$4"
INTERVAL_S="${5:-60}"

# PID 0 (or a non-numeric value) must be rejected: `kill -0 0` addresses the
# caller's own process group and always succeeds, so the loop would never end.
pid_re='^0*[1-9][0-9]*$'
if ! [[ "$PID" =~ $pid_re ]]; then
  die "error: pid must be a positive integer, got '$PID'"
fi

# A failing `sleep` is tolerated inside the loop, so an invalid or zero
# interval would make the loop spin and flood the status log. Validate up
# front; an optional s/m/h/d suffix as understood by GNU sleep is allowed.
interval_re='^[0-9]+([.][0-9]+)?[smhd]?$'
if ! [[ "$INTERVAL_S" =~ $interval_re && "$INTERVAL_S" =~ [1-9] ]]; then
  die "error: interval_s must be a positive number, got '$INTERVAL_S'"
fi

mkdir -p -- "$(dirname -- "$STATUS_LOG")" \
  || die "error: cannot create directory for status log '$STATUS_LOG'"

# Print the last "Step <N>:..." match found in TRAIN_LOG (empty if none).
# Globals: TRAIN_LOG (read)
latest_step_line() {
  # The log may not exist yet; stay quiet and let the caller substitute
  # its placeholder.
  grep -oE 'Step [0-9]+:.*' -- "$TRAIN_LOG" 2>/dev/null | tail -n 1 || true
}

# Print the first line of nvidia-smi's CSV report with the fields
# utilization.gpu, utilization.memory, memory.used, memory.free.
# Prints nothing when nvidia-smi is not installed.
latest_gpu_line() {
  command -v nvidia-smi >/dev/null 2>&1 || return 0
  nvidia-smi \
    --query-gpu=utilization.gpu,utilization.memory,memory.used,memory.free \
    --format=csv,noheader,nounits | head -n 1 || true
}

# Print the last five immediate sub-directory names of CHECKPOINT_DIR after
# a numeric sort, joined by commas. Relies on find's -printf option.
# NOTE(review): assumes directory names order sensibly under `sort -n`
# (e.g. plain step numbers) -- confirm against the checkpoint naming scheme.
# Globals: CHECKPOINT_DIR (read)
latest_ckpts() {
  find "$CHECKPOINT_DIR" -maxdepth 1 -mindepth 1 -type d -printf '%f\n' 2>/dev/null \
    | sort -n \
    | tail -n 5 \
    | paste -sd ',' -
}

printf 'watch start %s pid=%s\n' "$(date -u +%FT%TZ)" "$PID" >> "$STATUS_LOG"

# Poll for as long as the process can still be signalled.
while kill -0 "$PID" 2>/dev/null; do
  STEP_LINE="$(latest_step_line || true)"
  GPU_LINE="$(latest_gpu_line || true)"
  CKPTS="$(latest_ckpts || true)"

  # Empty probe results are replaced by fixed placeholders.
  printf '%s | %s | gpu=%s | ckpts=%s\n' \
    "$(date -u +%FT%TZ)" \
    "${STEP_LINE:-no_step_yet}" \
    "${GPU_LINE:-na}" \
    "${CKPTS:-none}" >> "$STATUS_LOG"

  # An interrupted sleep should not end the watch; the interval itself was
  # validated above, so this cannot degenerate into a busy loop.
  sleep "$INTERVAL_S" || true
done

printf 'watch end %s pid=%s\n' "$(date -u +%FT%TZ)" "$PID" >> "$STATUS_LOG"
# Keep the tail of the training log next to the end marker for context.
tail -n 40 -- "$TRAIN_LOG" >> "$STATUS_LOG" || true