#!/usr/bin/env bash
# Watch a training process: while <pid> is alive, periodically append the
# latest training step, GPU stats and newest checkpoints to <status_log>.
#
# Usage: <script> <pid> <train_log> <checkpoint_dir> <status_log> [interval_s]
#   pid             process id to watch (positive integer)
#   train_log       training log file that is scanned for progress
#   checkpoint_dir  directory whose sub-directories are listed as checkpoints
#   status_log      file the status lines are appended to
#   interval_s      pause between polls, passed to sleep(1); defaults to 60

# -u: unset variables are errors; pipefail: a pipeline fails if any stage does.
# -e is not enabled, so a failing probe command does not abort the script.
set -uo pipefail

if [[ $# -lt 4 ]]; then
  echo "usage: $0 <pid> <train_log> <checkpoint_dir> <status_log> [interval_s]" >&2
  exit 1
fi

PID="$1"
TRAIN_LOG="$2"
CHECKPOINT_DIR="$3"
STATUS_LOG="$4"
INTERVAL_S="${5:-60}"

# Reject 0, negative and non-numeric pids: `kill -0 0` / `kill -0 -N` probe a
# whole process group, which would keep the watch loop alive indefinitely.
if [[ ! "$PID" =~ ^[1-9][0-9]*$ ]]; then
  echo "error: <pid> must be a positive integer, got '$PID'" >&2
  exit 1
fi

# Reject intervals sleep(1) would refuse: a failing sleep returns immediately,
# turning the poll loop into a busy loop that floods the status log.
interval_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)[smhd]?$'
if [[ ! "$INTERVAL_S" =~ $interval_re ]]; then
  echo "error: [interval_s] must be a number with optional s/m/h/d suffix, got '$INTERVAL_S'" >&2
  exit 1
fi

# Make sure the status log's parent directory exists before the first append.
# `--` keeps paths that begin with '-' from being parsed as options.
if ! mkdir -p -- "$(dirname -- "$STATUS_LOG")"; then
  echo "error: cannot create directory for status log '$STATUS_LOG'" >&2
  exit 1
fi
#######################################
# Print the most recent "Step <n>: ..." entry found in the training log.
# Globals:   TRAIN_LOG (read)
# Arguments: none
# Outputs:   the last match, from "Step" to the end of its line, on stdout;
#            nothing when the log is unreadable or has no match yet
# Returns:   always 0
#######################################
latest_step_line() {
  # The log may not exist yet when polling starts; stay quiet instead of
  # emitting a grep error on every poll.
  [[ -r "$TRAIN_LOG" ]] || return 0
  # -a: a log containing NUL bytes (or bytes invalid in the current locale)
  #     would otherwise be classified as binary, and grep would report
  #     "binary file matches" instead of printing the matching text.
  # --: protects log paths that start with '-'.
  # || true: "no match" (grep exit 1) is an expected state, not an error.
  grep -aoE 'Step [0-9]+:.*' -- "$TRAIN_LOG" | tail -n 1 || true
}
#######################################
# Print the first row of an nvidia-smi utilization/memory query.
# Arguments: none
# Outputs:   one CSV row (no header, no units) with the queried fields
#            utilization.gpu, utilization.memory, memory.used, memory.free;
#            nothing if the pipeline produces no output
# Returns:   always 0
#######################################
latest_gpu_line() {
  local -a smi_args=(
    --query-gpu=utilization.gpu,utilization.memory,memory.used,memory.free
    --format=csv,noheader,nounits
  )
  # Only the first output line is kept (presumably one row per GPU, so this
  # reports the first device). Failures are ignored: the trailing `|| true`
  # makes the function succeed unconditionally.
  nvidia-smi "${smi_args[@]}" | head -n 1 || true
}
#######################################
# Print the five highest-ordered checkpoint directories as a comma list.
# Globals:   CHECKPOINT_DIR (read)
# Arguments: none
# Outputs:   up to five immediate sub-directory names of CHECKPOINT_DIR,
#            ascending, joined with ','; an empty line if there are none
# Returns:   always 0
#######################################
latest_ckpts() {
  # 2>/dev/null: the checkpoint directory may not exist yet; that is reported
  #   as "no checkpoints" rather than as an error on every poll.
  # sort -V: orders embedded numbers numerically, so "checkpoint-200" sorts
  #   before "checkpoint-1000". `sort -n` only handles names that *start*
  #   with a number and falls back to byte order for prefixed names. Purely
  #   numeric names keep the same order as before.
  # LC_ALL=C: locale-independent, reproducible ordering.
  # || true: keeps the function successful under pipefail when find fails,
  #   matching the other probe helpers.
  find "$CHECKPOINT_DIR" -maxdepth 1 -mindepth 1 -type d -printf '%f\n' 2>/dev/null \
    | LC_ALL=C sort -V \
    | tail -n 5 \
    | paste -sd ',' - \
    || true
}
# Mark the beginning of the watch in the status log (UTC timestamp + pid).
start_stamp="$(date -u +%FT%TZ)"
printf 'watch start %s pid=%s\n' "$start_stamp" "$PID" >> "$STATUS_LOG"

# Poll until the watched process can no longer be signalled.
while :; do
  # kill -0 delivers no signal; it only reports whether $PID can be signalled.
  kill -0 "$PID" 2>/dev/null || break

  # Gather the three probes first; each one is best-effort.
  step_info="$(latest_step_line || true)"
  gpu_info="$(latest_gpu_line || true)"
  ckpt_info="$(latest_ckpts || true)"

  # Timestamp is taken after the probes, right before the line is written.
  now_stamp="$(date -u +%FT%TZ)"
  printf -v status_row '%s | %s | gpu=%s | ckpts=%s' \
    "$now_stamp" \
    "${step_info:-no_step_yet}" \
    "${gpu_info:-na}" \
    "${ckpt_info:-none}"
  printf '%s\n' "$status_row" >> "$STATUS_LOG"

  # A failing or interrupted sleep must not end the watch.
  sleep "$INTERVAL_S" || true
done

# Mark the end of the watch and keep the tail of the training log alongside
# it, so the status log shows how the run finished.
end_stamp="$(date -u +%FT%TZ)"
printf 'watch end %s pid=%s\n' "$end_stamp" "$PID" >> "$STATUS_LOG"
tail -n 40 "$TRAIN_LOG" >> "$STATUS_LOG" || true
|