File size: 1,254 Bytes
d93804e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
#!/usr/bin/env bash
#
# Watch a process and append periodic status lines to a status log.
#
# Usage: <script> <pid> <train_log> <checkpoint_dir> <status_log> [interval_s]
#   pid             ID of the process to watch
#   train_log       log file scanned for "Step N:" lines
#   checkpoint_dir  directory whose immediate subdirectories are listed
#   status_log      file that status lines are appended to; its parent
#                   directory is created if missing
#   interval_s      value handed to sleep(1) between samples (default: 60)
#
# Note: errexit (-e) is not enabled, so a failing probe command does not
# abort the script.
set -uo pipefail

if [[ $# -lt 4 ]]; then
  echo "usage: $0 <pid> <train_log> <checkpoint_dir> <status_log> [interval_s]" >&2
  exit 1
fi

PID="$1"
TRAIN_LOG="$2"
CHECKPOINT_DIR="$3"
STATUS_LOG="$4"
INTERVAL_S="${5:-60}"

# Reject PIDs that cannot name a single process: `kill -0 0` targets the
# caller's own process group (and would therefore always succeed), while a
# non-numeric value makes the liveness check fail immediately.
pid_re='^0*[1-9][0-9]*$'
if ! [[ "$PID" =~ $pid_re ]]; then
  echo "error: pid must be a positive integer, got '$PID'" >&2
  exit 1
fi

# sleep(1) returns at once when given a malformed duration; since the polling
# loop ignores sleep's exit status, a bad interval would turn it into a busy
# loop. Accept a non-negative decimal with an optional s/m/h/d suffix.
interval_re='^([0-9]+([.][0-9]*)?|[.][0-9]+)[smhd]?$'
if ! [[ "$INTERVAL_S" =~ $interval_re ]]; then
  echo "error: interval_s must be a non-negative number, got '$INTERVAL_S'" >&2
  exit 1
fi

# Without the parent directory every later append to STATUS_LOG would fail,
# so stop here instead of running a watcher that cannot record anything.
if ! mkdir -p -- "$(dirname -- "$STATUS_LOG")"; then
  echo "error: cannot create directory for status log '$STATUS_LOG'" >&2
  exit 1
fi

#######################################
# Print the last "Step N:..." match found in the training log.
# Globals:   TRAIN_LOG (read)
# Arguments: none
# Outputs:   the final match (from "Step" to end of line) on stdout, or
#            nothing when there is no match or the log is not readable
# Returns:   0 always
#######################################
latest_step_line() {
  # The log may not exist yet when the watcher starts; stay silent instead of
  # emitting a grep error on every poll.
  [[ -r "$TRAIN_LOG" ]] || return 0
  # -a: treat the log as text even if it contains NUL or invalid bytes;
  # otherwise grep reports "binary file matches" instead of the matched line.
  grep -aoE -- 'Step [0-9]+:.*' "$TRAIN_LOG" | tail -n 1 || true
}

#######################################
# Print the first line of an nvidia-smi CSV query.
# Globals:   none
# Arguments: none
# Outputs:   first output line of the query on stdout (nothing if the
#            command produces no output)
# Returns:   0 always
#######################################
latest_gpu_line() {
  # Queried fields, in output order; header and units are suppressed.
  local -a smi_args=(
    --query-gpu=utilization.gpu,utilization.memory,memory.used,memory.free
    --format=csv,noheader,nounits
  )
  nvidia-smi "${smi_args[@]}" | head -n 1 || true
}

#######################################
# Print the five highest-sorting immediate subdirectory names of the
# checkpoint directory as a comma-separated list.
# Globals:   CHECKPOINT_DIR (read)
# Arguments: none
# Outputs:   e.g. "checkpoint-200,checkpoint-900,checkpoint-1000" on stdout;
#            an empty line when there are no subdirectories or the directory
#            cannot be read (find's errors are discarded)
# Returns:   status of the pipeline
#######################################
latest_ckpts() {
  # Version sort (-V) compares embedded numbers numerically, so names such as
  # "checkpoint-200" order before "checkpoint-1000". A plain numeric sort
  # treats any name with a non-numeric prefix as 0 and falls back to byte
  # order, which picks the wrong "latest" entries. Purely numeric names sort
  # the same either way.
  find "$CHECKPOINT_DIR" -maxdepth 1 -mindepth 1 -type d -printf '%f\n' 2>/dev/null \
    | sort -V \
    | tail -n 5 \
    | paste -sd ',' -
}

# Mark the beginning of the watch in the status log.
printf 'watch start %s pid=%s\n' "$(date -u +%FT%TZ)" "$PID" >> "$STATUS_LOG"

# Take one sample per interval for as long as signal 0 can be sent to PID.
while :; do
  kill -0 "$PID" 2>/dev/null || break

  # Probe results may be empty; the printf below substitutes placeholders.
  STEP_LINE="$(latest_step_line)" || true
  GPU_LINE="$(latest_gpu_line)" || true
  CKPTS="$(latest_ckpts)" || true

  # One status record: UTC timestamp | step line | gpu line | checkpoint list.
  printf '%s | %s | gpu=%s | ckpts=%s\n' \
    "$(date -u +%FT%TZ)" "${STEP_LINE:-no_step_yet}" "${GPU_LINE:-na}" "${CKPTS:-none}" \
    >> "$STATUS_LOG"

  sleep "$INTERVAL_S" || true
done

# The liveness check failed: note the end, then append the tail of the
# training log to the status log.
printf 'watch end %s pid=%s\n' "$(date -u +%FT%TZ)" "$PID" >> "$STATUS_LOG"
tail -n 40 "$TRAIN_LOG" >> "$STATUS_LOG" || true