File size: 5,079 Bytes
42c0d23 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | #!/usr/bin/env bash
# Live status of every causalgrok_camelyon_v2 process and run dir.
# Survives SSH and is safe to call repeatedly.
#
# Usage:
# bash scripts/monitor_all.sh # one-shot snapshot
# watch -n 30 bash scripts/monitor_all.sh # auto-refresh every 30s
# Don't fail the whole script on any single subcommand error —
# we want a tolerant dashboard, not a strict pipeline.
set +e
# The project root is the parent of the directory that holds this script.
ROOT="$(cd -- "$(dirname -- "$0")/.." && pwd)"
# The run directories are addressed relative to ROOT. If ROOT cannot be
# resolved or entered, stop: carrying on would silently report on whatever
# directory the caller happened to be in (an empty ROOT makes `cd ""` a no-op).
if [ -z "${ROOT}" ] || ! cd -- "${ROOT}"; then
  printf 'monitor_all: cannot enter project root (resolved from %s)\n' "$0" >&2
  exit 1
fi
# bold TEXT...
# Write the arguments, joined the way "$*" joins them, between the ANSI
# "bold on" (ESC[1m) and "reset" (ESC[0m) sequences. No trailing newline.
bold() {
  local text="$*"
  printf '%s%s%s' $'\033[1m' "${text}" $'\033[0m'
}
# green TEXT...
# Write the arguments, joined the way "$*" joins them, between the ANSI
# green-foreground (ESC[32m) and "reset" (ESC[0m) sequences. No trailing newline.
green() {
  local text="$*"
  printf '%s%s%s' $'\033[32m' "${text}" $'\033[0m'
}
# red TEXT...
# Write the arguments, joined the way "$*" joins them, between the ANSI
# red-foreground (ESC[31m) and "reset" (ESC[0m) sequences. No trailing newline.
red() {
  local text="$*"
  printf '%s%s%s' $'\033[31m' "${text}" $'\033[0m'
}
# gray TEXT...
# Write the arguments, joined the way "$*" joins them, between the ANSI
# escape ESC[90m and the "reset" sequence ESC[0m. No trailing newline.
gray() {
  local text="$*"
  printf '%s%s%s' $'\033[90m' "${text}" $'\033[0m'
}
# stamp
# Print the current UTC time as YYYY-MM-DDTHH:MM:SS followed by a literal "Z".
stamp() {
  date -u '+%Y-%m-%dT%H:%M:%SZ'
}
# Report banner: a horizontal rule, the title line carrying the current
# timestamp from stamp(), and the same rule again — one line each.
printf '%s\n' \
  "βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ" \
  " CausalGrok β all-runs monitor $(stamp)" \
  "βββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββββ"
# 1. GPU snapshot
# gpu_rows
# Read CSV rows from stdin in the column order queried below (memory.used,
# memory.free, memory.total, utilization.gpu, temperature.gpu) and print one
# indented, labelled line per row.
gpu_rows() {
  # Split on the comma plus any blanks after it: with a bare ',' separator the
  # blank that follows each comma in the CSV ends up inside the value
  # ("free= 123MB").
  awk -F', *' '{printf " used=%sMB free=%sMB total=%sMB util=%s%% temp=%s°C\n", $1, $2, $3, $4, $5}'
}
echo
echo "$(bold "GPU")"
if command -v nvidia-smi >/dev/null 2>&1; then
  nvidia-smi --query-gpu=memory.used,memory.free,memory.total,utilization.gpu,temperature.gpu \
    --format=csv,noheader,nounits | gpu_rows
else
  # No NVIDIA tooling on this host: say so in the report instead of letting
  # the shell's "command not found" stand in for the section.
  echo " $(gray "(nvidia-smi not available)")"
fi
# 2. All causalgrok_camelyon_v2 processes
# proc_row PID
# Print one table row for PID: pid, elapsed time, detached flag, %CPU, %MEM and
# the run id taken from the first "experiments/runs/<id>" in its command line.
# Any value ps cannot supply (e.g. the process exited meanwhile) prints as "?".
proc_row() {
  local pid=$1 ppid="" etime="" pcpu="" pmem="" rid detached
  # One ps call for the numeric columns; each column gets its own -o so the
  # empty header applies to that column only.
  read -r ppid etime pcpu pmem \
    < <(ps -o ppid= -o etime= -o pcpu= -o pmem= -p "$pid" 2>/dev/null)
  # head keeps the row on one line when the command line names several paths.
  rid=$(ps -o cmd= -p "$pid" 2>/dev/null \
    | grep -oP 'experiments/runs/\K[^ ]+' 2>/dev/null | head -n 1)
  # Defaults are applied to the captured values: a `|| echo "?"` placed after a
  # pipeline only sees the status of its last stage and would never fire.
  ppid=${ppid:-?}
  etime=${etime:-?}
  pcpu=${pcpu:-?}
  pmem=${pmem:-?}
  rid=${rid:-?}
  if [ "$ppid" = "1" ]; then
    detached="$(green "yes")"
  else
    detached="$(red "PPID=$ppid")"
  fi
  # DETACHED is 17 wide here versus 8 in the header: the value carries 9
  # bytes of colour escapes that take no space on screen.
  printf " %-9s %-12s %-17s %-6s %-6s %s\n" "$pid" "$etime" "$detached" "$pcpu" "$pmem" "$rid"
}
PIDS=$(pgrep -f "causalgrok_camelyon_v2" || true)
# grep -c prints 0 (with a non-zero status) when PIDS is empty.
N_ACTIVE=$(echo "$PIDS" | grep -c . || true)
echo
echo "$(bold "Active training processes: ${N_ACTIVE}")"
if [ -z "$PIDS" ]; then
  echo " $(gray "(none)")"
else
  printf " %-9s %-12s %-8s %-6s %-6s %s\n" "PID" "ELAPSED" "DETACHED" "%CPU" "%MEM" "RUN_ID"
  # PIDS is a whitespace-separated list of numeric ids; splitting is intended.
  # shellcheck disable=SC2086
  for pid in $PIDS; do
    proc_row "$pid"
  done
fi
# 3. Per-run progress (epoch, latest OOD, periodic ckpts)

# run_is_active DIR
# Succeed when DIR/run.pid holds a positive integer PID that `kill -0` accepts.
run_is_active() {
  local pid_file="$1/run.pid" pid
  [ -f "$pid_file" ] || return 1
  pid=$(tr -d ' ' < "$pid_file" 2>/dev/null)
  # Accept plain positive integers only: 0 or a negative value makes kill
  # address process groups rather than one process, so a stale or corrupt
  # file would report the run as alive.
  [[ "$pid" =~ ^[1-9][0-9]*$ ]] || return 1
  kill -0 "$pid" 2>/dev/null
}

# epoch_pct DONE TOTAL
# Print DONE/TOTAL as a rounded percentage such as "50%", or "?" unless both
# are positive integers (this also avoids a division by zero).
epoch_pct() {
  local done_ep=$1 total_ep=$2
  if [[ "$done_ep" =~ ^[0-9]+$ && "$total_ep" =~ ^[0-9]+$ ]] \
    && [ "$done_ep" -gt 0 ] && [ "$total_ep" -gt 0 ]; then
    awk -v d="$done_ep" -v t="$total_ep" 'BEGIN { printf "%.0f%%\n", (d / t) * 100 }'
  else
    echo "?"
  fi
}

# count_ckpts DIR
# Print the number of DIR/checkpoints/ep*.pt entries, counted with a glob
# rather than by parsing `ls` output.
count_ckpts() {
  local n=0 f
  for f in "$1/checkpoints/"ep*.pt; do
    # An unmatched glob stays literal; count only entries that exist.
    if [ -e "$f" ] || [ -L "$f" ]; then
      n=$((n + 1))
    fi
  done
  echo "$n"
}

# best_ood_of FILE
# Print the largest numeric "ood_acc" among the records of the JSON list in
# FILE with three decimals, or "?" when FILE is missing, unreadable, or has no
# such value.
best_ood_of() {
  local out
  [ -f "$1" ] || { echo "?"; return 0; }
  # The path travels as argv, never spliced into the Python source, so quotes
  # in a directory name cannot break or alter the program.
  out=$(python3 -c '
import json, sys
try:
    with open(sys.argv[1]) as fh:
        history = json.load(fh)
    oods = [r["ood_acc"] for r in history if isinstance(r.get("ood_acc"), (int, float))]
    print(f"{max(oods):.3f}" if oods else "?")
except Exception:
    print("?")
' "$1" 2>/dev/null)
  echo "${out:-?}"
}

# run_row DIR
# Print the progress row for one run directory. Prints nothing when DIR has no
# logs/train.log, or when the run is not active and its id does not start with
# the "recent" prefix. That prefix defaults to 20260505- and can be changed
# with MONITOR_RUN_PREFIX (set it empty to list every run that has a log).
run_row() {
  local d=${1%/} prefix="${MONITOR_RUN_PREFIX-20260505-}"
  local rid log is_active last_ep total_ep pct ckpts best latest icon
  rid=$(basename "$d")
  log="$d/logs/train.log"
  [ -f "$log" ] || return 0
  is_active="no"
  if run_is_active "$d"; then
    is_active="yes"
  fi
  # only show recent or running ones
  if [ "$is_active" = "no" ] && [[ "$rid" != "$prefix"* ]]; then
    return 0
  fi
  # latest epoch from train.log
  last_ep=$(grep -oP "ep\s+\K[0-9]+" "$log" 2>/dev/null | tail -n 1)
  last_ep=${last_ep:-0}
  # total epochs from the log header (empty when the header is absent)
  total_ep=$(grep -oP "Camelyon17 v2 \| \K[0-9]+" "$log" 2>/dev/null | head -n 1)
  pct=$(epoch_pct "$last_ep" "$total_ep")
  ckpts=$(count_ckpts "$d")
  best=$(best_ood_of "$d/results/history.json")
  # latest line condensed
  latest=$(grep -E "ep\s+[0-9]+ \| tr" "$log" 2>/dev/null | tail -n 1 \
    | sed -E 's/^.*ep\s+[0-9]+ \|//; s/\| βWβ.*//' | tr -s ' ')
  # Green while active or when the last epoch equals the total; red otherwise.
  icon="$(green "β")"
  if [ "$is_active" = "no" ]; then
    if [ "$last_ep" = "$total_ep" ]; then
      icon="$(green "β")"
    else
      icon="$(red "β")"
    fi
  fi
  printf " %s %-44s %-7s %-9s %-8s %-8s %s\n" \
    "$icon" "$rid" "$last_ep" "$pct" "$ckpts" "$best" "${latest:0:60}"
}

echo
echo "$(bold "Per-run progress")"
echo " (epoch β latest train.log; ckpts β ep*.pt count; best_ood β latest history.json)"
echo
printf " %-46s %-7s %-9s %-8s %-8s %s\n" "RUN_ID" "EPOCH" "% DONE" "CKPTS" "BEST_OOD" "LATEST"
for d in experiments/runs/*/; do
  run_row "$d"
done
# 4. Disk pressure
# Blank line, bold heading, then the summarised size of experiments/runs/
# as " <size> (<path>)"; nothing is printed for the size if du fails.
printf '\n%s\n' "$(bold "Disk usage on experiments/runs")"
du -sh experiments/runs/ 2>/dev/null | awk '{ printf " %s (%s)\n", $1, $2 }'
# Closing hint, in gray, showing how to keep the dashboard refreshing.
printf '\n%s\n' "$(gray "Re-run with: watch -n 30 bash scripts/monitor_all.sh")"
|