CausalGrok / code /scripts /monitor_all.sh
nileshsarkar-ai's picture
Upload code/scripts
42c0d23 verified
#!/usr/bin/env bash
# Live status of every causalgrok_camelyon_v2 process and run dir.
# Survives SSH and is safe to call repeatedly.
#
# Usage:
# bash scripts/monitor_all.sh # one-shot snapshot
# watch -n 30 bash scripts/monitor_all.sh # auto-refresh every 30s
# Don't fail the whole script on any single subcommand error —
# we want a tolerant dashboard, not a strict pipeline.
set +e  # keep going when an individual probe fails

# Resolve the project root (the parent of the directory holding this script)
# and work from there: every path used further down is relative to it.
ROOT="$(cd -- "$(dirname -- "$0")/.." && pwd)"
if [ -z "${ROOT}" ] || ! cd -- "${ROOT}"; then
  # An empty ROOT would leave us in the caller's cwd and the report would
  # silently describe the wrong directory, so stop instead.
  printf 'monitor_all.sh: cannot resolve project root from %s\n' "$0" >&2
  exit 1
fi
# Print all arguments (joined by spaces) in ANSI bold, without a newline.
bold() {
  local -r on=$'\033[1m' off=$'\033[0m'
  printf '%s%s%s' "$on" "$*" "$off"
}
# Print all arguments (joined by spaces) in ANSI green, without a newline.
green() {
  local -r on=$'\033[32m' off=$'\033[0m'
  printf '%s%s%s' "$on" "$*" "$off"
}
# Print all arguments (joined by spaces) in ANSI red, without a newline.
red() {
  local -r on=$'\033[31m' off=$'\033[0m'
  printf '%s%s%s' "$on" "$*" "$off"
}
# Print all arguments (joined by spaces) in ANSI bright-black (gray),
# without a newline.
gray() {
  local -r on=$'\033[90m' off=$'\033[0m'
  printf '%s%s%s' "$on" "$*" "$off"
}
# Print the current UTC time as YYYY-MM-DDTHH:MM:SSZ.
stamp() {
  date -u '+%Y-%m-%dT%H:%M:%SZ'
}
# Header banner: title plus the UTC time at which this snapshot was taken.
# (The dash in the title is a real em dash; it was previously mis-encoded.)
banner_rule="═══════════════════════════════════════════════════════════════════════════════"
printf '%s\n' "${banner_rule}"
printf ' CausalGrok — all-runs monitor %s\n' "$(stamp)"
printf '%s\n' "${banner_rule}"
# 1. GPU snapshot

# gpu_snapshot
# Print one line per GPU with used/free/total memory (MB), utilisation (%)
# and temperature. Prints a placeholder instead when nvidia-smi is not on
# PATH, so the dashboard stays readable on hosts without the NVIDIA tools.
gpu_snapshot() {
  if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo " $(gray "(nvidia-smi not found)")"
    return 0
  fi
  # nvidia-smi's CSV output puts a blank after each comma; splitting on
  # ', *' keeps that blank out of the printed values.
  nvidia-smi --query-gpu=memory.used,memory.free,memory.total,utilization.gpu,temperature.gpu \
    --format=csv,noheader,nounits \
    | awk -F', *' '{printf " used=%sMB free=%sMB total=%sMB util=%s%% temp=%s°C\n", $1, $2, $3, $4, $5}'
}

echo
echo "$(bold "GPU")"
gpu_snapshot
# 2. All causalgrok_camelyon_v2 processes

# proc_field PID FIELD
# Print a single ps(1) output column (ppid, etime, pcpu, pmem, ...) for PID
# with blanks removed. Prints "?" when ps yields nothing, e.g. because the
# process exited between the pgrep call and this lookup. (A trailing
# `|| echo "?"` after the pipeline would never run: the pipeline's status is
# that of `tr`, which succeeds even on empty input.)
proc_field() {
  local value
  value=$(ps -o "$2=" -p "$1" 2>/dev/null | tr -d ' ')
  printf '%s\n' "${value:-?}"
}

# Every process whose full command line mentions the training entry point.
PIDS=$(pgrep -f "causalgrok_camelyon_v2" || true)
# grep -c prints 0 (and exits 1) on empty input, hence the `|| true`.
N_ACTIVE=$(echo "$PIDS" | grep -c . || true)
echo
echo "$(bold "Active training processes: ${N_ACTIVE}")"
if [ -z "$PIDS" ]; then
  echo " $(gray "(none)")"
else
  printf " %-9s %-12s %-8s %-6s %-6s %s\n" "PID" "ELAPSED" "DETACHED" "%CPU" "%MEM" "RUN_ID"
  # PIDS is a whitespace-separated list of numbers; splitting is intended.
  for pid in $PIDS; do
    ppid=$(proc_field "$pid" ppid)
    etime=$(proc_field "$pid" etime)
    pcpu=$(proc_field "$pid" pcpu)
    pmem=$(proc_field "$pid" pmem)
    # Run id = path component following experiments/runs/ on the command
    # line; keep only the first hit so the cell stays on one line.
    rid=$(ps -o cmd= -p "$pid" 2>/dev/null \
      | grep -oP 'experiments/runs/\K[^ ]+' | head -n 1)
    rid=${rid:-?}
    # A parent PID of 1 is reported as "detached"; anything else is flagged
    # in red together with the parent PID.
    if [ "$ppid" = "1" ]; then
      detached="$(green "yes")"
    else
      detached="$(red "PPID=$ppid")"
    fi
    # DETACHED is padded to 17 = 8 visible columns (as in the header) plus
    # the 9 bytes of colour escape codes wrapped around the text.
    printf " %-9s %-12s %-17s %-6s %-6s %s\n" "$pid" "$etime" "$detached" "$pcpu" "$pmem" "$rid"
  done
fi
# 3. Per-run progress (epoch, latest OOD, periodic ckpts)

# Runs that are not currently active are listed only when their id starts
# with this prefix. Defaults to the 2026-05-05 runs; set
# MONITOR_RUN_PREFIX in the environment to pick another day, or set it to
# the empty string to list every run.
MONITOR_RUN_PREFIX="${MONITOR_RUN_PREFIX-20260505-}"

# run_pid_alive PIDFILE
# Succeed when PIDFILE exists, holds a positive integer and `kill -0`
# accepts that PID. Anything else counts as not running.
run_pid_alive() {
  local pid
  [ -f "$1" ] || return 1
  pid=$(tr -d '[:space:]' <"$1" 2>/dev/null)
  # Only plain positive numbers: `kill -0 0` targets our own process group
  # and would always succeed.
  if ! [[ "$pid" =~ ^[1-9][0-9]*$ ]]; then
    return 1
  fi
  kill -0 "$pid" 2>/dev/null
}

# epoch_pct LAST TOTAL
# Print LAST/TOTAL as a rounded percentage ("50%"), or "?" when either
# value is missing, non-numeric or zero (avoids a division by zero).
epoch_pct() {
  local last=$1 total=$2
  if [[ "$last" =~ ^[0-9]+$ && "$total" =~ ^[0-9]+$ ]] \
    && ((10#$last > 0 && 10#$total > 0)); then
    # Values go in via -v, not spliced into the awk program text.
    awk -v cur="$last" -v total="$total" 'BEGIN { printf "%.0f%%\n", (cur / total) * 100 }'
  else
    echo "?"
  fi
}

# count_ckpts DIR
# Print how many entries in DIR match ep*.pt (0 when DIR is missing).
count_ckpts() {
  local f n=0
  for f in "$1"/ep*.pt; do
    # An unmatched glob stays literal; the existence test filters it out.
    if [ -e "$f" ]; then
      n=$((n + 1))
    fi
  done
  echo "$n"
}

# best_ood_of HISTORY_JSON
# Print the largest numeric "ood_acc" found among the records of
# HISTORY_JSON, formatted with three decimals. Prints "?" when the file
# cannot be parsed, holds no such value, or python3 is unavailable.
# The path is handed over as an argument, so quotes in it cannot break the
# Python snippet.
best_ood_of() {
  local best
  best=$(python3 -c '
import json, sys
try:
    with open(sys.argv[1]) as fh:
        history = json.load(fh)
    oods = [r["ood_acc"] for r in history
            if isinstance(r, dict) and isinstance(r.get("ood_acc"), (int, float))]
    print(f"{max(oods):.3f}" if oods else "?")
except Exception:
    print("?")
' "$1" 2>/dev/null)
  printf '%s\n' "${best:-?}"
}

echo
echo "$(bold "Per-run progress")"
echo " (epoch ← latest train.log; ckpts ← ep*.pt count; best_ood ← latest history.json)"
echo
printf " %-46s %-7s %-9s %-8s %-8s %s\n" "RUN_ID" "EPOCH" "% DONE" "CKPTS" "BEST_OOD" "LATEST"
for d in experiments/runs/*/; do
  d=${d%/}
  rid=${d##*/}
  log="$d/logs/train.log"
  # Directories without a training log are not runs (this also skips the
  # literal pattern left behind when the glob matches nothing).
  [ -f "$log" ] || continue

  # Show a run only when it is still running or matches the prefix filter.
  is_active="no"
  if run_pid_alive "$d/run.pid"; then
    is_active="yes"
  fi
  if [ "$is_active" = "no" ] && [[ "$rid" != "${MONITOR_RUN_PREFIX}"* ]]; then
    continue
  fi

  # Latest epoch: the last "ep <N>" occurrence in the training log.
  last_ep=$(grep -oP "ep\s+\K[0-9]+" "$log" 2>/dev/null | tail -n 1)
  last_ep=${last_ep:-0}
  # Total epochs: first number after "Camelyon17 v2 | " in the log.
  total_ep=$(grep -oP "Camelyon17 v2 \| \K[0-9]+" "$log" 2>/dev/null | head -n 1)
  pct=$(epoch_pct "$last_ep" "$total_ep")

  ckpts=$(count_ckpts "$d/checkpoints")

  best_ood="?"
  if [ -f "$d/results/history.json" ]; then
    best_ood=$(best_ood_of "$d/results/history.json")
  fi

  # Last per-epoch line, with the "ep N |" prefix and everything from
  # "| ‖W‖" onwards removed and runs of blanks squeezed.
  latest=$(grep -E "ep\s+[0-9]+ \| tr" "$log" 2>/dev/null | tail -n 1 \
    | sed -E 's/^.*ep\s+[0-9]+ \|//; s/\| ‖W‖.*//' | tr -s ' ')

  # Icon: ● still running, ✓ stopped at the final epoch, ✗ stopped early.
  state_icon="$(green "●")"
  if [ "$is_active" = "no" ]; then
    if [ "$last_ep" = "$total_ep" ]; then
      state_icon="$(green "✓")"
    else
      state_icon="$(red "✗")"
    fi
  fi
  printf " %s %-44s %-7s %-9s %-8s %-8s %s\n" \
    "$state_icon" "$rid" "$last_ep" "$pct" "$ckpts" "$best_ood" "${latest:0:60}"
done
# 4. Disk pressure
# Total size of the runs directory (nothing is printed when du fails),
# followed by a hint on how to auto-refresh the dashboard.
printf '\n'
bold "Disk usage on experiments/runs"
printf '\n'
du -sh experiments/runs/ 2>/dev/null | while read -r du_size du_path; do
  printf ' %s (%s)\n' "$du_size" "$du_path"
done
printf '\n'
gray "Re-run with: watch -n 30 bash scripts/monitor_all.sh"
printf '\n'