#!/usr/bin/env bash
#
# One-shot status report for the training runs kept under experiments/runs/.
# Prints, in order: a timestamped banner, GPU memory/utilisation/temperature
# (via nvidia-smi), the live processes whose command line matches
# "causalgrok_camelyon_v2", a per-run progress table, and the disk usage of
# experiments/runs/.
#
# Usage (path taken from the hint the script prints at the end):
#   bash scripts/monitor_all.sh

# errexit is switched off explicitly: several probes below (pgrep, grep -c,
# ps, ...) return non-zero in perfectly normal situations and must not abort
# the report.
set +e

# Everything below uses paths relative to ROOT, the parent of the directory
# that holds this script. Stop early if it cannot be entered, otherwise the
# relative globs would silently inspect the wrong directory.
ROOT="$(cd -- "$(dirname -- "$0")/.." && pwd)"
if [[ -z "${ROOT}" ]] || ! cd -- "${ROOT}"; then
  printf 'error: cannot enter repository root derived from %s\n' "$0" >&2
  exit 1
fi

# bold TEXT... — print the arguments (joined by "$*") wrapped in the ANSI bold
# escape sequence, followed by a reset. No trailing newline is written.
bold() {
  printf '\033[1m%s\033[0m' "$*"
}
# green TEXT... — print the arguments (joined by "$*") in ANSI colour 32
# (green), followed by a reset. No trailing newline is written.
green() {
  printf '\033[32m%s\033[0m' "$*"
}
# red TEXT... — print the arguments (joined by "$*") in ANSI colour 31 (red),
# followed by a reset. No trailing newline is written.
red() {
  printf '\033[31m%s\033[0m' "$*"
}
# gray TEXT... — print the arguments (joined by "$*") in ANSI colour 90
# (bright black), followed by a reset. No trailing newline is written.
gray() {
  printf '\033[90m%s\033[0m' "$*"
}

# stamp — print the current UTC time as YYYY-MM-DDTHH:MM:SSZ.
# (%Y-%m-%d and %H:%M:%S are the long forms of %F and %T.)
stamp() {
  date -u '+%Y-%m-%dT%H:%M:%SZ'
}

# show_banner — print the report title between two horizontal rules.
# Calls stamp (defined in this script) for the timestamp.
# NOTE(review): the rule glyph and the dash in the title were restored from a
# mis-encoded original ("═" and "—" are the presumed characters) — confirm.
show_banner() {
  local rule
  # Build the 79-column rule once instead of repeating a long literal.
  rule=$(printf '═%.0s' {1..79})
  echo "${rule}"
  echo " CausalGrok — all-runs monitor $(stamp)"
  echo "${rule}"
}
show_banner

# show_gpu — print one line per GPU with used/free/total memory (MB),
# utilisation (%) and temperature, as reported by nvidia-smi.
# Calls bold and gray (defined in this script) for styling.
# Prints a placeholder instead of an empty section when nvidia-smi is missing
# or its query fails.
show_gpu() {
  local gpu_csv

  echo
  echo "$(bold "GPU")"

  if ! command -v nvidia-smi >/dev/null 2>&1; then
    echo " $(gray "(nvidia-smi not available)")"
    return 0
  fi

  if ! gpu_csv=$(nvidia-smi \
      --query-gpu=memory.used,memory.free,memory.total,utilization.gpu,temperature.gpu \
      --format=csv,noheader,nounits 2>/dev/null); then
    echo " $(gray "(nvidia-smi query failed)")"
    return 0
  fi

  # The CSV fields are separated by ", "; splitting on ', *' keeps the blank
  # out of the values (plain ',' would print "free= 123MB").
  awk -F', *' \
    '{ printf " used=%sMB free=%sMB total=%sMB util=%s%% temp=%s°C\n", $1, $2, $3, $4, $5 }' \
    <<<"${gpu_csv}"
}
show_gpu

# show_active_processes — list the live processes whose full command line
# matches "causalgrok_camelyon_v2" (pgrep -f).
#
# Globals written: PIDS (newline-separated PIDs), N_ACTIVE (their count).
# Calls bold, gray, green and red (defined in this script) for styling.
#
# Per process it prints PID, elapsed time, whether the parent is PID 1
# ("detached"), %CPU, %MEM and the run id, i.e. the path component that
# follows "experiments/runs/" in the command line. Any value that cannot be
# read (for instance because the process exited meanwhile) is shown as "?".
show_active_processes() {
  # Kept in a variable because an unquoted blank inside [[ =~ ]] would split
  # the pattern.
  local -r run_re='experiments/runs/([^ ]+)'
  local pid ppid etime pcpu pmem cmd rid detached

  PIDS=$(pgrep -f "causalgrok_camelyon_v2" || true)
  # grep -c exits 1 when the count is 0; that is not an error here.
  N_ACTIVE=$(grep -c . <<<"${PIDS}" || true)

  echo
  echo "$(bold "Active training processes: ${N_ACTIVE}")"
  if [[ -z "${PIDS}" ]]; then
    echo " $(gray "(none)")"
    return 0
  fi

  printf " %-9s %-12s %-8s %-6s %-6s %s\n" \
    "PID" "ELAPSED" "DETACHED" "%CPU" "%MEM" "RUN_ID"

  while IFS= read -r pid; do
    [[ -n "${pid}" ]] || continue

    # One ps call per process gives a consistent snapshot; the command line
    # comes last so it may contain blanks.
    ppid="" etime="" pcpu="" pmem="" cmd=""
    read -r ppid etime pcpu pmem cmd \
      < <(ps -o ppid= -o etime= -o pcpu= -o pmem= -o args= -p "${pid}" 2>/dev/null)

    if [[ "${cmd}" =~ ${run_re} ]]; then
      rid=${BASH_REMATCH[1]}
    else
      rid="?"
    fi

    # Pad the plain text to the column width first and colour it afterwards,
    # so the layout does not depend on the length of the escape sequences.
    if [[ "${ppid}" == "1" ]]; then
      detached=$(green "$(printf '%-8s' "yes")")
    else
      detached=$(red "$(printf '%-8s' "PPID=${ppid:-?}")")
    fi

    printf " %-9s %-12s %s %-6s %-6s %s\n" \
      "${pid}" "${etime:-?}" "${detached}" "${pcpu:-?}" "${pmem:-?}" "${rid}"
  done <<<"${PIDS}"
}
show_active_processes

# show_run_progress — print one table row per run directory under
# experiments/runs/ that has a logs/train.log.
#
# A run is "active" when run.pid holds the PID of a process that can be
# signalled (kill -0). Inactive runs are listed only if their id starts with
# the prefix in MONITOR_RUN_PREFIX (default "20260505-"; set it to the empty
# string to list every run).
#
# Columns:
#   EPOCH     last number following "ep" in train.log
#   % DONE    EPOCH relative to the number following "Camelyon17 v2 | "
#             (taken to be the total epoch count — confirm against the
#             trainer's log format)
#   CKPTS     number of checkpoints/ep*.pt files
#   BEST_OOD  largest numeric "ood_acc" among the entries of
#             results/history.json (needs python3; "?" when unavailable)
#   LATEST    tail of the last "ep N | tr ..." log line, cut at "| ‖W‖"
#
# Calls bold, green and red (defined in this script) for styling.
# NOTE(review): the arrows in the legend, the status icons and the "‖W‖"
# marker were restored from a mis-encoded original — confirm the marker
# against a real train.log.
show_run_progress() {
  local -r keep_prefix="${MONITOR_RUN_PREFIX-20260505-}"
  # The file path is passed as argv[1] rather than pasted into the source, so
  # quotes in a path cannot break (or inject into) the snippet.
  local -r py_best_ood='
import json, sys
try:
    with open(sys.argv[1]) as fh:
        history = json.load(fh)
    oods = [r["ood_acc"] for r in history
            if isinstance(r, dict) and isinstance(r.get("ood_acc"), (int, float))]
    print(f"{max(oods):.3f}" if oods else "?")
except Exception:
    print("?")
'
  local d rid log run_pid is_active hit last_ep total_ep pct
  local ckpt ckpts best_ood latest state_icon

  echo
  echo "$(bold "Per-run progress")"
  echo " (epoch ← latest train.log; ckpts ← ep*.pt count; best_ood ← latest history.json)"
  echo

  printf " %-46s %-7s %-9s %-8s %-8s %s\n" \
    "RUN_ID" "EPOCH" "% DONE" "CKPTS" "BEST_OOD" "LATEST"

  for d in experiments/runs/*/; do
    d=${d%/}
    rid=${d##*/}
    log="${d}/logs/train.log"
    # Also skips the literal pattern left behind when the glob matches nothing.
    [[ -f "${log}" ]] || continue

    # --- liveness ---------------------------------------------------------
    run_pid=""
    if [[ -f "${d}/run.pid" ]]; then
      { IFS= read -r run_pid < "${d}/run.pid"; } 2>/dev/null
      run_pid=${run_pid//[[:space:]]/}
    fi
    is_active="no"
    # Only a positive integer is handed to kill: "0" or a negative value
    # would address process groups and always "succeed".
    if [[ "${run_pid}" =~ ^[1-9][0-9]*$ ]] && kill -0 "${run_pid}" 2>/dev/null; then
      is_active="yes"
    fi
    if [[ "${is_active}" == "no" && "${rid}" != "${keep_prefix}"* ]]; then
      continue
    fi

    # --- epochs -----------------------------------------------------------
    hit=$(grep -oE 'ep[[:space:]]+[0-9]+' "${log}" 2>/dev/null | tail -n 1)
    last_ep=${hit##*[!0-9]}
    last_ep=${last_ep:-0}

    hit=$(grep -oE 'Camelyon17 v2 \| [0-9]+' "${log}" 2>/dev/null | head -n 1)
    total_ep=${hit##* }

    pct="?"
    # 10# forces base 10 so values such as "08" are not read as octal; the
    # total must be positive to avoid a division by zero.
    if [[ -n "${total_ep}" ]] && (( 10#${total_ep} > 0 && 10#${last_ep} > 0 )); then
      pct=$(awk -v cur="${last_ep}" -v total="${total_ep}" \
        'BEGIN { printf "%.0f", (cur / total) * 100 }')
      pct="${pct}%"
    fi

    # --- checkpoints ------------------------------------------------------
    ckpts=0
    for ckpt in "${d}"/checkpoints/ep*.pt; do
      # -e filters out the unexpanded pattern when nothing matches.
      if [[ -e "${ckpt}" ]]; then
        ckpts=$((ckpts + 1))
      fi
    done

    # --- best OOD accuracy ------------------------------------------------
    best_ood="?"
    if [[ -f "${d}/results/history.json" ]]; then
      best_ood=$(python3 -c "${py_best_ood}" "${d}/results/history.json" 2>/dev/null)
      best_ood=${best_ood:-?}
    fi

    # --- latest log line --------------------------------------------------
    latest=$(grep -E 'ep[[:space:]]+[0-9]+ \| tr' "${log}" 2>/dev/null \
      | tail -n 1 \
      | sed -E 's/^.*ep[[:space:]]+[0-9]+ \|//; s/\| ‖W‖.*//' \
      | tr -s ' ')

    # --- status icon: running / finished / stopped early -------------------
    if [[ "${is_active}" == "yes" ]]; then
      state_icon=$(green "●")
    elif [[ "${last_ep}" == "${total_ep}" ]]; then
      state_icon=$(green "✓")
    else
      state_icon=$(red "✗")
    fi

    printf " %s %-44s %-7s %-9s %-8s %-8s %s\n" \
      "${state_icon}" "${rid}" "${last_ep}" "${pct}" "${ckpts}" "${best_ood}" "${latest:0:60}"
  done
}
show_run_progress

# show_disk_usage — print the total size of experiments/runs/ as reported by
# `du -sh`, formatted as " <size> (<path>)".
# Calls bold and gray (defined in this script) for styling.
# Prints a placeholder when the directory does not exist, instead of a header
# with nothing underneath.
show_disk_usage() {
  echo
  echo "$(bold "Disk usage on experiments/runs")"
  if [[ ! -d experiments/runs ]]; then
    echo " $(gray "(experiments/runs not found)")"
    return 0
  fi
  du -sh experiments/runs/ 2>/dev/null | awk '{ print " " $1 " (" $2 ")" }'
}
show_disk_usage

# show_rerun_hint — print how to refresh this report periodically.
# Calls gray (defined in this script) for styling.
# The hint uses `watch -c`: this script emits ANSI colour sequences, and watch
# only interprets them when --color/-c is given.
show_rerun_hint() {
  echo
  echo "$(gray "Re-run with: watch -c -n 30 bash scripts/monitor_all.sh")"
}
show_rerun_hint