File size: 5,079 Bytes
42c0d23
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#!/usr/bin/env bash
# Live status of every causalgrok_camelyon_v2 process and run dir.
# Survives SSH and is safe to call repeatedly.
#
# Usage:
#   bash scripts/monitor_all.sh           # one-shot snapshot
#   watch -n 30 bash scripts/monitor_all.sh   # auto-refresh every 30s

# Don't fail the whole script on any single subcommand error —
# we want a tolerant dashboard, not a strict pipeline.
set +e

# Resolve the project root (the parent of this script's directory) and work
# from there, since every path below is relative to it. CDPATH is cleared so
# that cd cannot jump elsewhere and echo the directory into ROOT.
# This is the one failure treated as fatal: from the wrong cwd every later
# section would quietly report "nothing found".
ROOT="$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)"
if [ -z "${ROOT}" ] || ! cd -- "${ROOT}"; then
    printf 'monitor_all: cannot locate project root from %s\n' "$0" >&2
    exit 1
fi

# ANSI styling helpers. Each prints all of its arguments as one string ("$*"),
# wrapped in the given SGR code and followed by a reset; no trailing newline.
_sgr() {
    local code=$1
    shift
    printf '\033[%sm%s\033[0m' "$code" "$*"
}

bold()  { _sgr 1 "$@"; }
green() { _sgr 32 "$@"; }
red()   { _sgr 31 "$@"; }
gray()  { _sgr 90 "$@"; }

# Print the current UTC time as YYYY-MM-DDTHH:MM:SSZ.
stamp() {
    date -u '+%FT%TZ'
}

# Banner: dashboard title plus the UTC time this snapshot was taken.
echo "═══════════════════════════════════════════════════════════════════════════════"
echo "  CausalGrok — all-runs monitor   $(stamp)"
echo "═══════════════════════════════════════════════════════════════════════════════"

# 1. GPU snapshot
# One line per GPU: memory used/free/total, utilisation and temperature.
echo
echo "$(bold "GPU")"
if command -v nvidia-smi >/dev/null 2>&1; then
    # nvidia-smi separates CSV fields with ", ", so split on the comma plus any
    # following blanks; otherwise every value after the first is printed with
    # a leading space ("free= 123MB").
    nvidia-smi --query-gpu=memory.used,memory.free,memory.total,utilization.gpu,temperature.gpu \
               --format=csv,noheader,nounits | \
        awk -F', *' '{printf "    used=%sMB free=%sMB total=%sMB util=%s%% temp=%s°C\n", $1, $2, $3, $4, $5}'
else
    # Keep the dashboard usable on hosts without the NVIDIA tools.
    echo "    $(gray "(nvidia-smi not available)")"
fi

# 2. All causalgrok_camelyon_v2 processes

# ps_field PID FIELD
#   Print one ps(1) output column for PID with blanks removed, or "?" when ps
#   yields nothing (for example the process exited after pgrep listed it).
#   The fallback is applied to the captured value: a trailing `|| echo "?"`
#   after `ps | tr` never fires, because the pipeline's status is tr's.
ps_field() {
    local value
    value=$(ps -o "$2=" -p "$1" 2>/dev/null | tr -d ' ')
    printf '%s\n' "${value:-?}"
}

# pgrep -f matches the pattern against each process's full command line.
PIDS=$(pgrep -f "causalgrok_camelyon_v2")
# Count non-empty lines; grep -c still prints 0 when there are none.
N_ACTIVE=$(grep -c . <<<"$PIDS")

echo
echo "$(bold "Active training processes: ${N_ACTIVE}")"
if [ -z "$PIDS" ]; then
    echo "    $(gray "(none)")"
else
    printf "    %-9s %-12s %-8s %-6s %-6s %s\n" "PID" "ELAPSED" "DETACHED" "%CPU" "%MEM" "RUN_ID"
    while IFS= read -r pid; do
        [ -n "$pid" ] || continue
        ppid=$(ps_field "$pid" ppid)
        etime=$(ps_field "$pid" etime)
        pcpu=$(ps_field "$pid" pcpu)
        pmem=$(ps_field "$pid" pmem)
        # Run id = whatever follows "experiments/runs/" in the command line;
        # keep only the first occurrence so the table row stays on one line.
        rid=$(ps -o cmd= -p "$pid" 2>/dev/null | grep -oP 'experiments/runs/\K[^ ]+' | head -n 1)
        rid=${rid:-?}
        # A parent PID of 1 is reported as "detached"; anything else shows the parent.
        if [ "$ppid" = "1" ]; then
            detached="$(green "yes")"
        else
            detached="$(red "PPID=$ppid")"
        fi
        # %-17s: the 8-column DETACHED header plus the 9 invisible bytes of
        # colour escape codes wrapped around the value.
        printf "    %-9s %-12s %-17s %-6s %-6s %s\n" "$pid" "$etime" "$detached" "$pcpu" "$pmem" "$rid"
    done <<<"$PIDS"
fi

# 3. Per-run progress (epoch, latest OOD, periodic ckpts)

# Finished runs are listed only when their id starts with this prefix; runs
# that are still alive are always listed. Defaults to the 20260505- batch.
# Override with MONITOR_RUN_PREFIX; set it to the empty string to list every run.
RUN_PREFIX="${MONITOR_RUN_PREFIX-20260505-}"

# _run_pct LAST TOTAL
#   Print LAST/TOTAL as a whole-number percentage ("50%"), or "?" when either
#   value is missing, non-numeric or zero (which also avoids dividing by zero).
_run_pct() {
    local last=$1 total=$2
    if [[ "$last" =~ ^[0-9]+$ && "$total" =~ ^[0-9]+$ ]] \
        && [ "$last" -gt 0 ] && [ "$total" -gt 0 ]; then
        # Values go in via -v rather than being pasted into the awk program.
        awk -v last="$last" -v total="$total" 'BEGIN { printf "%.0f%%\n", (last / total) * 100 }'
    else
        echo "?"
    fi
}

# _run_ckpt_count RUN_DIR
#   Print how many RUN_DIR/checkpoints/ep*.pt entries exist (0 if none).
_run_ckpt_count() {
    local -a files
    files=( "$1"/checkpoints/ep*.pt )
    # An unmatched glob stays literal, so confirm the first entry really exists.
    if [ -e "${files[0]}" ]; then
        echo "${#files[@]}"
    else
        echo 0
    fi
}

# _run_best_ood HISTORY_JSON
#   Print the largest numeric 'ood_acc' among the records of HISTORY_JSON with
#   three decimals, or "?" if the file cannot be read/parsed, has no such
#   value, or python3 is unavailable. The path is passed as an argument, not
#   spliced into the Python source, so quotes in it cannot break the snippet.
_run_best_ood() {
    local best
    best=$(python3 -c '
import json, sys
try:
    with open(sys.argv[1]) as fh:
        history = json.load(fh)
    oods = [r["ood_acc"] for r in history
            if isinstance(r, dict) and isinstance(r.get("ood_acc"), (int, float))]
    print(f"{max(oods):.3f}" if oods else "?")
except Exception:
    print("?")
' "$1" 2>/dev/null)
    echo "${best:-?}"
}

echo
echo "$(bold "Per-run progress")"
echo "    (epoch ← latest train.log; ckpts ← ep*.pt count; best_ood ← latest history.json)"
echo

printf "    %-46s %-7s %-9s %-8s %-8s %s\n" "RUN_ID" "EPOCH" "% DONE" "CKPTS" "BEST_OOD" "LATEST"
for d in experiments/runs/*/; do
    d=${d%/}
    rid=${d##*/}
    log="$d/logs/train.log"
    # Directories without a training log are not runs we can report on.
    [ -f "$log" ] || continue

    # A run is "active" when run.pid holds a positive PID that we can signal.
    # The numeric check keeps values such as "0" or "-1" away from kill, which
    # would otherwise address a process group and falsely succeed.
    pid_in_dir=""
    if [ -f "$d/run.pid" ]; then
        read -r pid_in_dir _ 2>/dev/null < "$d/run.pid"
    fi
    is_active="no"
    if [[ "$pid_in_dir" =~ ^[1-9][0-9]*$ ]] && kill -0 "$pid_in_dir" 2>/dev/null; then
        is_active="yes"
    fi
    # Only show runs that are still alive or belong to the selected batch.
    if [ "$is_active" = "no" ] && [[ "$rid" != "$RUN_PREFIX"* ]]; then
        continue
    fi

    # Latest epoch: the last "ep <N>" occurrence in train.log (0 if none yet).
    last_ep=$(grep -oP "ep\s+\K[0-9]+" "$log" 2>/dev/null | tail -n 1)
    last_ep=${last_ep:-0}

    # Total epochs: the first number following "Camelyon17 v2 | " in the log.
    total_ep=$(grep -oP "Camelyon17 v2 \| \K[0-9]+" "$log" 2>/dev/null | head -n 1)
    total_ep=${total_ep:-?}

    pct=$(_run_pct "$last_ep" "$total_ep")
    ckpts=$(_run_ckpt_count "$d")

    # Best OOD accuracy from history.json (max of the ood_acc field).
    best_ood="?"
    if [ -f "$d/results/history.json" ]; then
        best_ood=$(_run_best_ood "$d/results/history.json")
    fi

    # Latest epoch line, condensed: drop everything up to "ep <N> |" and
    # everything from "| ‖W‖" onwards, then squeeze repeated spaces.
    latest=$(grep -E "ep\s+[0-9]+ \| tr" "$log" 2>/dev/null | tail -n 1 | \
             sed -E 's/^.*ep\s+[0-9]+ \|//; s/\| ‖W‖.*//' | tr -s ' ')

    # ● still running, ✓ stopped at the final epoch, ✗ stopped before it.
    state_icon="$(green "●")"
    if [ "$is_active" = "no" ]; then
        if [ "$last_ep" = "$total_ep" ]; then
            state_icon="$(green "✓")"
        else
            state_icon="$(red "✗")"
        fi
    fi

    printf "    %s %-44s %-7s %-9s %-8s %-8s %s\n" \
           "$state_icon" "$rid" "$last_ep" "$pct" "$ckpts" "$best_ood" "${latest:0:60}"
done

# 4. Disk pressure
# Summarised size of the runs directory, shown as "<size> (<path>)"; prints
# nothing under the heading when du produces no output.
printf '\n%s\n' "$(bold "Disk usage on experiments/runs")"
if read -r du_size du_path _ < <(du -sh experiments/runs/ 2>/dev/null); then
    printf '    %s (%s)\n' "$du_size" "$du_path"
fi

# Footer hint for running the dashboard on a refresh loop.
printf '\n%s\n' "$(gray "Re-run with: watch -n 30 bash scripts/monitor_all.sh")"