thomas-schweich committed on
Commit
5a4ed63
·
1 Parent(s): 2aee25d

Monitor script: show step time, games/sec, ETA from synced metrics

Browse files
Files changed (1) hide show
  1. scripts/monitor_training.sh +45 -10
scripts/monitor_training.sh CHANGED
@@ -42,16 +42,6 @@ if [ -n "$SSH" ]; then
42
  echo "=== Process Status ==="
43
  $SSH "pgrep -f train_all > /dev/null && echo RUNNING || echo STOPPED" 2>/dev/null || echo " (SSH failed)"
44
 
45
- echo ""
46
- echo "=== Latest Metrics ==="
47
- $SSH 'for f in /opt/pawn/logs/run_*/metrics.jsonl; do
48
- [ -f "$f" ] || continue
49
- name=$(basename $(dirname "$f"))
50
- last=$(tail -1 "$f" 2>/dev/null)
51
- step=$(echo "$last" | python3 -c "import json,sys; d=json.loads(sys.stdin.read()); print(f\"step={d.get(\"step\",\"?\"):>6} loss={d.get(\"train/loss\",d.get(\"val/loss\",\"?\")):>8.4f} acc={d.get(\"train/accuracy\",d.get(\"val/accuracy\",\"?\")):>6.3f}\")" 2>/dev/null)
52
- echo " $name: $step"
53
- done' 2>/dev/null || echo " (SSH failed)"
54
-
55
  echo ""
56
  echo "=== Metrics Sync ==="
57
  rsync -az --include='*/' --include='metrics.jsonl' --include='config.json' --exclude='*' \
@@ -59,6 +49,51 @@ if [ -n "$SSH" ]; then
59
  "root@$HOST:/opt/pawn/logs/" logs/ 2>/dev/null && echo " Synced" || echo " (Sync failed)"
60
  fi
61
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
62
  echo ""
63
  echo "=== HuggingFace Checkpoints ==="
64
  uv run python3 -c "
 
42
  echo "=== Process Status ==="
43
  $SSH "pgrep -f train_all > /dev/null && echo RUNNING || echo STOPPED" 2>/dev/null || echo " (SSH failed)"
44
 
 
 
 
 
 
 
 
 
 
 
45
  echo ""
46
  echo "=== Metrics Sync ==="
47
  rsync -az --include='*/' --include='metrics.jsonl' --include='config.json' --exclude='*' \
 
49
  "root@$HOST:/opt/pawn/logs/" logs/ 2>/dev/null && echo " Synced" || echo " (Sync failed)"
50
  fi
51
 
52
+ # Show metrics from local synced files (works with or without SSH)
53
+ echo ""
54
+ echo "=== Training Progress ==="
55
+ python3 -c "
56
+ import json, statistics, glob, os
57
+
58
+ for f in sorted(glob.glob('logs/run_*/metrics.jsonl')):
59
+ run = os.path.basename(os.path.dirname(f))
60
+ recs = []
61
+ with open(f) as fh:
62
+ for line in fh:
63
+ try: recs.append(json.loads(line.strip()))
64
+ except: pass
65
+
66
+ train = [r for r in recs if r.get('type') == 'train' and r.get('step', 0) > 10]
67
+ val = [r for r in recs if r.get('type') == 'val']
68
+ if not train:
69
+ continue
70
+
71
+ last = train[-1]
72
+ times = [r['step_time'] for r in train if 'step_time' in r]
73
+ gps = [r['games_per_sec'] for r in train if 'games_per_sec' in r]
74
+ med_t = statistics.median(times) if times else 0
75
+ med_gps = statistics.median(gps) if gps else 0
76
+
77
+ step = last.get('step', 0)
78
+ loss = last.get('train/loss', 0)
79
+ acc = last.get('train/accuracy', 0)
80
+
81
+ # Val metrics
82
+ val_str = ''
83
+ if val:
84
+ lv = val[-1]
85
+ val_str = f\" val_loss={lv.get('val/loss',0):.4f}\"
86
+
87
+ # ETA
88
+ cfg = next((r for r in recs if r.get('type') == 'config'), {})
89
+ total = cfg.get('training', {}).get('total_steps', 100000)
90
+ remaining_h = (total - step) * med_t / 3600 if med_t else 0
91
+
92
+ print(f' {run}')
93
+ print(f' step {step:>6}/{total} loss={loss:.4f} acc={acc:.3f}{val_str}')
94
+ print(f' {med_t:.3f}s/step {med_gps:.0f} g/s ETA {remaining_h:.1f}h')
95
+ " 2>/dev/null || echo " (no local metrics)"
96
+
97
  echo ""
98
  echo "=== HuggingFace Checkpoints ==="
99
  uv run python3 -c "