thomas-schweich committed on
Commit
660f2d0
·
1 Parent(s): a217e81

Remove hardcoded IP from monitor script, resolve SSH via runpodctl

Browse files
Files changed (1) hide show
  1. scripts/monitor_training.sh +48 -15
scripts/monitor_training.sh CHANGED
@@ -1,18 +1,57 @@
1
  #!/usr/bin/env bash
2
  # Monitor multi-model training: check pod log + HuggingFace checkpoints.
3
- # Usage: bash scripts/monitor_training.sh <host> <port>
 
 
 
4
  set -euo pipefail
5
 
6
- HOST="${1:-50.145.48.110}"
7
- PORT="${2:-13321}"
8
- SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -p $PORT root@$HOST"
9
 
10
- echo "=== Training Log ==="
11
- $SSH "tail -15 /workspace/logs/train_all.log" 2>/dev/null || echo " (SSH failed)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
 
13
- echo ""
14
- echo "=== Process Status ==="
15
- $SSH "pgrep -f train_all > /dev/null && echo RUNNING || echo STOPPED" 2>/dev/null || echo " (SSH failed)"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
  echo ""
18
  echo "=== HuggingFace Checkpoints ==="
@@ -32,9 +71,3 @@ for variant in ['small', 'base', 'large']:
32
  except Exception as e:
33
  print(f' {repo}: {e}')
34
  " 2>/dev/null || echo " (HF check failed)"
35
-
36
- echo ""
37
- echo "=== Metrics Sync ==="
38
- rsync -az --include='*/' --include='metrics.jsonl' --include='config.json' --exclude='*' \
39
- -e "ssh -o StrictHostKeyChecking=accept-new -p $PORT" \
40
- "root@$HOST:/workspace/logs/" logs/ 2>/dev/null && echo " Synced" || echo " (Sync failed)"
 
1
  #!/usr/bin/env bash
2
  # Monitor multi-model training: check pod log + HuggingFace checkpoints.
3
+ # Usage: bash scripts/monitor_training.sh [<pod-id>]
4
+ #
5
+ # If pod-id is given, resolves SSH host/port via runpodctl.
6
+ # Otherwise checks HuggingFace only (no SSH).
7
  set -euo pipefail
8
 
9
+ POD_ID="${1:-}"
10
+ SSH=""
 
11
 
12
+ if [ -n "$POD_ID" ]; then
13
+ # Resolve SSH connection from runpodctl
14
+ ssh_info=$(runpodctl pod get "$POD_ID" 2>/dev/null | python3 -c "
15
+ import json, sys
16
+ d = json.load(sys.stdin)
17
+ ssh = d.get('ssh', {})
18
+ host = ssh.get('host', '')
19
+ port = ssh.get('port', '')
20
+ status = ssh.get('status', '')
21
+ error = ssh.get('error', '')
22
+ if host and port:
23
+ print(f'{host} {port}')
24
+ elif error:
25
+ print(f'ERROR {error}')
26
+ else:
27
+ print(f'ERROR status={status}')
28
+ " 2>/dev/null || echo "ERROR runpodctl-failed")
29
 
30
+ if [[ "$ssh_info" == ERROR* ]]; then
31
+ echo "=== Pod Status ==="
32
+ echo " Pod $POD_ID: ${ssh_info#ERROR }"
33
+ echo ""
34
+ else
35
+ HOST=$(echo "$ssh_info" | cut -d' ' -f1)
36
+ PORT=$(echo "$ssh_info" | cut -d' ' -f2)
37
+ SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -p $PORT root@$HOST"
38
+ fi
39
+ fi
40
+
41
+ if [ -n "$SSH" ]; then
42
+ echo "=== Training Log ==="
43
+ $SSH "tail -15 /opt/pawn/logs/*/metrics.jsonl 2>/dev/null | tail -15" 2>/dev/null || echo " (SSH failed)"
44
+
45
+ echo ""
46
+ echo "=== Process Status ==="
47
+ $SSH "pgrep -f train_all > /dev/null && echo RUNNING || echo STOPPED" 2>/dev/null || echo " (SSH failed)"
48
+
49
+ echo ""
50
+ echo "=== Metrics Sync ==="
51
+ rsync -az --include='*/' --include='metrics.jsonl' --include='config.json' --exclude='*' \
52
+ -e "ssh -o StrictHostKeyChecking=accept-new -p $PORT" \
53
+ "root@$HOST:/opt/pawn/logs/" logs/ 2>/dev/null && echo " Synced" || echo " (Sync failed)"
54
+ fi
55
 
56
  echo ""
57
  echo "=== HuggingFace Checkpoints ==="
 
71
  except Exception as e:
72
  print(f' {repo}: {e}')
73
  " 2>/dev/null || echo " (HF check failed)"