Commit ·
660f2d0
1
Parent(s): a217e81
Remove hardcoded IP from monitor script, resolve SSH via runpodctl
Browse files
- scripts/monitor_training.sh +48 -15
scripts/monitor_training.sh
CHANGED
|
@@ -1,18 +1,57 @@
|
|
| 1 |
#!/usr/bin/env bash
|
| 2 |
# Monitor multi-model training: check pod log + HuggingFace checkpoints.
|
| 3 |
-
# Usage: bash scripts/monitor_training.sh <
|
|
|
|
|
|
|
|
|
|
| 4 |
set -euo pipefail
|
| 5 |
|
| 6 |
-
|
| 7 |
-
|
| 8 |
-
SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -p $PORT root@$HOST"
|
| 9 |
|
| 10 |
-
|
| 11 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 12 |
|
| 13 |
-
|
| 14 |
-
echo "===
|
| 15 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 16 |
|
| 17 |
echo ""
|
| 18 |
echo "=== HuggingFace Checkpoints ==="
|
|
@@ -32,9 +71,3 @@ for variant in ['small', 'base', 'large']:
|
|
| 32 |
except Exception as e:
|
| 33 |
print(f' {repo}: {e}')
|
| 34 |
" 2>/dev/null || echo " (HF check failed)"
|
| 35 |
-
|
| 36 |
-
echo ""
|
| 37 |
-
echo "=== Metrics Sync ==="
|
| 38 |
-
rsync -az --include='*/' --include='metrics.jsonl' --include='config.json' --exclude='*' \
|
| 39 |
-
-e "ssh -o StrictHostKeyChecking=accept-new -p $PORT" \
|
| 40 |
-
"root@$HOST:/workspace/logs/" logs/ 2>/dev/null && echo " Synced" || echo " (Sync failed)"
|
|
|
|
| 1 |
#!/usr/bin/env bash
|
| 2 |
# Monitor multi-model training: check pod log + HuggingFace checkpoints.
|
| 3 |
+
# Usage: bash scripts/monitor_training.sh [<pod-id>]
|
| 4 |
+
#
|
| 5 |
+
# If pod-id is given, resolves SSH host/port via runpodctl.
|
| 6 |
+
# Otherwise checks HuggingFace only (no SSH).
|
| 7 |
set -euo pipefail
|
| 8 |
|
| 9 |
+
POD_ID="${1:-}"
|
| 10 |
+
SSH=""
|
|
|
|
| 11 |
|
| 12 |
+
if [ -n "$POD_ID" ]; then
|
| 13 |
+
# Resolve SSH connection from runpodctl
|
| 14 |
+
ssh_info=$(runpodctl pod get "$POD_ID" 2>/dev/null | python3 -c "
|
| 15 |
+
import json, sys
|
| 16 |
+
d = json.load(sys.stdin)
|
| 17 |
+
ssh = d.get('ssh', {})
|
| 18 |
+
host = ssh.get('host', '')
|
| 19 |
+
port = ssh.get('port', '')
|
| 20 |
+
status = ssh.get('status', '')
|
| 21 |
+
error = ssh.get('error', '')
|
| 22 |
+
if host and port:
|
| 23 |
+
print(f'{host} {port}')
|
| 24 |
+
elif error:
|
| 25 |
+
print(f'ERROR {error}')
|
| 26 |
+
else:
|
| 27 |
+
print(f'ERROR status={status}')
|
| 28 |
+
" 2>/dev/null || echo "ERROR runpodctl-failed")
|
| 29 |
|
| 30 |
+
if [[ "$ssh_info" == ERROR* ]]; then
|
| 31 |
+
echo "=== Pod Status ==="
|
| 32 |
+
echo " Pod $POD_ID: ${ssh_info#ERROR }"
|
| 33 |
+
echo ""
|
| 34 |
+
else
|
| 35 |
+
HOST=$(echo "$ssh_info" | cut -d' ' -f1)
|
| 36 |
+
PORT=$(echo "$ssh_info" | cut -d' ' -f2)
|
| 37 |
+
SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -p $PORT root@$HOST"
|
| 38 |
+
fi
|
| 39 |
+
fi
|
| 40 |
+
|
| 41 |
+
if [ -n "$SSH" ]; then
|
| 42 |
+
echo "=== Training Log ==="
|
| 43 |
+
$SSH "tail -15 /opt/pawn/logs/*/metrics.jsonl 2>/dev/null | tail -15" 2>/dev/null || echo " (SSH failed)"
|
| 44 |
+
|
| 45 |
+
echo ""
|
| 46 |
+
echo "=== Process Status ==="
|
| 47 |
+
$SSH "pgrep -f train_all > /dev/null && echo RUNNING || echo STOPPED" 2>/dev/null || echo " (SSH failed)"
|
| 48 |
+
|
| 49 |
+
echo ""
|
| 50 |
+
echo "=== Metrics Sync ==="
|
| 51 |
+
rsync -az --include='*/' --include='metrics.jsonl' --include='config.json' --exclude='*' \
|
| 52 |
+
-e "ssh -o StrictHostKeyChecking=accept-new -p $PORT" \
|
| 53 |
+
"root@$HOST:/opt/pawn/logs/" logs/ 2>/dev/null && echo " Synced" || echo " (Sync failed)"
|
| 54 |
+
fi
|
| 55 |
|
| 56 |
echo ""
|
| 57 |
echo "=== HuggingFace Checkpoints ==="
|
|
|
|
| 71 |
except Exception as e:
|
| 72 |
print(f' {repo}: {e}')
|
| 73 |
" 2>/dev/null || echo " (HF check failed)"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|