File size: 4,076 Bytes
87b2fa6
 
660f2d0
 
 
 
87b2fa6
 
660f2d0
 
87b2fa6
660f2d0
 
 
 
 
 
a47b56d
660f2d0
 
 
 
 
 
 
 
 
 
87b2fa6
660f2d0
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87b2fa6
5a4ed63
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87b2fa6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
#!/usr/bin/env bash
# Monitor multi-model training: check pod log + HuggingFace checkpoints.
# Usage: bash scripts/monitor_training.sh [<pod-id>]
#
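# If pod-id is given, resolves SSH host/port via runpodctl, reports whether
# training is running on the pod, and rsyncs metrics into ./logs.
# Otherwise skips SSH and reports only local metrics and HuggingFace checkpoints.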
set -euo pipefail

# Optional first argument: the pod id. When empty, every SSH step is skipped.
POD_ID="${1:-}"
SSH=""

if [ -n "$POD_ID" ]; then
    # Feed the runpodctl output to a small Python filter that prints a single
    # line: either <host> <port>, or ERROR <reason>. Any failure in the
    # pipeline (pipefail is on) falls back to the runpodctl-failed marker.
    pod_conn=$(runpodctl pod get "$POD_ID" 2>/dev/null | python3 -c "
import json, sys
info = json.load(sys.stdin).get('ssh', {})
host = info.get('ip', '') or info.get('host', '')
port = info.get('port', '')
if host and port:
    print(f'{host} {port}')
else:
    error = info.get('error', '')
    status = info.get('status', '')
    print(f'ERROR {error}' if error else f'ERROR status={status}')
" 2>/dev/null || echo "ERROR runpodctl-failed")

    if [[ "$pod_conn" != ERROR* ]]; then
        # Success: split the line into its two space-separated fields and
        # build the ssh command prefix used by the later remote checks.
        HOST=$(cut -d' ' -f1 <<< "$pod_conn")
        PORT=$(cut -d' ' -f2 <<< "$pod_conn")
        SSH="ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -p $PORT root@$HOST"
    else
        # Failure: report the reason (with the ERROR prefix stripped) and
        # leave SSH empty so the remote checks are skipped.
        echo "=== Pod Status ==="
        echo "  Pod $POD_ID: ${pod_conn#ERROR }"
        echo ""
    fi
fi

# Remote checks: only run when an SSH command was resolved for the pod.
if [ -n "$SSH" ]; then
    echo "=== Process Status ==="
    # pgrep -f matches against full command lines, and the remote shell that
    # runs this compound command has the pattern text in its own command line.
    # Writing the pattern as [t]rain_all still matches train_all but no longer
    # matches that shell, so a stopped run is not reported as RUNNING.
    # $SSH is intentionally unquoted: it holds the ssh command plus its options.
    $SSH "pgrep -f '[t]rain_all' > /dev/null && echo RUNNING || echo STOPPED" 2>/dev/null || echo "  (SSH failed)"

    echo ""
    echo "=== Metrics Sync ==="
    # Pull only metrics.jsonl and config.json (keeping the directory layout)
    # from the pod into ./logs. The transport uses the same ConnectTimeout as
    # the ssh command above so an unreachable pod fails fast instead of hanging.
    rsync -az --include='*/' --include='metrics.jsonl' --include='config.json' --exclude='*' \
        -e "ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -p $PORT" \
        "root@$HOST:/opt/pawn/logs/" logs/ 2>/dev/null && echo "  Synced" || echo "  (Sync failed)"
fi

# Show metrics from local synced files (works with or without SSH).
# The embedded Python uses only single-quoted strings so the whole program can
# sit inside one double-quoted shell argument. It exits non-zero when no run
# has usable train records, so the fallback message below is printed in that
# case too instead of leaving the section blank.
echo ""
echo "=== Training Progress ==="
python3 -c "
import glob, json, os, statistics, sys


def load_records(path):
    # One JSON object per line; blank, malformed and non-object lines are skipped.
    records = []
    with open(path) as fh:
        for line in fh:
            try:
                rec = json.loads(line.strip())
            except ValueError:
                continue
            if isinstance(rec, dict):
                records.append(rec)
    return records


def num(value, default=0):
    # Missing, null or non-numeric metrics fall back to a default so that
    # comparisons and number formatting cannot raise.
    return value if isinstance(value, (int, float)) else default


shown = 0
for path in sorted(glob.glob('logs/run_*/metrics.jsonl')):
    run = os.path.basename(os.path.dirname(path))
    try:
        recs = load_records(path)
    except OSError:
        # One unreadable file should not hide the other runs.
        continue

    # Steps up to 10 are left out of the train records.
    train = [r for r in recs if r.get('type') == 'train' and num(r.get('step')) > 10]
    val = [r for r in recs if r.get('type') == 'val']
    if not train:
        continue

    last = train[-1]
    times = [r['step_time'] for r in train if isinstance(r.get('step_time'), (int, float))]
    gps = [r['games_per_sec'] for r in train if isinstance(r.get('games_per_sec'), (int, float))]
    med_t = statistics.median(times) if times else 0
    med_gps = statistics.median(gps) if gps else 0

    step = num(last.get('step'))
    loss = num(last.get('train/loss'))
    acc = num(last.get('train/accuracy'))

    # Latest validation loss, if any validation record exists.
    val_str = ''
    if val:
        val_loss = num(val[-1].get('val/loss'))
        val_str = f'  val_loss={val_loss:.4f}'

    # ETA from the median step time; total_steps comes from the first config
    # record and defaults to 100000 when absent or not a number.
    cfg = next((r for r in recs if r.get('type') == 'config'), {})
    training_cfg = cfg.get('training')
    total = num(training_cfg.get('total_steps'), 100000) if isinstance(training_cfg, dict) else 100000
    # Clamp at zero so a run that went past total_steps shows no negative ETA.
    remaining_h = max(total - step, 0) * med_t / 3600 if med_t else 0

    print(f'  {run}')
    print(f'    step {step:>6}/{total}  loss={loss:.4f}  acc={acc:.3f}{val_str}')
    print(f'    {med_t:.3f}s/step  {med_gps:.0f} g/s  ETA {remaining_h:.1f}h')
    shown += 1

# Non-zero exit triggers the shell fallback message when nothing was shown.
sys.exit(0 if shown else 1)
" 2>/dev/null || echo "  (no local metrics)"

echo ""
echo "=== HuggingFace Checkpoints ==="
# For each model variant, list the run/* branches of its HuggingFace repo and
# report how many checkpoints/step_* entries each branch holds and which one
# is the latest. Runs through uv; any failure of the interpreter itself
# falls through to the fallback message.
uv run python3 -c "
import itertools
from huggingface_hub import HfApi


def step_number(name):
    # Numeric step taken from the digits right after the step_ prefix;
    # -1 when there are none, so such names sort first.
    digits = ''.join(itertools.takewhile(str.isdecimal, name[len('step_'):]))
    return int(digits) if digits else -1


api = HfApi()
for variant in ['small', 'base', 'large']:
    repo = f'thomas-schweich/pawn-{variant}'
    try:
        refs = api.list_repo_refs(repo, repo_type='model')
        branches = [b.name for b in refs.branches if b.name.startswith('run/')]
        for branch in branches:
            tree = api.list_repo_tree(repo, revision=branch, repo_type='model', recursive=True)
            # Entries without an rfilename attribute are skipped.
            files = [f.rfilename for f in tree if hasattr(f, 'rfilename') and 'checkpoints/' in f.rfilename]
            names = {f.split('/')[1] for f in files if f.startswith('checkpoints/step_')}
            # Sort by numeric step, not by text: as plain strings step_9000
            # would sort after step_10000 and be shown as the latest.
            ckpts = sorted(names, key=lambda n: (step_number(n), n))
            latest = ckpts[-1] if ckpts else 'none'
            print(f'  {repo}@{branch}: {len(ckpts)} checkpoints ({latest})')
        if not branches:
            print(f'  {repo}: no run branches')
    except Exception as e:
        # Best effort per repo: report the error and move on to the next one.
        print(f'  {repo}: {e}')
" 2>/dev/null || echo "  (HF check failed)"