File size: 3,298 Bytes
f5fd4da
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env bash
# Migrate an Optuna sweep from one Runpod pod to another.
#
# Usage:
#   bash scripts/migrate_sweep.sh <old-pod-id> <new-pod-id> [remote-dir]
#
# Steps:
#   1. Resolves SSH for both pods via runpodctl
#   2. Syncs sweep directory from old pod to local staging
#   3. Syncs from local staging to new pod
#   4. Reports what was transferred
#
# The remote directory defaults to /workspace/sweeps.
set -euo pipefail

OLD_POD="${1:?Usage: migrate_sweep.sh <old-pod-id> <new-pod-id> [remote-dir]}"
NEW_POD="${2:?Usage: migrate_sweep.sh <old-pod-id> <new-pod-id> [remote-dir]}"
REMOTE_DIR="${3:-/workspace/sweeps}"
LOCAL_STAGING="/tmp/pawn_sweep_migrate"

resolve_ssh() {
    local pod_id="$1"
    runpodctl pod get "$pod_id" 2>/dev/null | python3 -c "
import json, sys
d = json.load(sys.stdin)
ssh = d.get('ssh', {})
ip = ssh.get('ip', '') or ssh.get('host', '')
port = ssh.get('port', '')
if ip and port:
    print(f'{ip} {port}')
else:
    print('ERROR ' + ssh.get('error', 'unknown'))
" 2>/dev/null
}

echo "=== Migrate Sweep ==="
echo "Old pod: $OLD_POD"
echo "New pod: $NEW_POD"
echo "Remote dir: $REMOTE_DIR"
echo ""

# Resolve SSH for old pod
echo "Resolving old pod SSH..."
old_ssh=$(resolve_ssh "$OLD_POD")
if [[ "$old_ssh" == ERROR* ]]; then
    echo "  Old pod SSH failed: $old_ssh"
    exit 1
fi
OLD_HOST=$(echo "$old_ssh" | cut -d' ' -f1)
OLD_PORT=$(echo "$old_ssh" | cut -d' ' -f2)
echo "  $OLD_HOST:$OLD_PORT"

# Resolve SSH for new pod
echo "Resolving new pod SSH..."
new_ssh=$(resolve_ssh "$NEW_POD")
if [[ "$new_ssh" == ERROR* ]]; then
    echo "  New pod SSH failed: $new_ssh"
    exit 1
fi
NEW_HOST=$(echo "$new_ssh" | cut -d' ' -f1)
NEW_PORT=$(echo "$new_ssh" | cut -d' ' -f2)
echo "  $NEW_HOST:$NEW_PORT"

# Pull from old pod
echo ""
echo "Pulling from old pod..."
mkdir -p "$LOCAL_STAGING"
rsync -az --progress \
    -e "ssh -o StrictHostKeyChecking=accept-new -p $OLD_PORT" \
    "root@$OLD_HOST:$REMOTE_DIR/" "$LOCAL_STAGING/"
echo "  Pulled to $LOCAL_STAGING"

# Show what we got
echo ""
echo "=== Transferred ==="
du -sh "$LOCAL_STAGING"
find "$LOCAL_STAGING" -name "study.db" -exec ls -lh {} \;
echo "Trial dirs: $(find "$LOCAL_STAGING" -maxdepth 2 -name 'trial_*' -type d | wc -l)"
echo "Metrics files: $(find "$LOCAL_STAGING" -name 'metrics.jsonl' | wc -l)"

# Push to new pod
echo ""
echo "Pushing to new pod..."
ssh -o StrictHostKeyChecking=accept-new -p "$NEW_PORT" "root@$NEW_HOST" "mkdir -p $REMOTE_DIR"
rsync -az --no-owner --no-group --progress \
    -e "ssh -o StrictHostKeyChecking=accept-new -p $NEW_PORT" \
    "$LOCAL_STAGING/" "root@$NEW_HOST:$REMOTE_DIR/"
echo "  Pushed to $NEW_HOST:$REMOTE_DIR"

# Inject HF token on new pod
if [ -f "$HOME/.cache/huggingface/token" ]; then
    echo ""
    echo "Injecting HF token on new pod..."
    cat "$HOME/.cache/huggingface/token" | ssh -p "$NEW_PORT" "root@$NEW_HOST" \
        'mkdir -p /root/.cache/huggingface && cat > /root/.cache/huggingface/token'
    echo "  Done"
fi

echo ""
echo "=== Migration complete ==="
echo "Old pod ($OLD_POD) can now be stopped."
echo "Resume sweep on new pod ($NEW_POD) with:"
echo "  python scripts/sweep.py --adapter architecture --checkpoint dummy \\"
echo "    --n-trials 30 --n-jobs 1 --n-gpus 1 --total-steps 20000 \\"
echo "    --output-dir $REMOTE_DIR"