#!/usr/bin/env bash
# Manage Runpod GPU pods for PAWN experiments.
#
# Usage:
# pod.sh create <name> [--gpu <type>] [--count <n>] [--disk <gb>] [--volume <gb>] [--community]
# pod.sh start <name>
# pod.sh stop <name>
# pod.sh delete <name>
# pod.sh ssh <name>
# pod.sh list
# pod.sh gpus
# pod.sh status <name>
# pod.sh setup <name> # Run setup.sh on the pod
# pod.sh deploy <name> # Build, transfer, and setup in one step
# pod.sh launch <name> <cmd> # Run a training command via nohup
#
# Pod configs are cached in ~/.config/pawn/pods/<name>.env
# Requires: runpodctl (wget -qO- cli.runpod.net | sudo bash)
#
# GPU type shortcuts (mapped to runpodctl --gpu-id values):
# a5000 -> "NVIDIA RTX A5000"
# a40 -> "NVIDIA A40"
# a6000 -> "NVIDIA RTX 6000 Ada Generation"
# 4090 -> "NVIDIA GeForce RTX 4090"
# 5090 -> "NVIDIA GeForce RTX 5090"
# l40s -> "NVIDIA L40S"
# a100-pcie -> "NVIDIA A100 80GB PCIe"
# a100-sxm -> "NVIDIA A100-SXM4-80GB"
# h100 -> "NVIDIA H100 80GB HBM3"
# h200 -> "NVIDIA H200"
set -euo pipefail

# Repo root: this script lives one directory below it (e.g. <repo>/deploy/).
REPO="$(cd "$(dirname "$0")/.." && pwd)"
readonly REPO

# Local cache of per-pod connection configs (one <name>.env per pod).
POD_DIR="$HOME/.config/pawn/pods"
readonly POD_DIR
mkdir -p "$POD_DIR"

# Default pod settings — constants, overridable per-create via flags.
readonly DEFAULT_GPU="NVIDIA RTX A5000"
readonly DEFAULT_CONTAINER_DISK=20
readonly DEFAULT_VOLUME_DISK=75
readonly DEFAULT_IMAGE="runpod/pytorch:1.0.3-cu1281-torch280-ubuntu2404"
# --- Helpers ---
gpu_shortcut() {
  # Translate a short GPU alias (case-insensitive) into the full
  # runpodctl --gpu-id string. Unknown values pass through untouched.
  local key="${1,,}"
  local -A full_names=(
    [a5000]="NVIDIA RTX A5000"
    [a40]="NVIDIA A40"
    [a6000]="NVIDIA RTX 6000 Ada Generation"
    [4090]="NVIDIA GeForce RTX 4090"
    [5090]="NVIDIA GeForce RTX 5090"
    [l40s]="NVIDIA L40S"
    [a100-pcie]="NVIDIA A100 80GB PCIe"
    [a100-sxm]="NVIDIA A100-SXM4-80GB"
    [a100]="NVIDIA A100 80GB PCIe"
    [h100]="NVIDIA H100 80GB HBM3"
    [h200]="NVIDIA H200"
  )
  if [[ -n "$key" && -v full_names[$key] ]]; then
    echo "${full_names[$key]}"
  else
    echo "$1"
  fi
}
save_pod_config() {
  # Persist a pod's connection details to $POD_DIR/<name>.env so later
  # commands (ssh/stop/launch/...) can reconnect without querying the API.
  local name="$1" pod_id="$2" host="$3" port="$4" gpu="$5"
  printf 'POD_ID=%s\nPOD_HOST=%s\nPOD_PORT=%s\nPOD_GPU=%s\n' \
    "$pod_id" "$host" "$port" "$gpu" > "$POD_DIR/$name.env"
  echo "Saved pod config to $POD_DIR/$name.env"
}
load_pod_config() {
  # Source POD_ID/POD_HOST/POD_PORT/POD_GPU for <name> into the caller's
  # shell. Unknown names print the available pods and abort the script.
  local name="$1" cfg
  cfg="$POD_DIR/$name.env"
  if [[ -f "$cfg" ]]; then
    source "$cfg"
    return 0
  fi
  echo "Error: Pod '$name' not found. Available pods:" >&2
  list_local_pods >&2
  exit 1
}
list_local_pods() {
  # Print one " <name> (id=..., gpu=...)" line per saved pod config.
  # Each env file is sourced in a subshell, with the relevant variables
  # unset first, so a pod that omits a key cannot inherit the previous
  # pod's value — and nothing leaks into the caller's shell.
  local f
  for f in "$POD_DIR"/*.env; do
    [ -f "$f" ] || continue   # unmatched glob leaves the literal pattern
    (
      unset POD_ID POD_GPU
      source "$f"
      echo " $(basename "${f%.env}") (id=${POD_ID:-unknown}, gpu=${POD_GPU:-unknown})"
    )
  done
}
wait_for_pod_running() {
  # Poll `runpodctl pod get` until the pod reports a GPU and SSH actually
  # accepts a connection, then cache the connection info locally.
  # Args: $1 = pod id, $2 = local config name.
  # Returns 0 once reachable, 1 after ~5 minutes (60 polls x 5s) of waiting.
  local pod_id="$1" name="$2"
  local attempt info host port gpu
  echo -n "Waiting for pod to be ready"
  for attempt in $(seq 1 60); do
    # Fetch the pod state ONCE per iteration (the original fetched it twice)
    # and tolerate transient API errors by treating them as "not ready yet".
    info=$(runpodctl pod get "$pod_id" 2>/dev/null) || info=""
    if grep -q '"gpuDisplayName"' <<< "$info"; then
      host=$(grep -oP '"ip"\s*:\s*"\K[^"]+' <<< "$info" || true)
      port=$(grep -oP '"publicPort"\s*:\s*\K\d+' <<< "$info" || true)
      if [ -n "$host" ] && [ -n "$port" ]; then
        # The API can advertise SSH details before sshd is up, so only
        # declare success once a real SSH command round-trips.
        if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=5 \
            -p "$port" "root@$host" "echo ok" &>/dev/null; then
          echo " ready!"
          gpu=$(grep -oP '"gpuDisplayName"\s*:\s*"\K[^"]+' <<< "$info" || echo "unknown")
          save_pod_config "$name" "$pod_id" "$host" "$port" "$gpu"
          return 0
        fi
      fi
    fi
    echo -n "."
    sleep 5
  done
  echo " timeout!"
  echo "Pod may still be starting. Check: runpodctl pod get $pod_id"
  return 1
}
ssh_opts() {
  # Emit the shared SSH flags for the currently loaded pod. Callers expand
  # the output unquoted on purpose so it word-splits into separate args.
  printf '%s\n' "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -p $POD_PORT"
}
# --- Commands ---
cmd_create() {
  # Create a new Runpod pod named "pawn-<name>", wait for SSH to come up,
  # and cache the connection info locally.
  # Options: --gpu <type|shortcut>, --count <n>, --disk <gb>, --volume <gb>,
  #          --image <img>, --community (cheaper community cloud).
  local name="" gpu="$DEFAULT_GPU" gpu_count=1
  local container_disk="$DEFAULT_CONTAINER_DISK"
  local volume_disk="$DEFAULT_VOLUME_DISK" image="$DEFAULT_IMAGE"
  local cloud_type="SECURE"
  name="${1:-}"
  shift || true
  while [ $# -gt 0 ]; do
    # ${2:?...} aborts with a clear message when a flag's value is missing,
    # instead of an opaque unbound-variable error from `shift 2` under set -u.
    case "$1" in
      --gpu) gpu="$(gpu_shortcut "${2:?--gpu requires a value}")"; shift 2 ;;
      --count) gpu_count="${2:?--count requires a value}"; shift 2 ;;
      --disk) container_disk="${2:?--disk requires a value}"; shift 2 ;;
      --volume) volume_disk="${2:?--volume requires a value}"; shift 2 ;;
      --image) image="${2:?--image requires a value}"; shift 2 ;;
      --community) cloud_type="COMMUNITY"; shift ;;
      *) echo "Unknown option: $1"; exit 1 ;;
    esac
  done
  if [ -z "$name" ]; then
    echo "Usage: $0 create <name> [--gpu <type>] [--count <n>] [--disk <gb>] [--volume <gb>] [--community]"
    exit 1
  fi
  echo "Creating pod '$name'..."
  echo " GPU: ${gpu_count}x $gpu"
  echo " Cloud: $cloud_type"
  echo " Container disk: ${container_disk}GB"
  echo " Volume disk: ${volume_disk}GB"
  echo " Image: $image"
  echo ""
  local output
  # `|| true` keeps set -e from aborting on a failed create, so the captured
  # stderr is actually shown and the pod-id error path below is reachable.
  output=$(runpodctl pod create \
    --name "pawn-$name" \
    --gpu-id "$gpu" \
    --gpu-count "$gpu_count" \
    --image "$image" \
    --container-disk-in-gb "$container_disk" \
    --volume-in-gb "$volume_disk" \
    --cloud-type "$cloud_type" \
    2>&1) || true
  echo "$output"
  local pod_id
  # Pod IDs are long lowercase-alphanumeric tokens; grab the first match.
  pod_id=$(echo "$output" | grep -oP '[a-z0-9]{20,}' | head -1 || true)
  if [ -z "$pod_id" ]; then
    echo "Error: Could not extract pod ID from output"
    exit 1
  fi
  echo "Pod ID: $pod_id"
  wait_for_pod_running "$pod_id" "$name"
}
cmd_start() {
  # Resume a previously stopped pod and block until SSH is reachable again.
  local name="${1:?Usage: $0 start <name>}"
  load_pod_config "$name"
  printf "Starting pod '%s' (%s)...\n" "$name" "$POD_ID"
  runpodctl pod start "$POD_ID"
  wait_for_pod_running "$POD_ID" "$name"
}
cmd_stop() {
  # Pause a running pod. The volume survives and billing stops.
  local name="${1:?Usage: $0 stop <name>}"
  load_pod_config "$name"
  printf "Stopping pod '%s' (%s)...\n" "$name" "$POD_ID"
  runpodctl pod stop "$POD_ID"
  printf 'Pod stopped. Volume data preserved. Resume with: %s start %s\n' "$0" "$name"
}
cmd_delete() {
  # Permanently destroy a pod (and all data on it) after interactive
  # confirmation, then remove the cached local config.
  local name="${1:?Usage: $0 delete <name>}"
  load_pod_config "$name"
  local confirm
  # -r keeps backslashes literal in the typed answer.
  read -r -p "Delete pod '$name' ($POD_ID)? This destroys all data. [y/N] " confirm
  if [ "${confirm,,}" != "y" ]; then
    echo "Cancelled."
    exit 0
  fi
  runpodctl pod delete "$POD_ID"
  rm -f "$POD_DIR/$name.env"
  echo "Pod deleted and config removed."
}
cmd_ssh() {
  # Open an interactive root shell on the pod.
  local name="${1:?Usage: $0 ssh <name>}"
  load_pod_config "$name"
  # shellcheck disable=SC2046 — ssh_opts output is meant to word-split
  ssh $(ssh_opts) "root@$POD_HOST"
}
cmd_list() {
  # Show pods known to the Runpod API alongside the locally cached configs.
  printf '=== Remote pods (runpodctl) ===\n'
  runpodctl pod list 2>/dev/null || echo " (runpodctl not configured or no pods)"
  printf '\n=== Local pod configs ===\n'
  list_local_pods
}
cmd_gpus() {
  # List the GPU types Runpod currently offers (pass-through to runpodctl).
  runpodctl gpu list
}
cmd_status() {
  # Dump the raw pod record from the Runpod API for a saved pod.
  local name="${1:?Usage: $0 status <name>}"
  load_pod_config "$name"
  runpodctl pod get "$POD_ID"
}
cmd_setup() {
  # Run deploy/setup.sh inside the pod's /workspace/pawn checkout.
  local name="${1:?Usage: $0 setup <name>}"
  load_pod_config "$name"
  printf "Running setup on pod '%s'...\n" "$name"
  # shellcheck disable=SC2046 — ssh_opts output is meant to word-split
  ssh $(ssh_opts) "root@$POD_HOST" "cd /workspace/pawn && bash deploy/setup.sh"
}
cmd_deploy() {
  # One-shot deploy: build the package locally, rsync it to the pod, then
  # run the remote setup script.
  local name="${1:?Usage: $0 deploy <name>}"
  load_pod_config "$name"
  printf "=== Full deploy to '%s' ===\n\n" "$name"
  printf '%s\n' "--- Step 1: Build deploy package ---"
  bash "$REPO/deploy/build.sh"
  printf '\n%s\n' "--- Step 2: Transfer to pod ---"
  # Install rsync on pod if needed
  # shellcheck disable=SC2046 — ssh_opts output is meant to word-split
  ssh $(ssh_opts) "root@$POD_HOST" "command -v rsync &>/dev/null || (apt-get update -qq && apt-get install -y -qq rsync)" 2>/dev/null
  rsync -avz --progress -e "ssh $(ssh_opts)" \
    "$REPO/deploy/pawn-deploy/" "root@$POD_HOST:/workspace/pawn/"
  printf '\n%s\n' "--- Step 3: Run setup ---"
  ssh $(ssh_opts) "root@$POD_HOST" "cd /workspace/pawn && bash deploy/setup.sh"
  printf '\n%s\n' "=== Deploy complete ==="
}
cmd_launch() {
  # Launch a long-running training command on the pod under nohup, from
  # /workspace/pawn via `uv run python`, logging to logs/<script>.log.
  local name="${1:?Usage: $0 launch <name> <command...>}"
  shift
  local cmd="$*"
  if [ -z "$cmd" ]; then
    echo "Usage: $0 launch <name> <command...>"
    echo ""
    echo "Examples:"
    echo " $0 launch exp1 scripts/train.py --variant base"
    echo " $0 launch exp1 scripts/train_bottleneck.py --checkpoint checkpoints/pawn-base.pt \\"
    echo " --pgn data/lichess_1800_1900.pgn --bottleneck-dim 32"
    exit 1
  fi
  load_pod_config "$name"
  # Derive the log-file stem from the scripts/<file>.py part of the command;
  # the $ anchor strips only a trailing ".py", not one mid-name.
  local script_name
  script_name=$(echo "$cmd" | grep -oP 'scripts/\K[^ ]+' | sed 's/\.py$//' || echo "train")
  echo "Launching on '$name': $cmd"
  # Capture the background job's PID via \$! on the remote side instead of
  # racing pgrep against any other process whose cmdline happens to match.
  # shellcheck disable=SC2046 — ssh_opts output is meant to word-split
  ssh $(ssh_opts) "root@$POD_HOST" "cd /workspace/pawn && \
    nohup uv run python $cmd \
      --log-dir logs \
      > logs/${script_name}.log 2>&1 & \
    echo \"PID: \$!\" && \
    echo 'Log: logs/${script_name}.log'"
}
# --- Main ---
# Dispatch on the first CLI argument; anything unrecognized (or nothing at
# all) falls through to the help text.
case "${1:-}" in
  create) shift; cmd_create "$@" ;;
  start) shift; cmd_start "$@" ;;
  stop) shift; cmd_stop "$@" ;;
  delete) shift; cmd_delete "$@" ;;
  ssh) shift; cmd_ssh "$@" ;;
  list) shift; cmd_list "$@" ;;
  gpus) shift; cmd_gpus "$@" ;;
  status) shift; cmd_status "$@" ;;
  setup) shift; cmd_setup "$@" ;;
  deploy) shift; cmd_deploy "$@" ;;
  launch) shift; cmd_launch "$@" ;;
  sync) shift; bash "$REPO/deploy/sync.sh" "$@" ;;
  *)
    # Single heredoc instead of an echo per line; $0 still expands.
    cat << EOF
PAWN Pod Manager

Usage: $0 <command> [args...]

Commands:
 create <name> [--gpu <type>] [--count <n>] [--disk <gb>] [--volume <gb>] [--community]
 Create a new pod (default: 1 GPU, secure cloud)
 start <name> Resume a stopped pod
 stop <name> Pause a pod (preserves volume, stops billing)
 delete <name> Destroy a pod and its data
 ssh <name> SSH into a pod
 list List all pods
 gpus List available GPU types
 status <name> Get pod details
 setup <name> Run setup.sh on the pod
 deploy <name> Build + transfer + setup (full deploy)
 launch <name> <cmd> Run a training command via nohup
 sync [name] Sync logs/checkpoints from pod(s)

GPU shortcuts: a5000, a40, a6000, 4090, 5090, l40s, a100, a100-pcie, a100-sxm, h100, h200

Examples:
 $0 create exp1 --gpu a5000
 $0 create sweep1 --gpu a100-pcie --count 2 --community
 $0 deploy exp1
 $0 launch exp1 scripts/train.py --variant base
 $0 stop exp1
EOF
    ;;
esac