#!/usr/bin/env bash
# Manage Runpod GPU pods for PAWN experiments.
#
# Usage:
#   pod.sh create <name> [--gpu <type>] [--count <n>] [--disk <gb>] [--volume <gb>]
#                        [--image <image>] [--community]
#   pod.sh start <name>
#   pod.sh stop <name>
#   pod.sh delete <name>
#   pod.sh ssh <name>
#   pod.sh list
#   pod.sh gpus
#   pod.sh status <name>
#   pod.sh setup <name>              # Run setup.sh on the pod
#   pod.sh deploy <name>             # Build, transfer, and setup in one step
#   pod.sh launch <name> <command>   # Run a training command via nohup
#   pod.sh sync [name]               # Delegates to deploy/sync.sh
#
# Pod configs are cached in ~/.config/pawn/pods/<name>.env
# Requires: bash 4+ (uses ${var,,}), runpodctl (wget -qO- cli.runpod.net | sudo bash)
#
# GPU type shortcuts (mapped to runpodctl --gpu-id values):
#   a5000     -> "NVIDIA RTX A5000"
#   a40       -> "NVIDIA A40"
#   a6000     -> "NVIDIA RTX 6000 Ada Generation"
#   4090      -> "NVIDIA GeForce RTX 4090"
#   5090      -> "NVIDIA GeForce RTX 5090"
#   l40s      -> "NVIDIA L40S"
#   a100      -> "NVIDIA A100 80GB PCIe" (alias of a100-pcie)
#   a100-pcie -> "NVIDIA A100 80GB PCIe"
#   a100-sxm  -> "NVIDIA A100-SXM4-80GB"
#   h100      -> "NVIDIA H100 80GB HBM3"
#   h200      -> "NVIDIA H200"

set -euo pipefail

REPO="$(cd "$(dirname "${BASH_SOURCE[0]}")/.." && pwd)"
POD_DIR="$HOME/.config/pawn/pods"
# Best effort only: read-only commands (help, gpus, list) must not fail just
# because the config directory cannot be created. save_pod_config creates it
# again, strictly, before writing.
mkdir -p "$POD_DIR" 2>/dev/null || true

# Default pod settings
readonly DEFAULT_GPU="NVIDIA RTX A5000"
readonly DEFAULT_CONTAINER_DISK=20
readonly DEFAULT_VOLUME_DISK=75
readonly DEFAULT_IMAGE="runpod/pytorch:1.0.3-cu1281-torch280-ubuntu2404"

# --- Helpers ---

# Print a message to stderr and exit with status 1.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Map a case-insensitive GPU shortcut ($1) to the full GPU name passed to
# `runpodctl --gpu-id`. Unknown values are echoed back unchanged so a full
# GPU name can be given directly.
gpu_shortcut() {
  case "${1,,}" in
    a5000) echo "NVIDIA RTX A5000" ;;
    a40) echo "NVIDIA A40" ;;
    # NOTE(review): "a6000" resolves to the RTX 6000 *Ada* name, not an
    # "RTX A6000" — kept as-is; confirm this is the intended GPU.
    a6000) echo "NVIDIA RTX 6000 Ada Generation" ;;
    4090) echo "NVIDIA GeForce RTX 4090" ;;
    5090) echo "NVIDIA GeForce RTX 5090" ;;
    l40s) echo "NVIDIA L40S" ;;
    a100-pcie) echo "NVIDIA A100 80GB PCIe" ;;
    a100-sxm) echo "NVIDIA A100-SXM4-80GB" ;;
    a100) echo "NVIDIA A100 80GB PCIe" ;;
    h100) echo "NVIDIA H100 80GB HBM3" ;;
    h200) echo "NVIDIA H200" ;;
    *) echo "$1" ;;
  esac
}

# Print the first string value stored under key $2 in JSON-ish text $1.
# Returns 1 when the key is not present. This is a regex scrape, not a JSON
# parser: it yields the first textual occurrence of the key.
json_string_field() {
  local json="$1" key="$2"
  local re="\"${key}\"[[:space:]]*:[[:space:]]*\"([^\"]+)\""
  [[ "$json" =~ $re ]] || return 1
  printf '%s\n' "${BASH_REMATCH[1]}"
}

# Print the first unsigned integer value stored under key $2 in JSON-ish
# text $1. Returns 1 when the key is not present.
json_number_field() {
  local json="$1" key="$2"
  local re="\"${key}\"[[:space:]]*:[[:space:]]*([0-9]+)"
  [[ "$json" =~ $re ]] || return 1
  printf '%s\n' "${BASH_REMATCH[1]}"
}

# Print the first run of 20+ lowercase alphanumerics found in $1, which is
# how a pod ID is recognised in `runpodctl pod create` output.
# Returns 1 when nothing matches.
extract_pod_id() {
  local text="$1" id
  # `|| true`: with pipefail, a no-match grep (or SIGPIPE from head) must not
  # abort the script; emptiness is checked explicitly below.
  id="$(LC_ALL=C grep -oE '[a-z0-9]{20,}' <<<"$text" | head -n 1 || true)"
  [[ -n "$id" ]] || return 1
  printf '%s\n' "$id"
}

# Derive the log/script name for `launch` from command string $1: the path
# component after the first "scripts/" with a trailing ".py" removed, or
# "train" when the command does not mention scripts/.
launch_script_name() {
  local cmd="$1" name=""
  local re='scripts/([^ ]+)'
  if [[ "$cmd" =~ $re ]]; then
    name="${BASH_REMATCH[1]%.py}"
  fi
  printf '%s\n' "${name:-train}"
}

# Write the connection details of pod <name> to $POD_DIR/<name>.env.
# Arguments: $1 name, $2 pod id, $3 host, $4 port, $5 GPU display name.
# Values are shell-quoted (%q) because the file is later read with `source`;
# an unquoted GPU name such as "NVIDIA RTX A5000" would otherwise be executed
# as a command and abort the script under `set -e`.
save_pod_config() {
  local name="$1" pod_id="$2" host="$3" port="$4" gpu="$5"
  local pod_file="$POD_DIR/$name.env"
  mkdir -p "$POD_DIR"
  {
    printf 'POD_ID=%q\n' "$pod_id"
    printf 'POD_HOST=%q\n' "$host"
    printf 'POD_PORT=%q\n' "$port"
    printf 'POD_GPU=%q\n' "$gpu"
  } > "$pod_file"
  echo "Saved pod config to $pod_file"
}

# Load $POD_DIR/<name>.env into the globals POD_ID, POD_HOST, POD_PORT and
# POD_GPU. Exits with status 1 (after listing known pods on stderr) if the
# file does not exist or lacks a required field.
load_pod_config() {
  local name="$1"
  local pod_file="$POD_DIR/$name.env"
  if [[ ! -f "$pod_file" ]]; then
    echo "Error: Pod '$name' not found. Available pods:" >&2
    list_local_pods >&2
    exit 1
  fi
  # Start clean so values from the environment or a previous load cannot leak.
  unset POD_ID POD_HOST POD_PORT POD_GPU
  # Files written before values were quoted may contain a line such as
  # `POD_GPU=RTX A5000`, which fails when sourced; tolerate that and rely on
  # the explicit field checks below instead.
  # shellcheck disable=SC1090
  source "$pod_file" 2>/dev/null || true
  if [[ -z "${POD_ID:-}" || -z "${POD_HOST:-}" || -z "${POD_PORT:-}" ]]; then
    die "Error: Pod config '$pod_file' is missing POD_ID, POD_HOST or POD_PORT"
  fi
  POD_GPU="${POD_GPU:-unknown}"
}

# Print one line per cached pod config: "  <name> (id=<id>, gpu=<gpu>)".
# Each file is sourced in a subshell so it cannot clobber the caller's POD_*
# globals or leak values from one file into the next.
list_local_pods() {
  local f n
  for f in "$POD_DIR"/*.env; do
    # An unmatched glob stays literal; skip it.
    [[ -f "$f" ]] || continue
    n="${f##*/}"
    n="${n%.env}"
    (
      unset POD_ID POD_GPU
      # shellcheck disable=SC1090
      source "$f" 2>/dev/null || true # tolerate legacy unquoted files
      printf '  %s (id=%s, gpu=%s)\n' "$n" "${POD_ID:-unknown}" "${POD_GPU:-unknown}"
    )
  done
}

# Poll `runpodctl pod get` (up to 60 attempts, 5 s apart) until the pod
# reports a GPU, exposes an ip/publicPort pair and accepts an SSH connection;
# then cache its config under <name>.
# Arguments: $1 pod id, $2 local pod name.
# Returns: 0 once the pod is reachable, 1 on timeout.
wait_for_pod_running() {
  local pod_id="$1" name="$2"
  local attempt info host port gpu
  echo -n "Waiting for pod to be ready"
  for ((attempt = 1; attempt <= 60; attempt++)); do
    # A transient runpodctl failure is just "not ready yet", not fatal.
    info="$(runpodctl pod get "$pod_id" 2>/dev/null || true)"
    if [[ "$info" == *'"gpuDisplayName"'* ]]; then
      # NOTE(review): takes the first "ip"/"publicPort" in the output and
      # assumes that is the SSH endpoint — confirm against runpodctl output
      # for pods that expose several ports.
      host="$(json_string_field "$info" ip)" || host=""
      port="$(json_number_field "$info" publicPort)" || port=""
      if [[ -n "$host" && -n "$port" ]]; then
        if ssh -o StrictHostKeyChecking=accept-new -o ConnectTimeout=5 \
          -p "$port" "root@$host" "echo ok" &>/dev/null; then
          echo " ready!"
          gpu="$(json_string_field "$info" gpuDisplayName)" || gpu="unknown"
          save_pod_config "$name" "$pod_id" "$host" "$port" "$gpu"
          return 0
        fi
      fi
    fi
    echo -n "."
    sleep 5
  done
  echo " timeout!"
  echo "Pod may still be starting. Check: runpodctl pod get $pod_id" >&2
  return 1
}

# Print the ssh options for the loaded pod as one space-separated string
# (reads POD_PORT). Intended for contexts that need a string, e.g. `rsync -e`.
ssh_opts() {
  echo "-o StrictHostKeyChecking=accept-new -o ConnectTimeout=10 -p $POD_PORT"
}

# Run ssh against the loaded pod (POD_HOST/POD_PORT); any arguments are passed
# to ssh as the remote command. Options are split into an array so nothing is
# subject to unintended globbing.
pod_ssh() {
  local -a opts
  read -ra opts <<<"$(ssh_opts)"
  ssh "${opts[@]}" "root@$POD_HOST" "$@"
}

# Run deploy/setup.sh from /workspace/pawn on the loaded pod.
run_remote_setup() {
  pod_ssh "cd /workspace/pawn && bash deploy/setup.sh"
}

# Print the top-level help text.
usage() {
  cat <<EOF
PAWN Pod Manager

Usage: $0 <command> [args...]

Commands:
  create <name> [--gpu <type>] [--count <n>] [--disk <gb>] [--volume <gb>]
                [--image <image>] [--community]
                            Create a new pod (default: 1 GPU, secure cloud)
  start <name>              Resume a stopped pod
  stop <name>               Pause a pod (preserves volume, stops billing)
  delete <name>             Destroy a pod and its data
  ssh <name>                SSH into a pod
  list                      List all pods
  gpus                      List available GPU types
  status <name>             Get pod details
  setup <name>              Run setup.sh on the pod
  deploy <name>             Build + transfer + setup (full deploy)
  launch <name> <command>   Run a training command via nohup
  sync [name]               Sync logs/checkpoints from pod(s)

GPU shortcuts: a5000, a40, a6000, 4090, 5090, l40s, a100, a100-pcie, a100-sxm, h100, h200

Examples:
  $0 create exp1 --gpu a5000
  $0 create sweep1 --gpu a100-pcie --count 2 --community
  $0 deploy exp1
  $0 launch exp1 scripts/train.py --variant base
  $0 stop exp1
EOF
}

# --- Commands ---

# create <name> [options]: create a pod named "pawn-<name>" through runpodctl,
# wait until it is reachable over SSH, and cache its config.
cmd_create() {
  local name="${1:-}"
  local gpu="$DEFAULT_GPU" gpu_count=1
  local container_disk="$DEFAULT_CONTAINER_DISK"
  local volume_disk="$DEFAULT_VOLUME_DISK" image="$DEFAULT_IMAGE"
  local cloud_type="SECURE"
  local opt val

  # A leading "-" means the name was omitted and an option came first.
  if [[ -z "$name" || "$name" == -* ]]; then
    die "Usage: $0 create <name> [--gpu <type>] [--count <n>] [--disk <gb>] [--volume <gb>] [--image <image>] [--community]"
  fi
  # The name becomes a file name under POD_DIR.
  [[ "$name" != */* ]] || die "Error: pod name must not contain '/'"
  shift

  while (($# > 0)); do
    case "$1" in
      --gpu | --count | --disk | --volume | --image)
        # Without this check a missing value dies with "unbound variable".
        (($# >= 2)) || die "Error: option $1 requires a value"
        opt="$1"
        val="$2"
        shift 2
        case "$opt" in
          --gpu) gpu="$(gpu_shortcut "$val")" ;;
          --image) image="$val" ;;
          *)
            [[ "$val" =~ ^[0-9]+$ ]] \
              || die "Error: $opt expects a non-negative integer, got '$val'"
            case "$opt" in
              --count) gpu_count="$val" ;;
              --disk) container_disk="$val" ;;
              --volume) volume_disk="$val" ;;
            esac
            ;;
        esac
        ;;
      --community)
        cloud_type="COMMUNITY"
        shift
        ;;
      *) die "Unknown option: $1" ;;
    esac
  done

  echo "Creating pod '$name'..."
  echo "  GPU: ${gpu_count}x $gpu"
  echo "  Cloud: $cloud_type"
  echo "  Container disk: ${container_disk}GB"
  echo "  Volume disk: ${volume_disk}GB"
  echo "  Image: $image"
  echo ""

  local output
  # Capture inside `if` so a failing runpodctl does not trip `set -e` before
  # its (captured) error text has been shown.
  if ! output="$(runpodctl pod create \
    --name "pawn-$name" \
    --gpu-id "$gpu" \
    --gpu-count "$gpu_count" \
    --image "$image" \
    --container-disk-in-gb "$container_disk" \
    --volume-in-gb "$volume_disk" \
    --cloud-type "$cloud_type" \
    2>&1)"; then
    printf '%s\n' "$output" >&2
    die "Error: runpodctl pod create failed"
  fi
  echo "$output"

  local pod_id
  pod_id="$(extract_pod_id "$output")" \
    || die "Error: Could not extract pod ID from output"
  echo "Pod ID: $pod_id"

  wait_for_pod_running "$pod_id" "$name"
}

# start <name>: resume a stopped pod and refresh its cached host/port.
cmd_start() {
  local name="${1:?Usage: $0 start <name>}"
  load_pod_config "$name"
  echo "Starting pod '$name' ($POD_ID)..."
  runpodctl pod start "$POD_ID"
  wait_for_pod_running "$POD_ID" "$name"
}

# stop <name>: stop the pod via runpodctl; the cached config is kept.
cmd_stop() {
  local name="${1:?Usage: $0 stop <name>}"
  load_pod_config "$name"
  echo "Stopping pod '$name' ($POD_ID)..."
  runpodctl pod stop "$POD_ID"
  echo "Pod stopped. Volume data preserved. Resume with: $0 start $name"
}

# delete <name>: after interactive confirmation, delete the pod and remove
# its cached config. The config is kept if runpodctl fails (set -e).
cmd_delete() {
  local name="${1:?Usage: $0 delete <name>}"
  local confirm
  load_pod_config "$name"
  # EOF on stdin (non-interactive use) is treated as "no".
  read -r -p "Delete pod '$name' ($POD_ID)? This destroys all data. [y/N] " confirm \
    || confirm=""
  case "${confirm,,}" in
    y | yes) ;;
    *)
      echo "Cancelled."
      exit 0
      ;;
  esac
  runpodctl pod delete "$POD_ID"
  rm -f -- "$POD_DIR/$name.env"
  echo "Pod deleted and config removed."
}

# ssh <name>: open an interactive SSH session as root on the pod.
cmd_ssh() {
  local name="${1:?Usage: $0 ssh <name>}"
  load_pod_config "$name"
  pod_ssh
}

# list: show pods known to runpodctl followed by locally cached configs.
cmd_list() {
  echo "=== Remote pods (runpodctl) ==="
  runpodctl pod list 2>/dev/null || echo "  (runpodctl not configured or no pods)"
  echo ""
  echo "=== Local pod configs ==="
  list_local_pods
}

# gpus: list GPU types via runpodctl.
cmd_gpus() {
  runpodctl gpu list
}

# status <name>: print `runpodctl pod get` output for the pod.
cmd_status() {
  local name="${1:?Usage: $0 status <name>}"
  load_pod_config "$name"
  runpodctl pod get "$POD_ID"
}

# setup <name>: run deploy/setup.sh on the pod.
cmd_setup() {
  local name="${1:?Usage: $0 setup <name>}"
  load_pod_config "$name"
  echo "Running setup on pod '$name'..."
  run_remote_setup
}

# deploy <name>: build the deploy package locally, rsync it to
# /workspace/pawn on the pod, then run setup there.
cmd_deploy() {
  local name="${1:?Usage: $0 deploy <name>}"
  load_pod_config "$name"

  echo "=== Full deploy to '$name' ==="
  echo ""

  # Step 1: Build
  echo "--- Step 1: Build deploy package ---"
  bash "$REPO/deploy/build.sh"
  echo ""

  # Step 2: Transfer
  echo "--- Step 2: Transfer to pod ---"
  # Install rsync on the pod if needed. stderr is silenced to hide apt noise,
  # so a failure is reported explicitly instead of exiting without a message.
  pod_ssh "command -v rsync &>/dev/null || (apt-get update -qq && apt-get install -y -qq rsync)" 2>/dev/null \
    || die "Error: could not verify or install rsync on pod '$name'"
  rsync -avz --progress -e "ssh $(ssh_opts)" \
    "$REPO/deploy/pawn-deploy/" "root@$POD_HOST:/workspace/pawn/"
  echo ""

  # Step 3: Setup
  echo "--- Step 3: Run setup ---"
  run_remote_setup
  echo ""
  echo "=== Deploy complete ==="
}

# launch <name> <command...>: start `uv run python <command> --log-dir logs`
# on the pod under nohup, logging to logs/<script>.log, and report its PID.
# The command words are joined with spaces and re-split by the remote shell,
# so arguments containing spaces need remote-side quoting by the caller.
cmd_launch() {
  local name="${1:?Usage: $0 launch <name> <command...>}"
  shift
  local cmd="$*"

  if [[ -z "$cmd" ]]; then
    echo "Usage: $0 launch <name> <command...>"
    echo ""
    echo "Examples:"
    echo "  $0 launch exp1 scripts/train.py --variant base"
    echo "  $0 launch exp1 scripts/train_bottleneck.py --checkpoint checkpoints/pawn-base.pt \\"
    echo "      --pgn data/lichess_1800_1900.pgn --bottleneck-dim 32"
    exit 1
  fi

  load_pod_config "$name"

  local script_name log_file q_log remote
  script_name="$(launch_script_name "$cmd")"
  log_file="logs/${script_name}.log"
  printf -v q_log '%q' "$log_file"

  echo "Launching on '$name': $cmd"
  # - The log directory is created first so the redirection cannot fail.
  # - stdin comes from /dev/null so the job holds no handle on the ssh channel.
  # - The PID is taken from \$! rather than pgrep, whose -f match can hit the
  #   remote shell itself (its command line contains the script name).
  remote="cd /workspace/pawn && mkdir -p \"\$(dirname -- ${q_log})\" || exit 1
nohup uv run python ${cmd} --log-dir logs > ${q_log} 2>&1 < /dev/null &
pid=\$!
sleep 2
if kill -0 \"\$pid\" 2>/dev/null; then
  echo \"PID: \$pid\"
else
  echo \"Warning: process exited within 2s; check the log\" >&2
fi
echo \"Log: \"${q_log}"
  pod_ssh "$remote"
}

# --- Main ---

# Dispatch the sub-command in $1. No command (or help) prints usage and
# returns 0; an unknown command prints usage to stderr and exits 1.
main() {
  local subcommand="${1:-}"
  if (($# > 0)); then
    shift
  fi
  case "$subcommand" in
    create) cmd_create "$@" ;;
    start) cmd_start "$@" ;;
    stop) cmd_stop "$@" ;;
    delete) cmd_delete "$@" ;;
    ssh) cmd_ssh "$@" ;;
    list) cmd_list "$@" ;;
    gpus) cmd_gpus "$@" ;;
    status) cmd_status "$@" ;;
    setup) cmd_setup "$@" ;;
    deploy) cmd_deploy "$@" ;;
    launch) cmd_launch "$@" ;;
    sync) bash "$REPO/deploy/sync.sh" "$@" ;;
    "" | help | -h | --help) usage ;;
    *)
      echo "Unknown command: $subcommand" >&2
      usage >&2
      exit 1
      ;;
  esac
}

main "$@"