zombiee / scripts /gpu_hold.sh
EeshanSingh's picture
commit file change
636ec72
#!/usr/bin/env bash
# gpu_hold.sh — reserve VRAM on a single DGX GPU using the existing
# survivecity-train docker image (so we don't need torch installed on the host).
#
# Usage:
# ./scripts/gpu_hold.sh 3 # hold ~30 GB on GPU 3
# GB=25 ./scripts/gpu_hold.sh 3 # custom size
# IMAGE=my-img ./scripts/gpu_hold.sh 3 # custom image (default: survivecity-train)
#
# Stop:
# docker rm -f gpu-hold-3
#
# List all running holders:
# docker ps --filter name=^gpu-hold-
#
# Don't squat more than one GPU on a shared cluster.
# Strict mode: abort on command failure (-e), on use of an unset variable (-u),
# and when any stage of a pipeline fails (pipefail).
set -euo pipefail

# Exactly one positional argument is required: the GPU to hold.
if [[ $# -lt 1 ]]; then
  echo "usage: $0 <gpu_index> (e.g. $0 3)" >&2
  exit 1
fi

GPU="$1"

# Accept only ONE device: a numeric index, or a single GPU-/MIG- UUID.
# The value is spliced into both `--gpus device=...` and the container name, so
# anything else is rejected up front: "all" would grab every GPU (this script
# is meant to hold a single one), and lists or empty strings would only fail
# later with an obscure docker error.
gpu_re='^([0-9]+|(GPU|MIG)-[0-9A-Fa-f-]+)$'
if [[ ! "$GPU" =~ $gpu_re ]]; then
  echo "FATAL: invalid GPU '$GPU': expected a single GPU index (e.g. 3) or a single GPU UUID." >&2
  exit 1
fi

# Tunables, overridable from the environment:
#   GB    - amount of VRAM to hold, in GB (default 30)
#   IMAGE - docker image to run the holder in (default survivecity-train)
GB="${GB:-30}"
IMAGE="${IMAGE:-survivecity-train}"

# One container per GPU; the name doubles as the handle for stopping it.
NAME="gpu-hold-${GPU}"
# Preconditions, checked as guard clauses: each one either passes silently or
# prints a FATAL message on stderr and exits with status 1.

# 1. The docker CLI must be resolvable on PATH.
command -v docker >/dev/null 2>&1 || {
  echo "FATAL: docker not on PATH." >&2
  exit 1
}

# 2. The image must already exist locally; if `docker image inspect` fails,
#    print the build command and stop.
docker image inspect "$IMAGE" >/dev/null 2>&1 || {
  echo "FATAL: docker image '$IMAGE' not found. Build it first:" >&2
  echo " docker build -f Dockerfile.dgx -t $IMAGE ." >&2
  exit 1
}
# Idempotency guard: if a holder with this name is already running, show it
# and exit successfully instead of launching a duplicate.
#
# The name list is captured into a variable rather than piped into `grep -q`:
# under `set -o pipefail`, `grep -q` can exit at the first match and leave
# `docker ps` to die on SIGPIPE, which makes the whole pipeline report failure
# and the script wrongly conclude that nothing is running.
if ! running_names="$(docker ps --filter "name=^${NAME}$" --format '{{.Names}}')"; then
  echo "FATAL: 'docker ps' failed; cannot tell whether $NAME is already running." >&2
  exit 1
fi

# -x: whole-line match; -F: treat $NAME as a literal string, not a regex.
if grep -qxF -- "$NAME" <<<"$running_names"; then
  echo "Already running: $NAME" >&2
  docker ps --filter "name=^${NAME}$"
  exit 0
fi
printf '%s\n' "[gpu_hold] launching $NAME on GPU $GPU (holding ${GB} GB) using image $IMAGE"

# Notes on the docker run flags used for the launch:
#   -d              detached, so the holder survives an SSH disconnect
#   --rm            container is cleaned up on stop (no state worth keeping)
#   -e GPU_HOLD_GB  tells the python script how much to grab
#
# Decide which copy of gpu_hold.py the container runs. Start from the copy
# under /app/scripts, then prefer the gpu_hold.py sitting next to this script
# (located via $0, not the current directory) by bind-mounting it read-only,
# in case the image's copy is stale.
MOUNT_ARGS=()
SCRIPT_PATH="/app/scripts/gpu_hold.py"

script_dir="$(cd "$(dirname "$0")" && pwd)"
host_script="${script_dir}/gpu_hold.py"
if [[ -f "$host_script" ]]; then
  MOUNT_ARGS=(-v "${host_script}:/tmp/gpu_hold.py:ro")
  SCRIPT_PATH="/tmp/gpu_hold.py"
fi
# Start the holder container in the background.
#   -d / --rm  detached; the container is removed automatically once it stops
#   --name     fixed name, used later to find and stop the holder
#   --gpus     expose only the requested device (the embedded double quotes
#              are part of the value handed to docker)
#   -e         pass the requested size to the script via GPU_HOLD_GB
# The container ID that docker prints on success is discarded.
#
# ${MOUNT_ARGS[@]+"${MOUNT_ARGS[@]}"} expands to nothing when the array is
# empty. A plain "${MOUNT_ARGS[@]}" aborts with "unbound variable" under
# `set -u` on bash older than 4.4 whenever no bind mount was configured.
docker run -d --rm \
  --name "$NAME" \
  --gpus "\"device=${GPU}\"" \
  -e GPU_HOLD_GB="$GB" \
  ${MOUNT_ARGS[@]+"${MOUNT_ARGS[@]}"} \
  "$IMAGE" \
  python "$SCRIPT_PATH" >/dev/null
# Short pause before inspecting the container (presumably to let the python
# script start up and allocate; the 4 s figure is not derived from anything).
sleep 4

# The container was started with --rm, so if it has already exited it no
# longer exists and `docker logs` would abort the script with an unexplained
# "No such container". Check liveness first and fail with a clear message.
if [[ -z "$(docker ps -q --filter "name=^${NAME}$")" ]]; then
  echo "FATAL: $NAME is not running; it exited shortly after launch." >&2
  echo "       It was started with --rm, so its logs are gone. Re-run the container in the foreground (without -d) to see the error." >&2
  exit 1
fi

# Show the last few log lines, then the command for releasing the GPU.
echo
docker logs "$NAME" 2>&1 | tail -5
echo
echo "[gpu_hold] running. Stop with: docker rm -f $NAME"