#!/usr/bin/env bash
#
# Launch a detached Docker container that runs gpu_hold.py pinned to one GPU.
#
# Usage:
#   <this script> <gpu_index>          (e.g. <this script> 3)
#
# Environment overrides:
#   GB     Value exported into the container as GPU_HOLD_GB (default: 30).
#          Presumably the number of gigabytes gpu_hold.py should hold; the
#          interpretation is up to gpu_hold.py -- confirm there.
#   IMAGE  Docker image to run (default: survivecity-train).
#
# Behaviour:
#   * The container is named gpu-hold-<gpu_index>.
#   * If a container with that name is already running, it is listed and the
#     script exits 0 without starting another one.
#   * The container is started with --rm, so it disappears as soon as it
#     exits. Stop it with: docker rm -f gpu-hold-<gpu_index>

set -euo pipefail

# Succeeds iff a *running* container is named exactly "$1".
# Docker's name filter is a regex/substring match, so the result is
# re-checked with an exact, fixed-string, whole-line grep.
container_running() {
  docker ps --filter "name=^${1}\$" --format '{{.Names}}' | grep -qxF -- "$1"
}

if [[ $# -lt 1 ]]; then
  echo "usage: $0 <gpu_index> (e.g. $0 3)" >&2
  exit 1
fi

GPU="$1"
GB="${GB:-30}"
IMAGE="${IMAGE:-survivecity-train}"
NAME="gpu-hold-${GPU}"

# GPU ends up inside the container name, inside the `docker ps` name regex and
# inside the --gpus value. Restrict it to the characters Docker accepts in a
# container name so that a bad value fails here with a clear message instead
# of deep inside `docker run`.
if [[ ! "$GPU" =~ ^[A-Za-z0-9_.-]+$ ]]; then
  echo "FATAL: invalid gpu index '$GPU' (allowed characters: letters, digits, '_', '.', '-')." >&2
  exit 1
fi

if ! command -v docker >/dev/null 2>&1; then
  echo "FATAL: docker not on PATH." >&2
  exit 1
fi

if ! docker image inspect "$IMAGE" >/dev/null 2>&1; then
  echo "FATAL: docker image '$IMAGE' not found. Build it first:" >&2
  echo " docker build -f Dockerfile.dgx -t $IMAGE ." >&2
  exit 1
fi

# Idempotency: do nothing if the holder for this GPU is already up.
if container_running "$NAME"; then
  echo "Already running: $NAME" >&2
  docker ps --filter "name=^${NAME}\$"
  exit 0
fi

echo "[gpu_hold] launching $NAME on GPU $GPU (holding ${GB} GB) using image $IMAGE"

# Prefer the gpu_hold.py that sits next to this script: bind-mount it
# read-only at /tmp/gpu_hold.py inside the container. If there is no such
# file on the host, fall back to /app/scripts/gpu_hold.py, which is then
# expected to exist inside the image.
# (cd output is discarded in case CDPATH makes `cd` print the directory.)
SCRIPT_HOST="$(cd -- "$(dirname -- "$0")" >/dev/null && pwd)/gpu_hold.py"
if [[ -f "$SCRIPT_HOST" ]]; then
  MOUNT_ARGS=(-v "${SCRIPT_HOST}:/tmp/gpu_hold.py:ro")
  SCRIPT_PATH="/tmp/gpu_hold.py"
else
  MOUNT_ARGS=()
  SCRIPT_PATH="/app/scripts/gpu_hold.py"
fi

# Notes:
#   * The --gpus value deliberately carries literal double quotes around
#     device=<gpu>, matching the quoting form shown in Docker's documentation.
#   * ${MOUNT_ARGS[@]+...} expands to nothing for an empty array; a plain
#     "${MOUNT_ARGS[@]}" trips `set -u` on Bash older than 4.4.
#   * stdout (the new container id) is discarded; errors still reach stderr.
docker run -d --rm \
  --name "$NAME" \
  --gpus "\"device=${GPU}\"" \
  -e GPU_HOLD_GB="$GB" \
  ${MOUNT_ARGS[@]+"${MOUNT_ARGS[@]}"} \
  "$IMAGE" \
  python "$SCRIPT_PATH" >/dev/null

# Give the container a moment to start (or to fail) before reporting.
sleep 4

# Because of --rm, a container that already exited has been removed together
# with its logs; report that explicitly instead of letting `docker logs` fail.
if ! container_running "$NAME"; then
  echo "FATAL: container $NAME is not running after launch." >&2
  echo "       It was started with --rm, so its logs are gone; re-run the image" >&2
  echo "       in the foreground (without -d) to see why it exited." >&2
  exit 1
fi

echo
# Best effort: the log tail is informational only, so a container that exits
# between the check above and this call must not abort the script.
docker logs "$NAME" 2>&1 | tail -n 5 || true
echo
echo "[gpu_hold] running. Stop with: docker rm -f $NAME"