#!/usr/bin/env python
"""Reserve VRAM on a single GPU so the slot stays ours between training runs.
Allocates a contiguous tensor (default ~30 GB on a 32 GB V100) and idles in a
loop, touching the buffer once a minute so the process stays listed in
`nvidia-smi` and the GPU doesn't drop to idle clocks.
Use only when you actively plan to come back to this GPU (e.g. a follow-up
training run is queued behind a checkpoint pull). On a shared cluster, do NOT hold more than
one GPU at a time, and release it (Ctrl-C / kill) the moment you're done.
Two sizing modes:
GPU_HOLD_GB=N hold a fixed N GB (default 30)
GPU_KEEP_FREE_GB=N hold (free_now - N - 0.5) GB (never less than 1 GB), leaving
about N GB plus a ~0.5 GB allocator cushion for a co-tenant training process.
Wins over GPU_HOLD_GB.
GPU_RELEASE_AFTER=S optional: auto-release after S seconds (useful for tests).
Co-tenant pattern — "reserve a 30 GB window, run training inside it":
# On a V100 (~32 GB total). Leave 14 GB free for training, hold the rest:
KEEP_FREE_GB=14 ./scripts/gpu_hold.sh 3
# Then start training on the SAME GPU index in a separate container:
docker run --rm --gpus '"device=3"' --shm-size=8g \
-e HUGGINGFACE_TOKEN -v $(pwd)/lora_v1:/app/lora_v1 \
survivecity-train python -m training.train ...
# Holder ~16 GB + training ~14 GB = ~30 GB used. Others see GPU as full
# and skip past. If training crashes, holder stays up — slot preserved.
# Just docker run training again; it reclaims the 14 GB.
# When you're done debugging: docker rm -f gpu-hold-3
Tuning rules of thumb:
- First crash with "torch OOM on init"? Bump KEEP_FREE_GB up by 2–4 GB.
- Want to be invisible to other users? keep total free < 4 GB.
- V100/32GB Qwen2.5-3B 4-bit + NUM_GENERATIONS=4 + MAX_SEQ=2048 ≈ 12-14 GB.
- V100/32GB Qwen2.5-3B 4-bit + NUM_GENERATIONS=8 + MAX_SEQ=4096 ≈ 18-22 GB.
Usage (on host with torch installed):
CUDA_VISIBLE_DEVICES=3 python scripts/gpu_hold.py
GPU_HOLD_GB=25 CUDA_VISIBLE_DEVICES=3 python scripts/gpu_hold.py
GPU_KEEP_FREE_GB=14 CUDA_VISIBLE_DEVICES=3 python scripts/gpu_hold.py
Usage (Docker, recommended on the DGX — uses the existing image):
./scripts/gpu_hold.sh 3 # hold 30 GB on GPU 3
GB=25 ./scripts/gpu_hold.sh 3 # hold 25 GB
KEEP_FREE_GB=14 ./scripts/gpu_hold.sh 3 # leave 14 GB free for training
Stop:
Ctrl-C if foreground, or `docker rm -f gpu-hold-N` for the docker variant.
"""
from __future__ import annotations
import datetime
import os
import sys
import time
import torch
def _timestamp():
    """Return the current local time as an ISO-8601 string (second precision)."""
    return datetime.datetime.now().isoformat(timespec="seconds")


def _env_float(name, default=None):
    """Read environment variable *name* as a finite float.

    Returns *default* when the variable is unset or blank. Prints a one-line
    error to stderr and exits with status 3 when the value is not a finite
    number, so a typo never surfaces as a raw traceback.
    """
    import math  # local import: this helper is the only user

    raw = os.environ.get(name)
    if raw is None or not raw.strip():
        return default
    try:
        value = float(raw)
    except ValueError:
        value = float("nan")
    if not math.isfinite(value):
        print("ERROR: {0}={1!r} is not a finite number.".format(name, raw), file=sys.stderr)
        sys.exit(3)
    return value


def main():
    """Allocate a block of VRAM on CUDA device 0 and hold it until stopped.

    Sizing is read from the environment:
      GPU_KEEP_FREE_GB  hold free - N - 0.5 GB (at least 1 GB); takes priority.
      GPU_HOLD_GB       hold a fixed number of GB (default 30).
      GPU_RELEASE_AFTER optional number of seconds after which to release.

    All sizes are reported in units of 1024**3 bytes, the same unit used to
    compute the request, so "requested" and "holding" agree.

    Exit status: 1 if CUDA is unavailable, 2 if the allocation fails, 3 if an
    environment variable holds an invalid value. Returns normally on Ctrl-C
    or once GPU_RELEASE_AFTER has elapsed.
    """
    if not torch.cuda.is_available():
        print("ERROR: torch.cuda.is_available() is False β nothing to hold.", file=sys.stderr)
        sys.exit(1)

    gib = 1024 ** 3
    heartbeat_s = 60

    name = torch.cuda.get_device_name(0)
    cap = torch.cuda.get_device_capability(0)
    free_b, total_b = torch.cuda.mem_get_info(0)
    free_gb = free_b / gib

    # Parse every setting up front so a bad value fails before any VRAM is taken.
    keep_free_gb = _env_float("GPU_KEEP_FREE_GB")
    release_after = _env_float("GPU_RELEASE_AFTER")
    if keep_free_gb is not None:
        # Leave keep_free_gb in the pool; grab the rest. Subtract a small
        # cushion (~0.5 GB) so cudaMalloc has room to maneuver.
        gb = max(1.0, free_gb - keep_free_gb - 0.5)
        mode = "keep_free={0:.1f}GB".format(keep_free_gb)
        # GPU_HOLD_GB is ignored in this mode, so point at the knob that matters.
        hint = " try a larger GPU_KEEP_FREE_GB."
    else:
        gb = _env_float("GPU_HOLD_GB", 30.0)
        mode = "fixed"
        hint = " try a smaller GPU_HOLD_GB (e.g. 25 or 20)."
        if gb <= 0:
            print("ERROR: GPU_HOLD_GB must be positive, got {0}.".format(gb), file=sys.stderr)
            sys.exit(3)

    # float32 elements are 4 bytes each.
    elements = int(gb * gib / 4)
    print(
        "[{0}] device={1} cc={2}.{3} free={4:.1f}GB total={5:.1f}GB mode={6} requested={7:.1f}GB pid={8}".format(
            _timestamp(),
            name, cap[0], cap[1],
            free_gb, total_b / gib, mode, gb, os.getpid(),
        )
    )
    sys.stdout.flush()

    try:
        x = torch.empty(elements, dtype=torch.float32, device="cuda:0")
    except RuntimeError as e:
        print("ERROR: allocation failed: {0}".format(e), file=sys.stderr)
        print(hint, file=sys.stderr)
        sys.exit(2)

    x.fill_(0)
    torch.cuda.synchronize()
    free_after, _ = torch.cuda.mem_get_info(0)
    print(
        "[{0}] holding {1:.1f}GB; free now {2:.1f}GB. Touching every 60s.".format(
            _timestamp(),
            x.element_size() * x.nelement() / gib,
            free_after / gib,
        )
    )
    sys.stdout.flush()

    # Monotonic clock: the deadline must not move if the wall clock is adjusted.
    deadline = None
    if release_after is not None:
        deadline = time.monotonic() + release_after

    # Heartbeat loop. add_(0) is a no-op math but keeps the process active
    # against idle reapers and shows up as compute usage in nvidia-smi.
    try:
        while True:
            if deadline is not None and time.monotonic() >= deadline:
                print("[{0}] GPU_RELEASE_AFTER reached, releasing.".format(_timestamp()))
                break
            x.add_(0)
            torch.cuda.synchronize()
            if deadline is None:
                time.sleep(heartbeat_s)
            else:
                # Never sleep past the deadline, so short timeouts are honored.
                time.sleep(min(heartbeat_s, max(0.0, deadline - time.monotonic())))
    except KeyboardInterrupt:
        print("\n[{0}] released.".format(_timestamp()))
# Script entry point: start the holder only when this file is executed
# directly, never as a side effect of importing it.
if __name__ == "__main__":
    main()
|