Spaces:

noanya
/

zombiee

Running

App Files Files Community

zombiee / scripts /gpu_hold.py

EeshanSingh

commit file change

636ec72 23 days ago

raw

history blame contribute delete

5.31 kB

	#!/usr/bin/env python
	"""Reserve VRAM on a single GPU so the slot stays ours between training runs.

	Allocates a contiguous tensor (default ~30 GB on a 32 GB V100) and idles in a
	loop, touching the buffer once a minute so the process stays listed in
	`nvidia-smi` and the GPU doesn't drop to idle clocks.

	Use only when you actively plan to come back to this GPU (e.g. a follow-up
	training run is queued behind a checkpoint pull). On a shared cluster, do NOT hold more than
	one GPU at a time, and release it (Ctrl-C / kill) the moment you're done.

	Two sizing modes:

	GPU_HOLD_GB=N hold a fixed N GB (default 30)
	GPU_KEEP_FREE_GB=N hold (free_now - N - 0.5) GB, but never less than 1 GB,
	leaving roughly N GB (plus a ~0.5 GB allocator cushion) for a
	co-tenant training process. Wins over GPU_HOLD_GB.
	GPU_RELEASE_AFTER=S optional: auto-release after S seconds (useful for tests).

	Co-tenant pattern — "reserve a 30 GB window, run training inside it":

	# On a V100 (~32 GB total). Leave 14 GB free for training, hold the rest:
	KEEP_FREE_GB=14 ./scripts/gpu_hold.sh 3
	# Then start training on the SAME GPU index in a separate container:
	docker run --rm --gpus '"device=3"' --shm-size=8g \
	-e HUGGINGFACE_TOKEN -v $(pwd)/lora_v1:/app/lora_v1 \
	survivecity-train python -m training.train ...
	# Holder ~16 GB + training ~14 GB = ~30 GB used. Others see GPU as full
	# and skip past. If training crashes, holder stays up → slot preserved.
	# Just docker run training again; it reclaims the 14 GB.
	# When you're done debugging: docker rm -f gpu-hold-3

	Tuning rules of thumb:

	- First crash with "torch OOM on init"? bump KEEP_FREE_GB up by 2–4 GB.
	- Want to be invisible to other users? keep total free < 4 GB.
	- V100/32GB Qwen2.5-3B 4-bit + NUM_GENERATIONS=4 + MAX_SEQ=2048 ≈ 12-14 GB.
	- V100/32GB Qwen2.5-3B 4-bit + NUM_GENERATIONS=8 + MAX_SEQ=4096 ≈ 18-22 GB.

	Usage (on host with torch installed):
	CUDA_VISIBLE_DEVICES=3 python scripts/gpu_hold.py
	GPU_HOLD_GB=25 CUDA_VISIBLE_DEVICES=3 python scripts/gpu_hold.py
	GPU_KEEP_FREE_GB=14 CUDA_VISIBLE_DEVICES=3 python scripts/gpu_hold.py

	Usage (Docker, recommended on the DGX — uses the existing image):
	./scripts/gpu_hold.sh 3 # hold 30 GB on GPU 3
	GB=25 ./scripts/gpu_hold.sh 3 # hold 25 GB
	KEEP_FREE_GB=14 ./scripts/gpu_hold.sh 3 # leave 14 GB free for training

	Stop:
	Ctrl-C if foreground, or `docker rm -f gpu-hold-N` for the docker variant.
	"""

	from __future__ import annotations

	import datetime
	import os
	import sys
	import time

	import torch


	def main():
	    """Reserve memory on CUDA device 0 and idle until interrupted or timed out.

	    Device 0 is the first *visible* device, so the physical GPU is chosen
	    with CUDA_VISIBLE_DEVICES (or the container's --gpus setting).

	    Environment variables:
	        GPU_KEEP_FREE_GB: if set and non-blank, hold
	            max(1, free_now - N - 0.5) and ignore GPU_HOLD_GB.
	        GPU_HOLD_GB: fixed amount to hold; unset or blank means 30.
	        GPU_RELEASE_AFTER: if set and non-blank, release after this many
	            seconds instead of holding until Ctrl-C.

	    All sizes, both parsed and printed, use 1024**3 bytes per "GB" so the
	    numbers in the two status lines are directly comparable.

	    Exits with status 1 when CUDA is unavailable and status 2 when the
	    allocation fails. Non-numeric environment values raise ValueError.
	    """
	    if not torch.cuda.is_available():
	        print("ERROR: torch.cuda.is_available() is False — nothing to hold.", file=sys.stderr)
	        sys.exit(1)

	    bytes_per_gb = 1024 ** 3
	    heartbeat_s = 60.0

	    def now():
	        return datetime.datetime.now().isoformat(timespec="seconds")

	    name = torch.cuda.get_device_name(0)
	    cap = torch.cuda.get_device_capability(0)
	    free_b, total_b = torch.cuda.mem_get_info(0)
	    free_gb = free_b / bytes_per_gb

	    keep_free = os.environ.get("GPU_KEEP_FREE_GB", "").strip()
	    if keep_free:
	        keep_free_gb = float(keep_free)
	        # Leave keep_free_gb in the pool; grab the rest. Subtract a small
	        # cushion (~0.5 GB) so cudaMalloc has room to maneuver.
	        gb = max(1.0, free_gb - keep_free_gb - 0.5)
	        mode = "keep_free={0:.1f}GB".format(keep_free_gb)
	        # GPU_HOLD_GB is ignored in this mode, so point at the knob that matters.
	        failure_hint = " try a larger GPU_KEEP_FREE_GB."
	    else:
	        # A blank value is treated as unset, matching how the other two
	        # variables are handled, instead of crashing in float("").
	        hold = os.environ.get("GPU_HOLD_GB", "").strip()
	        gb = float(hold) if hold else 30.0
	        mode = "fixed"
	        failure_hint = " try a smaller GPU_HOLD_GB (e.g. 25 or 20)."

	    # float32 elements are 4 bytes each.
	    elements = int(gb * bytes_per_gb / 4)
	    print(
	        "[{0}] device={1} cc={2}.{3} free={4:.1f}GB total={5:.1f}GB mode={6} requested={7:.1f}GB pid={8}".format(
	            now(),
	            name, cap[0], cap[1],
	            free_gb, total_b / bytes_per_gb, mode, gb, os.getpid(),
	        ),
	        flush=True,
	    )

	    try:
	        x = torch.empty(elements, dtype=torch.float32, device="cuda:0")
	    except RuntimeError as e:
	        print("ERROR: allocation failed: {0}".format(e), file=sys.stderr)
	        print(failure_hint, file=sys.stderr)
	        sys.exit(2)
	    # Write to the buffer once so the memory is actually touched, then wait
	    # for the fill to finish before measuring free memory.
	    x.fill_(0)
	    torch.cuda.synchronize()

	    free_after, _ = torch.cuda.mem_get_info(0)
	    print(
	        "[{0}] holding {1:.1f}GB; free now {2:.1f}GB. Touching every 60s.".format(
	            now(),
	            # Same divisor as the first status line; the original mixed in
	            # 1e9 here, so "requested" and "holding" disagreed by ~7%.
	            x.element_size() * x.nelement() / bytes_per_gb,
	            free_after / bytes_per_gb,
	        ),
	        flush=True,
	    )

	    release_after = os.environ.get("GPU_RELEASE_AFTER", "").strip()
	    deadline = None
	    if release_after:
	        # Monotonic clock: immune to wall-clock adjustments while holding.
	        deadline = time.monotonic() + float(release_after)

	    # Heartbeat loop. add_(0) is a no-op math but keeps the process active
	    # against idle reapers and shows up as compute usage in nvidia-smi.
	    try:
	        while True:
	            if deadline is not None and time.monotonic() >= deadline:
	                print("[{0}] GPU_RELEASE_AFTER reached, releasing.".format(now()))
	                break
	            x.add_(0)
	            torch.cuda.synchronize()
	            nap = heartbeat_s
	            if deadline is not None:
	                # Never sleep past the deadline; otherwise a short
	                # GPU_RELEASE_AFTER would still hold for a full heartbeat.
	                nap = max(0.0, min(heartbeat_s, deadline - time.monotonic()))
	            time.sleep(nap)
	    except KeyboardInterrupt:
	        print("\n[{0}] released.".format(now()))


	# Run the holder only when executed as a script, not when imported.
	if __name__ == "__main__":
	    main()