#!/usr/bin/env python
"""Reserve VRAM on a single GPU so the slot stays ours between training runs.

Allocates a contiguous tensor (default ~30 GB on a 32 GB V100) and idles in a
loop, touching the buffer once a minute so the process stays listed in
`nvidia-smi` and the GPU doesn't drop to idle clocks.

Use only when you actively plan to come back to this GPU (e.g. a follow-up
training run is queued behind a checkpoint pull). On a shared cluster, do NOT hold more than
one GPU at a time, and release it (Ctrl-C / kill) the moment you're done.

Environment variables (two sizing modes plus an optional timeout):

  GPU_HOLD_GB=N        hold a fixed N GB (default 30)
  GPU_KEEP_FREE_GB=N   hold (free_now - N - 0.5) GB, leaving roughly N GB (plus a
                       ~0.5 GB cushion) for a co-tenant training process. Never
                       holds less than 1 GB. Wins over GPU_HOLD_GB.
  GPU_RELEASE_AFTER=S  optional: auto-release after S seconds (useful for tests).

Co-tenant pattern — "reserve a 30 GB window, run training inside it":

    # On a V100 (~32 GB total). Leave 14 GB free for training, hold the rest:
    KEEP_FREE_GB=14 ./scripts/gpu_hold.sh 3
    # Then start training on the SAME GPU index in a separate container:
    docker run --rm --gpus '"device=3"' --shm-size=8g \
        -e HUGGINGFACE_TOKEN -v $(pwd)/lora_v1:/app/lora_v1 \
        survivecity-train python -m training.train ...
    # Holder ~16 GB + training ~14 GB = ~30 GB used. Others see GPU as full
    # and skip past. If training crashes, holder stays up → slot preserved.
    # Just docker run training again; it reclaims the 14 GB.
    # When you're done debugging: docker rm -f gpu-hold-3

Tuning rules of thumb:

  - First crash with "torch OOM on init"?  bump KEEP_FREE_GB up by 2–4 GB.
  - Want to be invisible to other users?   keep total free < 4 GB.
  - V100/32GB Qwen2.5-3B 4-bit + NUM_GENERATIONS=4 + MAX_SEQ=2048 ≈ 12-14 GB.
  - V100/32GB Qwen2.5-3B 4-bit + NUM_GENERATIONS=8 + MAX_SEQ=4096 ≈ 18-22 GB.

Usage (on host with torch installed):
    CUDA_VISIBLE_DEVICES=3 python scripts/gpu_hold.py
    GPU_HOLD_GB=25 CUDA_VISIBLE_DEVICES=3 python scripts/gpu_hold.py
    GPU_KEEP_FREE_GB=14 CUDA_VISIBLE_DEVICES=3 python scripts/gpu_hold.py

Usage (Docker, recommended on the DGX — uses the existing image):
    ./scripts/gpu_hold.sh 3                   # hold 30 GB on GPU 3
    GB=25 ./scripts/gpu_hold.sh 3             # hold 25 GB
    KEEP_FREE_GB=14 ./scripts/gpu_hold.sh 3   # leave 14 GB free for training

Stop:
    Ctrl-C if foreground, or `docker rm -f gpu-hold-N` for the docker variant.
"""

from __future__ import annotations

import datetime
import os
import sys
import time

import torch


# All sizes in this script are binary gigabytes (GiB), printed as "GB".
_GIB = 1024 ** 3
# Seconds between heartbeat touches of the held buffer.
_HEARTBEAT_SECONDS = 60.0


def _now():
    """Return the current local time as an ISO-8601 string with seconds precision."""
    return datetime.datetime.now().isoformat(timespec="seconds")


def _env_number(var_name):
    """Read an optional non-negative, finite number from the environment.

    Returns None when the variable is unset or blank. Prints an error to
    stderr and exits with status 1 when the value is not a usable number.
    """
    raw = os.environ.get(var_name)
    if raw is None or not raw.strip():
        return None
    try:
        value = float(raw)
    except ValueError:
        value = float("nan")
    # NaN fails every comparison, so this single test rejects NaN, negative
    # values and +/-inf.
    if not 0.0 <= value < float("inf"):
        print(
            "ERROR: {0}={1!r} is not a non-negative number.".format(var_name, raw),
            file=sys.stderr,
        )
        sys.exit(1)
    return value


def main() -> None:
    """Allocate a large buffer on cuda:0 and keep it alive until told to stop.

    Environment variables:
        GPU_KEEP_FREE_GB: hold (free - N - 0.5) GB, but never less than 1 GB.
            Takes precedence over GPU_HOLD_GB.
        GPU_HOLD_GB: hold a fixed number of GB (default 30).
        GPU_RELEASE_AFTER: release after this many seconds; unset means hold
            until interrupted.

    Exit status:
        1 if CUDA is unavailable or an environment value is malformed,
        2 if the allocation fails.
    """
    if not torch.cuda.is_available():
        print("ERROR: torch.cuda.is_available() is False — nothing to hold.", file=sys.stderr)
        sys.exit(1)

    # Validate every setting before grabbing memory so a typo fails fast
    # instead of after a multi-GB allocation.
    keep_free_gb = _env_number("GPU_KEEP_FREE_GB")
    fixed_gb = _env_number("GPU_HOLD_GB")
    release_after = _env_number("GPU_RELEASE_AFTER")

    name = torch.cuda.get_device_name(0)
    cap = torch.cuda.get_device_capability(0)
    free_b, total_b = torch.cuda.mem_get_info(0)
    free_gb = free_b / _GIB

    if keep_free_gb is not None:
        # Leave keep_free_gb in the pool; grab the rest. Subtract a small
        # cushion (~0.5 GB) so cudaMalloc has room to maneuver.
        wanted_gb = free_gb - keep_free_gb - 0.5
        if wanted_gb < 1.0:
            print(
                "WARNING: only {0:.1f}GB free; holding the 1.0GB minimum leaves less "
                "than the requested GPU_KEEP_FREE_GB={1:.1f}GB.".format(free_gb, keep_free_gb),
                file=sys.stderr,
            )
        gb = max(1.0, wanted_gb)
        mode = "keep_free={0:.1f}GB".format(keep_free_gb)
        retry_hint = "       try a larger GPU_KEEP_FREE_GB, or free up memory on this GPU."
    else:
        gb = 30.0 if fixed_gb is None else fixed_gb
        mode = "fixed"
        retry_hint = "       try a smaller GPU_HOLD_GB (e.g. 25 or 20)."

    # float32 elements are 4 bytes each.
    elements = int(gb * _GIB / 4)
    print(
        f"[{_now()}] device={name} cc={cap[0]}.{cap[1]} "
        f"free={free_gb:.1f}GB total={total_b / _GIB:.1f}GB "
        f"mode={mode} requested={gb:.1f}GB pid={os.getpid()}",
        flush=True,
    )

    try:
        x = torch.empty(elements, dtype=torch.float32, device="cuda:0")
    except RuntimeError as e:
        print("ERROR: allocation failed: {0}".format(e), file=sys.stderr)
        print(retry_hint, file=sys.stderr)
        sys.exit(2)
    x.fill_(0)
    torch.cuda.synchronize()

    free_after, _ = torch.cuda.mem_get_info(0)
    held_gb = x.element_size() * x.nelement() / _GIB
    # Report in the same unit (GiB) as the startup line above.
    print(
        f"[{_now()}] holding {held_gb:.1f}GB; free now {free_after / _GIB:.1f}GB. "
        "Touching every 60s.",
        flush=True,
    )

    # The release timer starts once the buffer is held. A monotonic clock is
    # immune to wall-clock adjustments.
    deadline = None if release_after is None else time.monotonic() + release_after

    # Heartbeat loop: add_(0) leaves the data unchanged but issues a kernel on
    # the buffer once per interval.
    try:
        while True:
            nap = _HEARTBEAT_SECONDS
            if deadline is not None:
                remaining = deadline - time.monotonic()
                if remaining <= 0:
                    print(f"[{_now()}] GPU_RELEASE_AFTER reached, releasing.")
                    break
                # Never sleep past the deadline, so short timeouts are honored.
                nap = min(nap, remaining)
            x.add_(0)
            torch.cuda.synchronize()
            time.sleep(nap)
    except KeyboardInterrupt:
        print(f"\n[{_now()}] released.")

# Run the holder only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()