#!/usr/bin/env python3
"""Keep GPUs at high power draw AND filled memory.

Examples:
    python target_all.py                         # all visible GPUs, fp16, fill memory
    python target_all.py --gpus 4 5 6 7          # specific GPUs
    python target_all.py --gpus 4 5 --mem-gb 60  # claim only 60 GB per GPU
    python target_all.py --dtype bf16            # use bf16 (recommended on A100/H100)
    python target_all.py --no-ballast            # burn compute only, leave memory alone
"""
import argparse
import threading
import time

import torch

DTYPES = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
BYTES = {torch.float16: 2, torch.bfloat16: 2, torch.float32: 4}


def parse_args():
    p = argparse.ArgumentParser()
    p.add_argument("--gpus", type=int, nargs="+", default=None,
                   help="GPU IDs; defaults to all visible GPUs")
    p.add_argument("--mem-frac", type=float, default=0.9,
                   help="fraction of free memory to claim as ballast (default 0.9)")
    p.add_argument("--mem-gb", type=float, default=None,
                   help="override mem-frac: claim a fixed N GB per GPU")
    p.add_argument("--no-ballast", action="store_true",
                   help="burn compute only, do not claim memory")
    p.add_argument("--dtype", choices=list(DTYPES), default="fp16",
                   help="matmul dtype; fp16/bf16 use the tensor cores and draw the most power")
    p.add_argument("--matmul-size", type=int, default=8192,
                   help="side length of the hot-loop matrices (default 8192)")
    p.add_argument("--sync-every", type=int, default=64,
                   help="synchronize every N matmuls so the kernel queue does not blow up memory")
    return p.parse_args()


def worker(gpu: int, args):
    torch.cuda.set_device(gpu)
    dtype = DTYPES[args.dtype]
    n = args.matmul_size

    # 1) Allocate the hot matmul tensors first (these must fit).
    A = torch.randn(n, n, device=gpu, dtype=dtype)
    B = torch.randn(n, n, device=gpu, dtype=dtype)
    C = torch.empty_like(A)

    # 2) Ballast: fill the remaining free memory.
    ballast = None
    if not args.no_ballast:
        free_now, _ = torch.cuda.mem_get_info(gpu)
        if args.mem_gb is not None:
            target = min(int(args.mem_gb * 1024 ** 3), int(free_now * 0.95))
        else:
            target = int(free_now * args.mem_frac)
        target = max(0, target - 256 * 1024 * 1024)  # keep a 256 MB margin
        if target > 0:
            ballast = torch.empty(target // 4, dtype=torch.float32, device=gpu)

    free_after, total = torch.cuda.mem_get_info(gpu)
    used_gb = (total - free_after) / 1024 ** 3
    total_gb = total / 1024 ** 3
    print(f"[GPU {gpu}] dtype={args.dtype} matmul={n}x{n} "
          f"mem={used_gb:.1f}/{total_gb:.1f} GB — burning...")

    # 3) Hot loop: fire matmuls back to back, syncing periodically so
    #    queued kernels do not pile up and exhaust memory.
    step = 0
    while True:
        torch.matmul(A, B, out=C)
        step += 1
        if step % args.sync_every == 0:
            torch.cuda.synchronize(gpu)


def main():
    args = parse_args()
    if args.gpus is None:
        args.gpus = list(range(torch.cuda.device_count()))
    if not args.gpus:
        raise SystemExit("No CUDA GPUs available")

    # Let cuBLAS pick its fastest algorithms.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True

    print(f"Target GPUs: {args.gpus}")
    for g in args.gpus:
        threading.Thread(target=worker, args=(g, args), daemon=True).start()

    try:
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        print("\nStopped.")


if __name__ == "__main__":
    main()