#!/usr/bin/env python3
"""Keep GPUs at high power draw AND filled memory.

Examples:
    python target_all.py                         # all visible GPUs, fp16, fill memory
    python target_all.py --gpus 4 5 6 7          # specific GPUs
    python target_all.py --gpus 4 5 --mem-gb 60  # claim only 60 GB per GPU
    python target_all.py --dtype bf16            # use bf16 (recommended on A100/H100)
    python target_all.py --no-ballast            # burn compute only, leave memory alone
"""
import argparse
import threading
import time

import torch

DTYPES = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
BYTES = {torch.float16: 2, torch.bfloat16: 2, torch.float32: 4}


def parse_args():
    p = argparse.ArgumentParser()
    p.add_argument("--gpus", type=int, nargs="+", default=None,
                   help="GPU IDs; defaults to all visible GPUs")
    p.add_argument("--mem-frac", type=float, default=0.9,
                   help="fraction of free memory to claim as ballast (default 0.9)")
    p.add_argument("--mem-gb", type=float, default=None,
                   help="override mem-frac: claim a fixed N GB per GPU")
    p.add_argument("--no-ballast", action="store_true",
                   help="burn compute only, do not claim memory")
    p.add_argument("--dtype", choices=list(DTYPES), default="fp16",
                   help="matmul dtype; fp16/bf16 use the tensor cores and draw the most power")
    p.add_argument("--matmul-size", type=int, default=8192,
                   help="side length of the hot-loop matrices (default 8192)")
    p.add_argument("--sync-every", type=int, default=64,
                   help="synchronize every N matmuls so the kernel queue does not blow up memory")
    return p.parse_args()


def worker(gpu: int, args):
    torch.cuda.set_device(gpu)
    dtype = DTYPES[args.dtype]
    n = args.matmul_size

    # 1) Allocate the hot matmul tensors first (these must fit).
    A = torch.randn(n, n, device=gpu, dtype=dtype)
    B = torch.randn(n, n, device=gpu, dtype=dtype)
    C = torch.empty_like(A)

    # 2) Ballast: fill the remaining free memory.
    ballast = None
    if not args.no_ballast:
        free_now, _ = torch.cuda.mem_get_info(gpu)
        if args.mem_gb is not None:
            target = min(int(args.mem_gb * 1024 ** 3), int(free_now * 0.95))
        else:
            target = int(free_now * args.mem_frac)
        target = max(0, target - 256 * 1024 * 1024)  # keep a 256 MB margin
        if target > 0:
            ballast = torch.empty(target // 4, dtype=torch.float32, device=gpu)

    free_after, total = torch.cuda.mem_get_info(gpu)
    used_gb = (total - free_after) / 1024 ** 3
    total_gb = total / 1024 ** 3
    print(f"[GPU {gpu}] dtype={args.dtype} matmul={n}x{n} "
          f"mem={used_gb:.1f}/{total_gb:.1f} GB — burning...")

    # 3) Hot loop: fire matmuls back to back, syncing periodically so
    #    queued kernels do not pile up and exhaust memory.
    step = 0
    while True:
        torch.matmul(A, B, out=C)
        step += 1
        if step % args.sync_every == 0:
            torch.cuda.synchronize(gpu)


def main():
    args = parse_args()
    if args.gpus is None:
        args.gpus = list(range(torch.cuda.device_count()))
    if not args.gpus:
        raise SystemExit("No CUDA GPUs available")

    # Let cuBLAS pick its fastest algorithms.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True

    print(f"Target GPUs: {args.gpus}")
    for g in args.gpus:
        threading.Thread(target=worker, args=(g, args), daemon=True).start()

    try:
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        print("\nStopped.")


if __name__ == "__main__":
    main()