"""Keep GPUs at high power draw AND filled memory.

Examples:
    python target_all.py                          # all visible GPUs, fp16, fill memory
    python target_all.py --gpus 4 5 6 7           # specific GPUs
    python target_all.py --gpus 4 5 --mem-gb 60   # claim only 60 GB per GPU
    python target_all.py --dtype bf16             # use bf16 (recommended on A100/H100)
    python target_all.py --no-ballast             # burn compute only, leave memory alone
"""
import argparse
import threading
import time

import torch

DTYPES = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
BYTES = {torch.float16: 2, torch.bfloat16: 2, torch.float32: 4}
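# BYTES maps each dtype to its element size in bytes. The ballast below is
# allocated as float32, so the code divides by 4 directly; if you wanted the
# ballast to share the matmul dtype instead, you would size it as
# target // BYTES[dtype].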
|
|
|
|
def parse_args():
    p = argparse.ArgumentParser()
    p.add_argument("--gpus", type=int, nargs="+", default=None,
                   help="GPU IDs; defaults to all visible GPUs")
    p.add_argument("--mem-frac", type=float, default=0.9,
                   help="fraction of free memory to claim as ballast (default 0.9)")
    p.add_argument("--mem-gb", type=float, default=None,
                   help="override mem-frac: claim a fixed N GB per GPU")
    p.add_argument("--no-ballast", action="store_true",
                   help="burn compute only, do not claim memory")
    p.add_argument("--dtype", choices=list(DTYPES), default="fp16",
                   help="matmul dtype; fp16/bf16 use the tensor cores and "
                        "draw the most power")
    p.add_argument("--matmul-size", type=int, default=8192,
                   help="side length of the hot-loop matrices (default 8192)")
    p.add_argument("--sync-every", type=int, default=64,
                   help="synchronize every N matmuls so the launch queue "
                        "does not blow up device memory")
    return p.parse_args()
|
|
|
|
def worker(gpu: int, args):
    torch.cuda.set_device(gpu)
    dtype = DTYPES[args.dtype]
    n = args.matmul_size

    # Hot tensors for the compute loop; allocated once, reused forever.
    A = torch.randn(n, n, device=gpu, dtype=dtype)
    B = torch.randn(n, n, device=gpu, dtype=dtype)
    C = torch.empty_like(A)
|
|
    # Ballast: one big allocation that pins the target amount of device memory.
    ballast = None
    if not args.no_ballast:
        free_now, _ = torch.cuda.mem_get_info(gpu)
        if args.mem_gb is not None:
            target = min(int(args.mem_gb * 1024 ** 3), int(free_now * 0.95))
        else:
            target = int(free_now * args.mem_frac)
        target = max(0, target - 256 * 1024 * 1024)  # leave ~256 MB headroom
        if target > 0:
            # float32 elements are 4 bytes each.
            ballast = torch.empty(target // 4, dtype=torch.float32, device=gpu)
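        # The 256 MB subtracted above is headroom, not an exact budget: cuBLAS
        # may grab workspace memory for the big matmuls after the ballast is
        # in place, and with zero slack that first call can OOM.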
|
|
    free_after, total = torch.cuda.mem_get_info(gpu)
    used_gb = (total - free_after) / 1024 ** 3
    total_gb = total / 1024 ** 3
    print(f"[GPU {gpu}] dtype={args.dtype} matmul={n}x{n} "
          f"mem={used_gb:.1f}/{total_gb:.1f} GB — burning...")
|
|
    # Hot loop: back-to-back matmuls into a preallocated output. Launches are
    # asynchronous, so synchronize every sync_every steps to keep the queue
    # bounded.
    step = 0
    while True:
        torch.matmul(A, B, out=C)
        step += 1
        if step % args.sync_every == 0:
            torch.cuda.synchronize(gpu)
|
|
|
|
def main():
    args = parse_args()
    if args.gpus is None:
        args.gpus = list(range(torch.cuda.device_count()))
    if not args.gpus:
        raise SystemExit("No CUDA GPUs available")
|
|
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True
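    # allow_tf32 only matters with --dtype fp32: it lets those matmuls run on
    # TF32 tensor cores (Ampere and newer). cudnn.benchmark has no effect on
    # the matmul loop itself, which goes through cuBLAS rather than cuDNN.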
|
|
    print(f"Target GPUs: {args.gpus}")
    for g in args.gpus:
        threading.Thread(target=worker, args=(g, args), daemon=True).start()
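    # Plain threads are enough: each worker mostly enqueues asynchronous CUDA
    # kernels and releases the GIL while blocked in synchronize(), and
    # daemon=True lets the Ctrl-C handler below take the whole process down.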
|
|
    try:
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        print("\nStopped.")
|
|
|
|
if __name__ == "__main__":
    main()
|
|