# target_all.py — GPU power/memory occupancy script (web upload-page header removed; it was not Python)
#!/usr/bin/env python3
"""Keep GPUs at high power draw AND filled memory.
Examples:
python target_all.py # all visible GPUs, fp16, fill memory
python target_all.py --gpus 4 5 6 7 # specific GPUs
python target_all.py --gpus 4 5 --mem-gb 60 # occupy only 60 GB per GPU
python target_all.py --dtype bf16 # use bf16 (recommended on A100/H100)
python target_all.py --no-ballast # burn compute only, do not fill memory
"""
import argparse
import threading
import time
import torch
# Supported matmul dtypes; fp16/bf16 hit the tensor cores for maximum power draw.
DTYPES = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
# Bytes per element for each supported dtype.
BYTES = {torch.float16: 2, torch.bfloat16: 2, torch.float32: 4}


def parse_args(argv=None):
    """Parse command-line options for the burn script.

    Args:
        argv: Optional list of argument strings. Defaults to ``sys.argv[1:]``
              (argparse's own default); accepting an explicit list makes the
              parser usable from tests and other callers.

    Returns:
        argparse.Namespace with attributes: gpus, mem_frac, mem_gb,
        no_ballast, dtype, matmul_size, sync_every.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--gpus", type=int, nargs="+", default=None,
                   help="GPU IDs,默认所有可见卡")
    p.add_argument("--mem-frac", type=float, default=0.9,
                   help="ballast 占空闲显存的比例 (默认 0.9)")
    p.add_argument("--mem-gb", type=float, default=None,
                   help="覆盖 mem-frac,每张卡固定占 N GB")
    p.add_argument("--no-ballast", action="store_true",
                   help="不占显存,只烧算力")
    p.add_argument("--dtype", choices=list(DTYPES), default="fp16",
                   help="matmul 数据类型,fp16/bf16 走 tensor core 功率最高")
    p.add_argument("--matmul-size", type=int, default=8192,
                   help="hot loop 矩阵边长,默认 8192")
    p.add_argument("--sync-every", type=int, default=64,
                   help="每 N 个 matmul 同步一次,避免队列爆显存")
    return p.parse_args(argv)
def worker(gpu: int, args) -> None:
    """Occupy one GPU forever: saturate compute with back-to-back matmuls
    and (optionally) fill remaining memory with a ballast allocation.

    Args:
        gpu: CUDA device index to occupy.
        args: Parsed CLI namespace (dtype, matmul_size, sync_every,
              no_ballast, mem_gb, mem_frac).

    Never returns; runs until the process is killed.
    """
    torch.cuda.set_device(gpu)
    dtype = DTYPES[args.dtype]
    n = args.matmul_size

    # 1) Allocate the hot-loop matmul operands first — these MUST fit.
    A = torch.randn(n, n, device=gpu, dtype=dtype)
    B = torch.randn(n, n, device=gpu, dtype=dtype)
    C = torch.empty_like(A)

    # 2) Ballast: grab (most of) the remaining free memory so the card
    #    reports near-full usage. Keep the reference alive for the
    #    lifetime of this thread.
    ballast = None
    if not args.no_ballast:
        free_now, _ = torch.cuda.mem_get_info(gpu)
        if args.mem_gb is not None:
            # Fixed budget, capped at 95% of what is actually free.
            target = min(int(args.mem_gb * 1024 ** 3), int(free_now * 0.95))
        else:
            target = int(free_now * args.mem_frac)
        target = max(0, target - 256 * 1024 * 1024)  # leave 256 MB headroom
        # Fragmentation can make one huge allocation fail even when enough
        # memory is nominally free — back off 5% at a time instead of dying.
        while target > 0:
            try:
                ballast = torch.empty(target // 4, dtype=torch.float32, device=gpu)
                break
            except torch.cuda.OutOfMemoryError:
                target = int(target * 0.95)

    free_after, total = torch.cuda.mem_get_info(gpu)
    used_gb = (total - free_after) / 1024 ** 3
    total_gb = total / 1024 ** 3
    print(f"[GPU {gpu}] dtype={args.dtype} matmul={n}x{n} "
          f"mem={used_gb:.1f}/{total_gb:.1f} GB — burning...")

    # 3) Hot loop: launch matmuls continuously, syncing every N launches so
    #    the CUDA launch queue does not grow without bound.
    step = 0
    while True:
        torch.matmul(A, B, out=C)
        step += 1
        if step % args.sync_every == 0:
            torch.cuda.synchronize(gpu)
def main():
    """Entry point: spawn one daemon burner thread per target GPU, then
    idle in the main thread until Ctrl-C."""
    args = parse_args()
    if args.gpus is None:
        # Default to every visible CUDA device.
        args.gpus = list(range(torch.cuda.device_count()))
    if not args.gpus:
        raise SystemExit("No CUDA GPUs available")

    # Let cuBLAS pick its fastest algorithms.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True

    print(f"Target GPUs: {args.gpus}")
    for device_id in args.gpus:
        burner = threading.Thread(target=worker, args=(device_id, args), daemon=True)
        burner.start()

    # Daemon threads die with the process; just wait for an interrupt.
    try:
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        print("\nStopped.")


if __name__ == "__main__":
    main()