# target_all.py — GPU power/memory occupancy script (web upload-page header removed; it was not Python)
#!/usr/bin/env python3
"""Keep GPUs at high power draw AND filled memory.
Examples:
python target_all.py # all visible GPUs, fp16, fill memory
python target_all.py --gpus 4 5 6 7 # specific GPUs
python target_all.py --gpus 4 5 --mem-gb 60 # occupy only 60 GB per GPU
python target_all.py --dtype bf16 # use bf16 (recommended on A100/H100)
python target_all.py --no-ballast # burn compute only, do not fill memory
"""
import argparse
import threading
import time
import torch
# Supported matmul dtypes; fp16/bf16 hit the tensor cores for maximum power draw.
DTYPES = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
# Bytes per element for each supported dtype.
BYTES = {torch.float16: 2, torch.bfloat16: 2, torch.float32: 4}


def parse_args(argv=None):
    """Parse command-line options for the burn script.

    Args:
        argv: Optional list of argument strings. Defaults to ``sys.argv[1:]``
              (argparse's own default); accepting an explicit list makes the
              parser usable from tests and other callers.

    Returns:
        argparse.Namespace with attributes: gpus, mem_frac, mem_gb,
        no_ballast, dtype, matmul_size, sync_every.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--gpus", type=int, nargs="+", default=None,
                   help="GPU IDs,默认所有可见卡")
    p.add_argument("--mem-frac", type=float, default=0.9,
                   help="ballast 占空闲显存的比例 (默认 0.9)")
    p.add_argument("--mem-gb", type=float, default=None,
                   help="覆盖 mem-frac,每张卡固定占 N GB")
    p.add_argument("--no-ballast", action="store_true",
                   help="不占显存,只烧算力")
    p.add_argument("--dtype", choices=list(DTYPES), default="fp16",
                   help="matmul 数据类型,fp16/bf16 走 tensor core 功率最高")
    p.add_argument("--matmul-size", type=int, default=8192,
                   help="hot loop 矩阵边长,默认 8192")
    p.add_argument("--sync-every", type=int, default=64,
                   help="每 N 个 matmul 同步一次,避免队列爆显存")
    return p.parse_args(argv)
def worker(gpu: int, args) -> None:
    """Occupy one GPU forever: saturate compute with back-to-back matmuls
    and (optionally) fill remaining memory with a ballast allocation.

    Args:
        gpu: CUDA device index to occupy.
        args: Parsed CLI namespace (dtype, matmul_size, sync_every,
              no_ballast, mem_gb, mem_frac).

    Never returns; runs until the process is killed.
    """
    torch.cuda.set_device(gpu)
    dtype = DTYPES[args.dtype]
    n = args.matmul_size

    # 1) Allocate the hot-loop matmul operands first — these MUST fit.
    A = torch.randn(n, n, device=gpu, dtype=dtype)
    B = torch.randn(n, n, device=gpu, dtype=dtype)
    C = torch.empty_like(A)

    # 2) Ballast: grab (most of) the remaining free memory so the card
    #    reports near-full usage. Keep the reference alive for the
    #    lifetime of this thread.
    ballast = None
    if not args.no_ballast:
        free_now, _ = torch.cuda.mem_get_info(gpu)
        if args.mem_gb is not None:
            # Fixed budget, capped at 95% of what is actually free.
            target = min(int(args.mem_gb * 1024 ** 3), int(free_now * 0.95))
        else:
            target = int(free_now * args.mem_frac)
        target = max(0, target - 256 * 1024 * 1024)  # leave 256 MB headroom
        # Fragmentation can make one huge allocation fail even when enough
        # memory is nominally free — back off 5% at a time instead of dying.
        while target > 0:
            try:
                ballast = torch.empty(target // 4, dtype=torch.float32, device=gpu)
                break
            except torch.cuda.OutOfMemoryError:
                target = int(target * 0.95)

    free_after, total = torch.cuda.mem_get_info(gpu)
    used_gb = (total - free_after) / 1024 ** 3
    total_gb = total / 1024 ** 3
    print(f"[GPU {gpu}] dtype={args.dtype} matmul={n}x{n} "
          f"mem={used_gb:.1f}/{total_gb:.1f} GB — burning...")

    # 3) Hot loop: launch matmuls continuously, syncing every N launches so
    #    the CUDA launch queue does not grow without bound.
    step = 0
    while True:
        torch.matmul(A, B, out=C)
        step += 1
        if step % args.sync_every == 0:
            torch.cuda.synchronize(gpu)
def main():
    """Entry point: spawn one daemon burner thread per target GPU, then
    idle in the main thread until Ctrl-C."""
    args = parse_args()
    if args.gpus is None:
        # Default to every visible CUDA device.
        args.gpus = list(range(torch.cuda.device_count()))
    if not args.gpus:
        raise SystemExit("No CUDA GPUs available")

    # Let cuBLAS pick its fastest algorithms.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.benchmark = True

    print(f"Target GPUs: {args.gpus}")
    for device_id in args.gpus:
        burner = threading.Thread(target=worker, args=(device_id, args), daemon=True)
        burner.start()

    # Daemon threads die with the process; just wait for an interrupt.
    try:
        while True:
            time.sleep(60)
    except KeyboardInterrupt:
        print("\nStopped.")


if __name__ == "__main__":
    main()