Upload 2 files
Browse files- infer_and_wait_all.sh +87 -0
- target_all.py +101 -0
infer_and_wait_all.sh
ADDED
|
@@ -0,0 +1,87 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/bin/bash
# Usage:
#   ./infer_and_wait_all.sh                  # 默认占后 4 张卡
#   ./infer_and_wait_all.sh 0 1 2 3          # 占指定卡
#   ./infer_and_wait_all.sh -s 70 4 5        # 每张卡占 70GB
#   ./infer_and_wait_all.sh -w 50000 0       # 等到该卡空闲 >50GB 再占
#
# Flags:
#   -s, --size      每张卡要占的显存大小 (GB),默认尽量占满 (95% 空闲)
#   -w, --wait      等待阈值 (MB),达到后再占,默认不等
#   -i, --interval  保活刷新间隔 (s),默认 0.5
#
# NOTE: lines 2-11 above are printed verbatim by -h/--help (see the sed call
# below), so they are user-facing output. Keep that sed range in sync with the
# header if it ever grows or shrinks.
#
# Summary: reserves memory on the selected GPUs (default: the last four, or
# all of them when there are four or fewer) by allocating one large tensor per
# GPU from an embedded Python/PyTorch snippet, then touches the tensors
# periodically until interrupted with Ctrl+C.
set -e

SIZE_GB=""      # empty = take 95% of whatever is free on each GPU
WAIT_MB=0       # 0 = do not wait for memory to become free
INTERVAL=0.5    # seconds between keep-alive touches

# Parse leading options; everything after them is a list of GPU ids.
while [[ "$1" =~ ^- ]]; do
    case "$1" in
        -h|--help)
            # Stop at line 11: line 12 is no longer part of the help header.
            sed -n '2,11p' "$0"
            exit 0
            ;;
        -s|--size|-w|--wait|-i|--interval)
            # Without this check `shift 2` fails and `set -e` kills the script
            # with no message at all.
            if [ $# -lt 2 ]; then
                echo "Option $1 requires a value" >&2
                exit 1
            fi
            case "$1" in
                -s|--size)     SIZE_GB="$2" ;;
                -w|--wait)     WAIT_MB="$2" ;;
                -i|--interval) INTERVAL="$2" ;;
            esac
            shift 2
            ;;
        *)
            echo "Unknown option: $1"
            exit 1
            ;;
    esac
done

# A non-integer threshold would make the `[ -gt ]` test below error out inside
# an `if`, which silently skips the wait; reject it up front instead.
if ! [[ "$WAIT_MB" =~ ^[0-9]+$ ]]; then
    echo "Invalid --wait value (expected a whole number of MB): $WAIT_MB" >&2
    exit 1
fi

# Decide which GPUs to occupy.
if [ $# -eq 0 ]; then
    TOTAL=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
    if [ "$TOTAL" -le 4 ]; then
        GPU_IDS=$(seq 0 $((TOTAL-1)))
    else
        GPU_IDS=$(seq $((TOTAL-4)) $((TOTAL-1)))
    fi
else
    GPU_IDS="$*"
fi
# Intentionally unquoted: word splitting turns newlines from seq into spaces.
GPU_LIST=$(echo $GPU_IDS | tr ' ' ',')
echo "Target GPUs: $GPU_LIST"

# Optional: poll every 10 s until every target GPU reports enough free memory.
if [ "$WAIT_MB" -gt 0 ]; then
    echo "Waiting until each GPU has >= ${WAIT_MB} MB free..."
    while true; do
        ALL_OK=1
        for g in $GPU_IDS; do
            # nvidia-smi prints one line per GPU; pick line g+1 for GPU g.
            FREE=$(nvidia-smi --query-gpu=memory.free --format=csv,nounits,noheader | sed -n "$((g+1))p")
            echo "  GPU $g free: ${FREE} MB"
            if [ "$FREE" -lt "$WAIT_MB" ]; then
                ALL_OK=0
            fi
        done
        if [ "$ALL_OK" -eq 1 ]; then
            break
        fi
        sleep 10
    done
fi

# Occupy the GPUs. Settings are handed to Python through the environment and
# the heredoc delimiter is quoted, so the shell never splices user-supplied
# text into the Python source.
export CUDA_VISIBLE_DEVICES=$GPU_LIST
OCCUPY_SIZE_GB="$SIZE_GB" OCCUPY_INTERVAL="$INTERVAL" python - <<'EOF'
import torch, time, os

# Physical ids as passed by the shell; used only for the log lines, since
# inside this process the visible devices are renumbered from 0.
gpu_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
n = torch.cuda.device_count()
size_gb = os.environ.get("OCCUPY_SIZE_GB", "")
size_gb = float(size_gb) if size_gb else None
interval = float(os.environ["OCCUPY_INTERVAL"])

tensors = []
for i in range(n):
    free, total = torch.cuda.mem_get_info(i)
    if size_gb is None:
        target = int(free * 0.95)
    else:
        # Never ask for more than 95% of what is currently free.
        target = min(int(size_gb * 1024**3), int(free * 0.95))
    elems = target // 4  # float32 = 4 bytes per element
    t = torch.empty(elems, dtype=torch.float32, device=f"cuda:{i}")
    tensors.append(t)
    print(f"GPU {gpu_ids[i]} -> occupied {target/1024**3:.2f} GB / free was {free/1024**3:.2f} GB")

print(f"All {n} GPU(s) occupied. Ctrl+C to release.")
try:
    while True:
        # Periodically run a trivial in-place op on every reserved tensor.
        for t in tensors:
            t.add_(0.0)
        time.sleep(interval)
except KeyboardInterrupt:
    print("Released.")
EOF
|
target_all.py
ADDED
|
@@ -0,0 +1,101 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
#!/usr/bin/env python3
|
| 2 |
+
"""Keep GPUs at high power draw AND filled memory.
|
| 3 |
+
|
| 4 |
+
Examples:
|
| 5 |
+
python target_all.py # 所有可见卡,fp16,吃满显存
|
| 6 |
+
python target_all.py --gpus 4 5 6 7 # 指定卡
|
| 7 |
+
python target_all.py --gpus 4 5 --mem-gb 60 # 每张卡只占 60GB
|
| 8 |
+
python target_all.py --dtype bf16 # 用 bf16(A100/H100 推荐)
|
| 9 |
+
python target_all.py --no-ballast # 只烧算力,不占显存
|
| 10 |
+
"""
|
| 11 |
+
import argparse
|
| 12 |
+
import threading
|
| 13 |
+
import time
|
| 14 |
+
|
| 15 |
+
import torch
|
| 16 |
+
|
| 17 |
+
|
| 18 |
+
# CLI dtype names mapped to the torch dtype used for the matmul operands.
# Key order is significant: list(DTYPES) is what --dtype offers as choices.
DTYPES = dict(
    fp16=torch.float16,
    bf16=torch.bfloat16,
    fp32=torch.float32,
)
# Bytes per element for each supported dtype.
BYTES = {
    torch.float16: 2,
    torch.bfloat16: 2,
    torch.float32: 4,
}
|
| 20 |
+
|
| 21 |
+
|
| 22 |
+
def parse_args(argv=None):
    """Parse and validate the command-line options.

    Args:
        argv: Optional list of argument strings. ``None`` (the default) lets
            argparse read ``sys.argv[1:]``, which is how the script calls it.

    Returns:
        argparse.Namespace with the attributes ``gpus``, ``mem_frac``,
        ``mem_gb``, ``no_ballast``, ``dtype``, ``matmul_size`` and
        ``sync_every``.

    Raises:
        SystemExit: via ``ArgumentParser.error`` (usage message, status 2)
            when a numeric option is outside its valid range.
    """
    p = argparse.ArgumentParser()
    p.add_argument("--gpus", type=int, nargs="+", default=None,
                   help="GPU IDs,默认所有可见卡")
    p.add_argument("--mem-frac", type=float, default=0.9,
                   help="ballast 占空闲显存的比例 (默认 0.9)")
    p.add_argument("--mem-gb", type=float, default=None,
                   help="覆盖 mem-frac,每张卡固定占 N GB")
    p.add_argument("--no-ballast", action="store_true",
                   help="不占显存,只烧算力")
    p.add_argument("--dtype", choices=list(DTYPES), default="fp16",
                   help="matmul 数据类型,fp16/bf16 走 tensor core 功率最高")
    p.add_argument("--matmul-size", type=int, default=8192,
                   help="hot loop 矩阵边长,默认 8192")
    p.add_argument("--sync-every", type=int, default=64,
                   help="每 N 个 matmul 同步一次,避免队列爆显存")
    args = p.parse_args(argv)

    # Reject values that would only fail later inside a worker thread. The
    # comparisons are written so that NaN is rejected as well.
    if not 0.0 <= args.mem_frac <= 1.0:
        p.error("--mem-frac must be between 0 and 1")
    if args.mem_gb is not None and not args.mem_gb >= 0:
        p.error("--mem-gb must be non-negative")
    if args.matmul_size < 1:
        p.error("--matmul-size must be a positive integer")
    if args.sync_every < 1:
        # The hot loop computes `step % sync_every`; zero would divide by zero.
        p.error("--sync-every must be a positive integer")
    return args
|
| 39 |
+
|
| 40 |
+
|
| 41 |
+
def _ballast_bytes(free_bytes, mem_gb, mem_frac):
    """Return the number of ballast bytes to allocate given the free memory."""
    if mem_gb is not None:
        # A fixed request is still capped at 95% of what is currently free.
        wanted = min(int(mem_gb * 1024 ** 3), int(free_bytes * 0.95))
    else:
        wanted = int(free_bytes * mem_frac)
    # Leave 256 MB of headroom; never go negative.
    return max(0, wanted - 256 * 1024 * 1024)


def worker(gpu: int, args):
    """Fill one GPU's memory and run matmuls on it forever.

    Args:
        gpu: CUDA device index to run on.
        args: Parsed options; reads ``dtype``, ``matmul_size``, ``no_ballast``,
            ``mem_gb``, ``mem_frac`` and ``sync_every``.

    This function never returns; it only stops when the process exits or an
    exception (for example a CUDA out-of-memory error) propagates.
    """
    torch.cuda.set_device(gpu)
    dtype = DTYPES[args.dtype]
    n = args.matmul_size
    # Guard against a zero or negative interval, which would otherwise raise
    # ZeroDivisionError (or never synchronize) in the loop below.
    sync_every = max(1, args.sync_every)

    # 1) Allocate the matmul operands first so they are guaranteed to fit.
    A = torch.randn(n, n, device=gpu, dtype=dtype)
    B = torch.randn(n, n, device=gpu, dtype=dtype)
    C = torch.empty_like(A)

    # 2) Fill the remaining memory with a ballast tensor. The reference is
    #    held in a local for as long as this function runs.
    ballast = None
    if not args.no_ballast:
        free_now, _ = torch.cuda.mem_get_info(gpu)
        target = _ballast_bytes(free_now, args.mem_gb, args.mem_frac)
        if target > 0:
            # float32 elements, 4 bytes each.
            ballast = torch.empty(target // 4, dtype=torch.float32, device=gpu)

    free_after, total = torch.cuda.mem_get_info(gpu)
    used_gb = (total - free_after) / 1024 ** 3
    total_gb = total / 1024 ** 3
    print(f"[GPU {gpu}] dtype={args.dtype} matmul={n}x{n} "
          f"mem={used_gb:.1f}/{total_gb:.1f} GB — burning...")

    # 3) Hot loop: issue `sync_every` matmuls back to back, then synchronize
    #    so queued work does not pile up without bound.
    while True:
        for _ in range(sync_every):
            torch.matmul(A, B, out=C)
        torch.cuda.synchronize(gpu)
|
| 76 |
+
|
| 77 |
+
|
| 78 |
+
def main():
    """Start one burner thread per target GPU and wait until interrupted.

    Raises:
        SystemExit: when no CUDA device is available, when a requested GPU id
            is outside the visible device range, or when every worker thread
            has died (for example after an out-of-memory error).
    """
    args = parse_args()
    visible = torch.cuda.device_count()
    if args.gpus is None:
        args.gpus = list(range(visible))
    if not args.gpus:
        raise SystemExit("No CUDA GPUs available")

    # Fail fast on bad ids instead of letting a worker thread crash later.
    invalid = [g for g in args.gpus if not 0 <= g < visible]
    if invalid:
        raise SystemExit(
            f"Invalid GPU id(s) {invalid}: {visible} CUDA device(s) visible"
        )

    # Allow TF32 for float32 matmuls.
    torch.backends.cuda.matmul.allow_tf32 = True
    # NOTE(review): cudnn.benchmark is kept from the original; it presumably
    # has no effect on plain matmul -- confirm before removing.
    torch.backends.cudnn.benchmark = True

    print(f"Target GPUs: {args.gpus}")
    threads = [
        threading.Thread(target=worker, args=(g, args), daemon=True)
        for g in args.gpus
    ]
    for t in threads:
        t.start()

    # Workers loop forever, so a dead thread means it raised. Keep running
    # while at least one is alive rather than sleeping with nothing running.
    try:
        while any(t.is_alive() for t in threads):
            time.sleep(5)
    except KeyboardInterrupt:
        print("\nStopped.")
        return
    raise SystemExit("All GPU workers have exited; see the errors above")


if __name__ == "__main__":
    main()
|