Wendy-Fly
/

Sound

Model card Files Files and versions

xet

Community

Wendy-Fly committed 24 days ago

Commit

3451841

verified ·

1 Parent(s): 281da15

Upload 2 files

Browse files

Files changed (2) hide show

infer_and_wait_all.sh +87 -0
target_all.py +101 -0

infer_and_wait_all.sh ADDED Viewed

	@@ -0,0 +1,87 @@

+#!/bin/bash
+# Usage:
+#   ./infer_and_wait_all.sh              # default: occupy the last 4 GPUs
+#   ./infer_and_wait_all.sh 0 1 2 3      # occupy the listed GPUs
+#   ./infer_and_wait_all.sh -s 70 4 5    # occupy 70 GB on each GPU
+#   ./infer_and_wait_all.sh -w 50000 0   # wait until the GPU has >= 50000 MB free, then occupy it
+#
+# Flags:
+#   -s, --size      memory to occupy per GPU (GB); default: as much as possible (95% of free)
+#   -w, --wait      free-memory threshold (MB) to wait for before occupying; default: no wait
+#   -i, --interval  keep-alive refresh interval (s); default 0.5
+set -e
+SIZE_GB=""        # empty = automatically take 95% of the free memory
+WAIT_MB=0
+INTERVAL=0.5
+# Print the leading comment block: every line after the shebang up to the
+# first non-comment line. This avoids a hard-coded line range that drifts
+# when the header is edited.
+usage() {
+    awk 'NR == 1 { next } /^#/ { print; next } { exit }' "$0"
+}
+# Abort with a clear message when an option is missing its value; otherwise
+# "shift 2" fails and set -e ends the script without saying why.
+require_value() {
+    if [ $# -lt 2 ]; then
+        echo "Option $1 requires a value" >&2
+        exit 1
+    fi
+}
+while [[ "$1" =~ ^- ]]; do
+    case "$1" in
+        -s|--size)     require_value "$@"; SIZE_GB="$2"; shift 2 ;;
+        -w|--wait)     require_value "$@"; WAIT_MB="$2"; shift 2 ;;
+        -i|--interval) require_value "$@"; INTERVAL="$2"; shift 2 ;;
+        -h|--help)     usage; exit 0 ;;
+        *) echo "Unknown option: $1"; exit 1 ;;
+    esac
+done
+# The threshold is compared with integer tests below, so reject anything else up front.
+if ! [[ "$WAIT_MB" =~ ^[0-9]+$ ]]; then
+    echo "--wait expects a non-negative integer (MB), got: $WAIT_MB" >&2
+    exit 1
+fi
+# Decide which GPUs to occupy
+if [ $# -eq 0 ]; then
+    TOTAL=$(nvidia-smi --query-gpu=name --format=csv,noheader | wc -l)
+    if [ "$TOTAL" -eq 0 ]; then
+        echo "No GPUs reported by nvidia-smi" >&2
+        exit 1
+    fi
+    if [ "$TOTAL" -le 4 ]; then
+        GPU_IDS=$(seq 0 $((TOTAL-1)))
+    else
+        GPU_IDS=$(seq $((TOTAL-4)) $((TOTAL-1)))
+    fi
+else
+    GPU_IDS="$*"
+fi
+# Unquoted on purpose: word splitting collapses the newlines from seq into spaces.
+GPU_LIST=$(echo $GPU_IDS | tr ' ' ',')
+echo "Target GPUs: $GPU_LIST"
+# Optional: wait until enough memory is free
+if [ "$WAIT_MB" -gt 0 ]; then
+    echo "Waiting until each GPU has >= ${WAIT_MB} MB free..."
+    while true; do
+        ALL_OK=1
+        for g in $GPU_IDS; do
+            # Query only the GPU of interest; the trailing pipe keeps a failing
+            # nvidia-smi from tripping set -e so the reading can be checked below.
+            FREE=$(nvidia-smi -i "$g" --query-gpu=memory.free --format=csv,nounits,noheader 2>/dev/null | tr -d '[:space:]')
+            if ! [[ "$FREE" =~ ^[0-9]+$ ]]; then
+                # An unreadable value must not count as "enough free memory".
+                echo "  GPU $g free: unknown (nvidia-smi query failed)" >&2
+                ALL_OK=0
+                continue
+            fi
+            echo "  GPU $g free: ${FREE} MB"
+            if [ "$FREE" -lt "$WAIT_MB" ]; then
+                ALL_OK=0
+            fi
+        done
+        if [ "$ALL_OK" -eq 1 ]; then
+            break
+        fi
+        sleep 10
+    done
+fi
+# Occupy the GPUs
+export CUDA_VISIBLE_DEVICES=$GPU_LIST
+# Settings reach Python through the environment and the heredoc delimiter is
+# quoted, so the shell never rewrites (or injects into) the Python source.
+OCCUPY_SIZE_GB="$SIZE_GB" OCCUPY_INTERVAL="$INTERVAL" python - <<'EOF'
+import os
+import sys
+import time
+import torch
+gpu_ids = os.environ["CUDA_VISIBLE_DEVICES"].split(",")
+n = torch.cuda.device_count()
+if n == 0:
+    sys.exit("No CUDA devices visible for CUDA_VISIBLE_DEVICES=" + os.environ["CUDA_VISIBLE_DEVICES"])
+size_gb = os.environ.get("OCCUPY_SIZE_GB", "")
+size_gb = float(size_gb) if size_gb else None
+interval = float(os.environ.get("OCCUPY_INTERVAL", "0.5"))
+tensors = []
+for i in range(n):
+    free, _ = torch.cuda.mem_get_info(i)
+    # Never ask for more than 95% of what is currently free.
+    target = int(free * 0.95)
+    if size_gb is not None:
+        target = min(int(size_gb * 1024**3), target)
+    elems = target // 4  # float32
+    t = torch.empty(elems, dtype=torch.float32, device=f"cuda:{i}")
+    tensors.append(t)
+    print(f"GPU {gpu_ids[i]} -> occupied {target/1024**3:.2f} GB / free was {free/1024**3:.2f} GB")
+print(f"All {n} GPU(s) occupied. Ctrl+C to release.")
+try:
+    while True:
+        # Touch every buffer periodically so the GPUs keep showing activity.
+        for t in tensors:
+            t.add_(0.0)
+        time.sleep(interval)
+except KeyboardInterrupt:
+    print("Released.")
+EOF

target_all.py ADDED Viewed

	@@ -0,0 +1,101 @@

+#!/usr/bin/env python3
+"""Keep GPUs at high power draw AND filled memory.
+Examples:
+  python target_all.py                              # 所有可见卡，fp16，吃满显存
+  python target_all.py --gpus 4 5 6 7               # 指定卡
+  python target_all.py --gpus 4 5 --mem-gb 60       # 每张卡只占 60GB
+  python target_all.py --dtype bf16                 # 用 bf16（A100/H100 推荐）
+  python target_all.py --no-ballast                 # 只烧算力，不占显存
+"""
+import argparse
+import threading
+import time
+import torch
+DTYPES = {"fp16": torch.float16, "bf16": torch.bfloat16, "fp32": torch.float32}
+BYTES = {torch.float16: 2, torch.bfloat16: 2, torch.float32: 4}
+def parse_args():
+    p = argparse.ArgumentParser()
+    p.add_argument("--gpus", type=int, nargs="+", default=None,
+                   help="GPU IDs，默认所有可见卡")
+    p.add_argument("--mem-frac", type=float, default=0.9,
+                   help="ballast 占空闲显存的比例 (默认 0.9)")
+    p.add_argument("--mem-gb", type=float, default=None,
+                   help="覆盖 mem-frac，每张卡固定占 N GB")
+    p.add_argument("--no-ballast", action="store_true",
+                   help="不占显存，只烧算力")
+    p.add_argument("--dtype", choices=list(DTYPES), default="fp16",
+                   help="matmul 数据类型，fp16/bf16 走 tensor core 功率最高")
+    p.add_argument("--matmul-size", type=int, default=8192,
+                   help="hot loop 矩阵边长，默认 8192")
+    p.add_argument("--sync-every", type=int, default=64,
+                   help="每 N 个 matmul 同步一次，避免队列爆显存")
+    return p.parse_args()
+def worker(gpu: int, args):
+    torch.cuda.set_device(gpu)
+    dtype = DTYPES[args.dtype]
+    n = args.matmul_size
+    # 1) 先建 hot matmul 张量（必须能放下）
+    A = torch.randn(n, n, device=gpu, dtype=dtype)
+    B = torch.randn(n, n, device=gpu, dtype=dtype)
+    C = torch.empty_like(A)
+    # 2) ballast 把剩下的显存吃满
+    ballast = None
+    if not args.no_ballast:
+        free_now, _ = torch.cuda.mem_get_info(gpu)
+        if args.mem_gb is not None:
+            target = min(int(args.mem_gb * 1024 ** 3), int(free_now * 0.95))
+        else:
+            target = int(free_now * args.mem_frac)
+        target = max(0, target - 256 * 1024 * 1024)  # 留 256MB 余量
+        if target > 0:
+            ballast = torch.empty(target // 4, dtype=torch.float32, device=gpu)
+    free_after, total = torch.cuda.mem_get_info(gpu)
+    used_gb = (total - free_after) / 1024 ** 3
+    total_gb = total / 1024 ** 3
+    print(f"[GPU {gpu}] dtype={args.dtype} matmul={n}x{n} "
+          f"mem={used_gb:.1f}/{total_gb:.1f} GB — burning...")
+    # 3) hot loop：连发 matmul，定期 sync 防队列堆积
+    step = 0
+    while True:
+        torch.matmul(A, B, out=C)
+        step += 1
+        if step % args.sync_every == 0:
+            torch.cuda.synchronize(gpu)
+def main():
+    args = parse_args()
+    if args.gpus is None:
+        args.gpus = list(range(torch.cuda.device_count()))
+    if not args.gpus:
+        raise SystemExit("No CUDA GPUs available")
+    # 让 cuBLAS 选最快算法
+    torch.backends.cuda.matmul.allow_tf32 = True
+    torch.backends.cudnn.benchmark = True
+    print(f"Target GPUs: {args.gpus}")
+    for g in args.gpus:
+        threading.Thread(target=worker, args=(g, args), daemon=True).start()
+    try:
+        while True:
+            time.sleep(60)
+    except KeyboardInterrupt:
+        print("\nStopped.")
+if __name__ == "__main__":
+    main()