# space-fetch / benchmarks / gpu_bench.py
"""
GPU Benchmark Module
GPU 性能测试:显存带宽、FP32/FP16/Tensor Core 算力
仅在有 NVIDIA GPU 时可用
"""
import time
from typing import Dict, Any, Optional
def check_gpu_available() -> bool:
    """Return True if an NVIDIA GPU is visible via ``nvidia-smi``.

    Runs ``nvidia-smi`` with a 5-second timeout; any expected failure
    (binary not installed, timeout, non-zero exit) means "no GPU".
    """
    try:
        import subprocess
        result = subprocess.run(['nvidia-smi'], capture_output=True, timeout=5)
        return result.returncode == 0
    except (OSError, subprocess.SubprocessError):
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # propagate. FileNotFoundError (no nvidia-smi on PATH) and
        # TimeoutExpired are the failure modes this probe expects.
        return False
def check_cuda_available() -> bool:
    """Report whether PyTorch is installed and can see a CUDA device."""
    try:
        import torch
    except ImportError:
        # No torch at all: CUDA is definitionally unavailable.
        return False
    return torch.cuda.is_available()
def benchmark_gpu_memory_bandwidth() -> Optional[Dict[str, Any]]:
    """Measure GPU device-memory copy bandwidth.

    Clones a 256 MB float32 tensor on the GPU repeatedly for ~2 seconds
    and reports the effective read+write bandwidth in GB/s.

    Returns:
        None when CUDA is unavailable; a result dict on success; or an
        ``{"error": ...}`` dict if the benchmark itself fails.
    """
    if not check_cuda_available():
        return None
    try:
        import torch
        device = torch.device('cuda')
        size_mb = 256
        # Number of float32 elements (4 bytes each) in `size_mb` MiB.
        size_elements = size_mb * 1024 * 1024 // 4
        src = torch.ones(size_elements, dtype=torch.float32, device=device)
        # Warm-up: let the CUDA allocator and kernels settle before timing.
        for _ in range(10):
            src.clone()
        torch.cuda.synchronize()
        # Timed loop. perf_counter() is monotonic and high-resolution;
        # time.time() can jump backwards with wall-clock adjustments,
        # which would corrupt the measurement.
        start_time = time.perf_counter()
        iterations = 0
        while time.perf_counter() - start_time < 2.0:
            src.clone()
            iterations += 1
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time
        # Each clone reads the source and writes the copy: 2x traffic.
        bytes_transferred = size_mb * 1024 * 1024 * 2 * iterations
        bandwidth = bytes_transferred / elapsed / (1024**3)
        return {
            "test": "gpu_memory_bandwidth",
            "description": f"GPU memory bandwidth ({size_mb}MB)",
            "size_mb": size_mb,
            "duration_seconds": round(elapsed, 3),
            "iterations": iterations,
            "bandwidth_gb_s": round(bandwidth, 2),
            "score": round(bandwidth, 2),
        }
    except Exception as e:
        # Best-effort: surface the failure to the caller instead of raising.
        return {"error": str(e)}
def benchmark_gpu_fp32(matrix_size: int = 4096) -> Optional[Dict[str, Any]]:
    """Measure GPU FP32 compute throughput via square matmul.

    Args:
        matrix_size: Side length of the square matrices to multiply.

    Returns:
        None when CUDA is unavailable; a result dict with TFLOPS on
        success; or an ``{"error": ...}`` dict if the benchmark fails.
    """
    if not check_cuda_available():
        return None
    try:
        import torch
        device = torch.device('cuda')
        a = torch.randn(matrix_size, matrix_size, dtype=torch.float32, device=device)
        b = torch.randn(matrix_size, matrix_size, dtype=torch.float32, device=device)
        # Warm-up so cuBLAS heuristics/kernel selection do not pollute timing.
        for _ in range(5):
            _ = torch.mm(a, b)
        torch.cuda.synchronize()
        # Timed loop; perf_counter() is monotonic, unlike time.time().
        start_time = time.perf_counter()
        iterations = 0
        while time.perf_counter() - start_time < 3.0:
            _ = torch.mm(a, b)
            iterations += 1
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time
        # An n x n matmul costs 2*n^3 FLOPs (n^3 multiplies + n^3 adds).
        flops_per_matmul = 2 * (matrix_size ** 3)
        total_flops = flops_per_matmul * iterations
        tflops = total_flops / elapsed / 1e12
        return {
            "test": "gpu_fp32",
            "description": f"GPU FP32 compute ({matrix_size}x{matrix_size} matmul)",
            "matrix_size": matrix_size,
            "duration_seconds": round(elapsed, 3),
            "iterations": iterations,
            "tflops": round(tflops, 3),
            "score": round(tflops * 10, 2),
        }
    except Exception as e:
        # Best-effort: report the failure rather than raise.
        return {"error": str(e)}
def benchmark_gpu_fp16(matrix_size: int = 4096) -> Optional[Dict[str, Any]]:
    """Measure GPU FP16 compute throughput via square matmul.

    Args:
        matrix_size: Side length of the square matrices to multiply.

    Returns:
        None when CUDA is unavailable; a result dict with TFLOPS on
        success; or an ``{"error": ...}`` dict if the benchmark fails.
    """
    if not check_cuda_available():
        return None
    try:
        import torch
        device = torch.device('cuda')
        a = torch.randn(matrix_size, matrix_size, dtype=torch.float16, device=device)
        b = torch.randn(matrix_size, matrix_size, dtype=torch.float16, device=device)
        # Warm-up so kernel selection does not pollute the timed region.
        for _ in range(5):
            _ = torch.mm(a, b)
        torch.cuda.synchronize()
        # Timed loop; perf_counter() is monotonic, unlike time.time().
        start_time = time.perf_counter()
        iterations = 0
        while time.perf_counter() - start_time < 3.0:
            _ = torch.mm(a, b)
            iterations += 1
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time
        # An n x n matmul costs 2*n^3 FLOPs (n^3 multiplies + n^3 adds).
        flops_per_matmul = 2 * (matrix_size ** 3)
        total_flops = flops_per_matmul * iterations
        tflops = total_flops / elapsed / 1e12
        return {
            "test": "gpu_fp16",
            "description": f"GPU FP16 compute ({matrix_size}x{matrix_size} matmul)",
            "matrix_size": matrix_size,
            "duration_seconds": round(elapsed, 3),
            "iterations": iterations,
            "tflops": round(tflops, 3),
            "score": round(tflops * 5, 2),
        }
    except Exception as e:
        # Best-effort: report the failure rather than raise.
        return {"error": str(e)}
def benchmark_gpu_tensor_cores(matrix_size: int = 4096) -> Optional[Dict[str, Any]]:
    """Measure mixed-precision matmul throughput on Tensor-Core GPUs.

    Requires compute capability >= 7.0 (Volta and newer).

    Args:
        matrix_size: Side length of the square matrices to multiply.

    Returns:
        None when CUDA is unavailable; a result dict with TFLOPS on
        success; or an ``{"error": ...}`` dict when Tensor Cores are
        missing or the benchmark fails.
    """
    if not check_cuda_available():
        return None
    try:
        import torch
        if not hasattr(torch.cuda, 'amp') or torch.cuda.get_device_capability()[0] < 7:
            return {"error": "Tensor Cores not available (requires compute capability >= 7.0)"}
        device = torch.device('cuda')
        a = torch.randn(matrix_size, matrix_size, dtype=torch.float16, device=device)
        b = torch.randn(matrix_size, matrix_size, dtype=torch.float16, device=device)
        # NOTE(review): torch.cuda.amp.autocast is deprecated in favor of
        # torch.amp.autocast('cuda'); kept as-is for older-torch
        # compatibility. Also, autocast is presumably a no-op here since
        # the inputs are already fp16 — fp16 matmul should hit Tensor
        # Cores by itself; confirm against target torch version.
        with torch.cuda.amp.autocast():
            for _ in range(5):
                _ = torch.mm(a, b)
        torch.cuda.synchronize()
        # Timed loop; perf_counter() is monotonic, unlike time.time().
        start_time = time.perf_counter()
        iterations = 0
        with torch.cuda.amp.autocast():
            while time.perf_counter() - start_time < 3.0:
                _ = torch.mm(a, b)
                iterations += 1
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time
        # An n x n matmul costs 2*n^3 FLOPs (n^3 multiplies + n^3 adds).
        flops_per_matmul = 2 * (matrix_size ** 3)
        total_flops = flops_per_matmul * iterations
        tflops = total_flops / elapsed / 1e12
        return {
            "test": "gpu_tensor_cores",
            "description": f"GPU Tensor Cores mixed precision ({matrix_size}x{matrix_size})",
            "matrix_size": matrix_size,
            "duration_seconds": round(elapsed, 3),
            "iterations": iterations,
            "tflops": round(tflops, 3),
            "score": round(tflops * 3, 2),
        }
    except Exception as e:
        # Best-effort: report the failure rather than raise.
        return {"error": str(e)}
def run_all_gpu_benchmarks() -> Optional[Dict[str, Any]]:
    """Run every GPU benchmark and aggregate their scores.

    Returns None when no NVIDIA GPU is detected or no benchmark produced
    a result; otherwise a dict of per-test results plus a "total_score"
    entry summing the scores of the successful tests.
    """
    if not check_gpu_available():
        return None

    # Run each benchmark in order; skip ones that return None.
    suite = (
        ("memory_bandwidth", benchmark_gpu_memory_bandwidth),
        ("fp32", benchmark_gpu_fp32),
        ("fp16", benchmark_gpu_fp16),
        ("tensor_cores", benchmark_gpu_tensor_cores),
    )
    results: Dict[str, Any] = {}
    for key, bench in suite:
        outcome = bench()
        if outcome:
            results[key] = outcome

    if not results:
        return None

    # Error entries carry no "score"; only successful tests contribute.
    total = sum(
        entry.get("score", 0)
        for entry in results.values()
        if isinstance(entry, dict) and "error" not in entry
    )
    results["total_score"] = round(total, 2)
    return results