# space-fetch / benchmarks / gpu_bench.py
"""
GPU Benchmark Module
GPU 性能测试:显存带宽、FP32/FP16/Tensor Core 算力
仅在有 NVIDIA GPU 时可用
"""
import time
from typing import Dict, Any, Optional
def check_gpu_available() -> bool:
    """Return True if an NVIDIA GPU is visible via ``nvidia-smi``.

    Runs ``nvidia-smi`` with a 5-second timeout; any expected failure
    (binary not installed, timeout, non-zero exit) means "no GPU".
    """
    try:
        import subprocess
        result = subprocess.run(['nvidia-smi'], capture_output=True, timeout=5)
        return result.returncode == 0
    except (OSError, subprocess.SubprocessError):
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
        # propagate. FileNotFoundError (no nvidia-smi on PATH) and
        # TimeoutExpired are the failure modes this probe expects.
        return False
def check_cuda_available() -> bool:
    """Report whether PyTorch is installed and can see a CUDA device."""
    try:
        import torch
    except ImportError:
        # No torch at all: CUDA is definitionally unavailable.
        return False
    return torch.cuda.is_available()
def benchmark_gpu_memory_bandwidth() -> Optional[Dict[str, Any]]:
    """Measure GPU device-memory copy bandwidth.

    Clones a 256 MB float32 tensor on the GPU repeatedly for ~2 seconds
    and reports the effective read+write bandwidth in GB/s.

    Returns:
        None when CUDA is unavailable; a result dict on success; or an
        ``{"error": ...}`` dict if the benchmark itself fails.
    """
    if not check_cuda_available():
        return None
    try:
        import torch
        device = torch.device('cuda')
        size_mb = 256
        # Number of float32 elements (4 bytes each) in `size_mb` MiB.
        size_elements = size_mb * 1024 * 1024 // 4
        src = torch.ones(size_elements, dtype=torch.float32, device=device)
        # Warm-up: let the CUDA allocator and kernels settle before timing.
        for _ in range(10):
            src.clone()
        torch.cuda.synchronize()
        # Timed loop. perf_counter() is monotonic and high-resolution;
        # time.time() can jump backwards with wall-clock adjustments,
        # which would corrupt the measurement.
        start_time = time.perf_counter()
        iterations = 0
        while time.perf_counter() - start_time < 2.0:
            src.clone()
            iterations += 1
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time
        # Each clone reads the source and writes the copy: 2x traffic.
        bytes_transferred = size_mb * 1024 * 1024 * 2 * iterations
        bandwidth = bytes_transferred / elapsed / (1024**3)
        return {
            "test": "gpu_memory_bandwidth",
            "description": f"GPU memory bandwidth ({size_mb}MB)",
            "size_mb": size_mb,
            "duration_seconds": round(elapsed, 3),
            "iterations": iterations,
            "bandwidth_gb_s": round(bandwidth, 2),
            "score": round(bandwidth, 2),
        }
    except Exception as e:
        # Best-effort: surface the failure to the caller instead of raising.
        return {"error": str(e)}
def benchmark_gpu_fp32(matrix_size: int = 4096) -> Optional[Dict[str, Any]]:
    """Measure GPU FP32 compute throughput via square matmul.

    Args:
        matrix_size: Side length of the square matrices to multiply.

    Returns:
        None when CUDA is unavailable; a result dict with TFLOPS on
        success; or an ``{"error": ...}`` dict if the benchmark fails.
    """
    if not check_cuda_available():
        return None
    try:
        import torch
        device = torch.device('cuda')
        a = torch.randn(matrix_size, matrix_size, dtype=torch.float32, device=device)
        b = torch.randn(matrix_size, matrix_size, dtype=torch.float32, device=device)
        # Warm-up so cuBLAS heuristics/kernel selection do not pollute timing.
        for _ in range(5):
            _ = torch.mm(a, b)
        torch.cuda.synchronize()
        # Timed loop; perf_counter() is monotonic, unlike time.time().
        start_time = time.perf_counter()
        iterations = 0
        while time.perf_counter() - start_time < 3.0:
            _ = torch.mm(a, b)
            iterations += 1
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time
        # An n x n matmul costs 2*n^3 FLOPs (n^3 multiplies + n^3 adds).
        flops_per_matmul = 2 * (matrix_size ** 3)
        total_flops = flops_per_matmul * iterations
        tflops = total_flops / elapsed / 1e12
        return {
            "test": "gpu_fp32",
            "description": f"GPU FP32 compute ({matrix_size}x{matrix_size} matmul)",
            "matrix_size": matrix_size,
            "duration_seconds": round(elapsed, 3),
            "iterations": iterations,
            "tflops": round(tflops, 3),
            "score": round(tflops * 10, 2),
        }
    except Exception as e:
        # Best-effort: report the failure rather than raise.
        return {"error": str(e)}
def benchmark_gpu_fp16(matrix_size: int = 4096) -> Optional[Dict[str, Any]]:
    """Measure GPU FP16 compute throughput via square matmul.

    Args:
        matrix_size: Side length of the square matrices to multiply.

    Returns:
        None when CUDA is unavailable; a result dict with TFLOPS on
        success; or an ``{"error": ...}`` dict if the benchmark fails.
    """
    if not check_cuda_available():
        return None
    try:
        import torch
        device = torch.device('cuda')
        a = torch.randn(matrix_size, matrix_size, dtype=torch.float16, device=device)
        b = torch.randn(matrix_size, matrix_size, dtype=torch.float16, device=device)
        # Warm-up so kernel selection does not pollute the timed region.
        for _ in range(5):
            _ = torch.mm(a, b)
        torch.cuda.synchronize()
        # Timed loop; perf_counter() is monotonic, unlike time.time().
        start_time = time.perf_counter()
        iterations = 0
        while time.perf_counter() - start_time < 3.0:
            _ = torch.mm(a, b)
            iterations += 1
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time
        # An n x n matmul costs 2*n^3 FLOPs (n^3 multiplies + n^3 adds).
        flops_per_matmul = 2 * (matrix_size ** 3)
        total_flops = flops_per_matmul * iterations
        tflops = total_flops / elapsed / 1e12
        return {
            "test": "gpu_fp16",
            "description": f"GPU FP16 compute ({matrix_size}x{matrix_size} matmul)",
            "matrix_size": matrix_size,
            "duration_seconds": round(elapsed, 3),
            "iterations": iterations,
            "tflops": round(tflops, 3),
            "score": round(tflops * 5, 2),
        }
    except Exception as e:
        # Best-effort: report the failure rather than raise.
        return {"error": str(e)}
def benchmark_gpu_tensor_cores(matrix_size: int = 4096) -> Optional[Dict[str, Any]]:
    """Measure mixed-precision matmul throughput on Tensor-Core GPUs.

    Requires compute capability >= 7.0 (Volta and newer).

    Args:
        matrix_size: Side length of the square matrices to multiply.

    Returns:
        None when CUDA is unavailable; a result dict with TFLOPS on
        success; or an ``{"error": ...}`` dict when Tensor Cores are
        missing or the benchmark fails.
    """
    if not check_cuda_available():
        return None
    try:
        import torch
        if not hasattr(torch.cuda, 'amp') or torch.cuda.get_device_capability()[0] < 7:
            return {"error": "Tensor Cores not available (requires compute capability >= 7.0)"}
        device = torch.device('cuda')
        a = torch.randn(matrix_size, matrix_size, dtype=torch.float16, device=device)
        b = torch.randn(matrix_size, matrix_size, dtype=torch.float16, device=device)
        # NOTE(review): torch.cuda.amp.autocast is deprecated in favor of
        # torch.amp.autocast('cuda'); kept as-is for older-torch
        # compatibility. Also, autocast is presumably a no-op here since
        # the inputs are already fp16 — fp16 matmul should hit Tensor
        # Cores by itself; confirm against target torch version.
        with torch.cuda.amp.autocast():
            for _ in range(5):
                _ = torch.mm(a, b)
        torch.cuda.synchronize()
        # Timed loop; perf_counter() is monotonic, unlike time.time().
        start_time = time.perf_counter()
        iterations = 0
        with torch.cuda.amp.autocast():
            while time.perf_counter() - start_time < 3.0:
                _ = torch.mm(a, b)
                iterations += 1
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time
        # An n x n matmul costs 2*n^3 FLOPs (n^3 multiplies + n^3 adds).
        flops_per_matmul = 2 * (matrix_size ** 3)
        total_flops = flops_per_matmul * iterations
        tflops = total_flops / elapsed / 1e12
        return {
            "test": "gpu_tensor_cores",
            "description": f"GPU Tensor Cores mixed precision ({matrix_size}x{matrix_size})",
            "matrix_size": matrix_size,
            "duration_seconds": round(elapsed, 3),
            "iterations": iterations,
            "tflops": round(tflops, 3),
            "score": round(tflops * 3, 2),
        }
    except Exception as e:
        # Best-effort: report the failure rather than raise.
        return {"error": str(e)}
def run_all_gpu_benchmarks() -> Optional[Dict[str, Any]]:
    """Run every GPU benchmark and aggregate their scores.

    Returns None when no NVIDIA GPU is detected or no benchmark produced
    a result; otherwise a dict of per-test results plus a "total_score"
    entry summing the scores of the successful tests.
    """
    if not check_gpu_available():
        return None

    # Run each benchmark in order; skip ones that return None.
    suite = (
        ("memory_bandwidth", benchmark_gpu_memory_bandwidth),
        ("fp32", benchmark_gpu_fp32),
        ("fp16", benchmark_gpu_fp16),
        ("tensor_cores", benchmark_gpu_tensor_cores),
    )
    results: Dict[str, Any] = {}
    for key, bench in suite:
        outcome = bench()
        if outcome:
            results[key] = outcome

    if not results:
        return None

    # Error entries carry no "score"; only successful tests contribute.
    total = sum(
        entry.get("score", 0)
        for entry in results.values()
        if isinstance(entry, dict) and "error" not in entry
    )
    results["total_score"] = round(total, 2)
    return results