Spaces:
Sleeping
Sleeping
| """ | |
| GPU Benchmark Module | |
| GPU 性能测试:显存带宽、FP32/FP16/Tensor Core 算力 | |
| 仅在有 NVIDIA GPU 时可用 | |
| """ | |
| import time | |
| from typing import Dict, Any, Optional | |
def check_gpu_available() -> bool:
    """Return True when an NVIDIA GPU is visible to the system.

    Probes by running ``nvidia-smi`` with a 5-second timeout; a zero exit
    code means the driver found at least one device.

    Returns:
        True if ``nvidia-smi`` ran and exited successfully, False otherwise
        (binary missing, driver error, or timeout).
    """
    try:
        import subprocess
        result = subprocess.run(['nvidia-smi'], capture_output=True, timeout=5)
        return result.returncode == 0
    # Narrow instead of a bare ``except``: FileNotFoundError (no nvidia-smi)
    # is an OSError subclass; TimeoutExpired covers a hung driver. A bare
    # except would also swallow KeyboardInterrupt/SystemExit.
    except (OSError, subprocess.TimeoutExpired):
        return False
def check_cuda_available() -> bool:
    """Return True when PyTorch is installed and reports a usable CUDA device."""
    try:
        import torch
    except ImportError:
        # PyTorch is an optional dependency; no torch means no CUDA.
        return False
    return torch.cuda.is_available()
def benchmark_gpu_memory_bandwidth() -> Optional[Dict[str, Any]]:
    """Measure effective GPU memory bandwidth via device-to-device copies.

    Clones a 256 MB float32 tensor in a ~2 second loop and reports the
    achieved read+write bandwidth in GiB/s.

    Returns:
        A dict with bandwidth statistics and a ``score`` field,
        ``{"error": ...}`` when the measurement fails, or ``None`` when
        CUDA is unavailable.
    """
    if not check_cuda_available():
        return None
    try:
        import torch
        device = torch.device('cuda')

        # Fixed working-set size; number of float32 (4-byte) elements.
        size_mb = 256
        size_elements = size_mb * 1024 * 1024 // 4
        src = torch.ones(size_elements, dtype=torch.float32, device=device)

        # Warm up so allocator/caching effects do not skew the measurement.
        for _ in range(10):
            _ = src.clone()
        torch.cuda.synchronize()

        # perf_counter is monotonic, unlike time.time(), so the measurement
        # cannot be corrupted by NTP/wall-clock adjustments mid-benchmark.
        start_time = time.perf_counter()
        iterations = 0
        while time.perf_counter() - start_time < 2.0:
            _ = src.clone()
            iterations += 1
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time

        # Each clone reads the source and writes the destination (factor 2).
        bytes_transferred = size_mb * 1024 * 1024 * 2 * iterations
        bandwidth = bytes_transferred / elapsed / (1024 ** 3)
        return {
            "test": "gpu_memory_bandwidth",
            "description": f"GPU memory bandwidth ({size_mb}MB)",
            "size_mb": size_mb,
            "duration_seconds": round(elapsed, 3),
            "iterations": iterations,
            "bandwidth_gb_s": round(bandwidth, 2),
            "score": round(bandwidth, 2),
        }
    except Exception as e:
        # Best-effort: surface the failure to the caller instead of raising.
        return {"error": str(e)}
def benchmark_gpu_fp32(matrix_size: int = 4096) -> Optional[Dict[str, Any]]:
    """Measure GPU FP32 throughput with square matrix multiplications.

    Args:
        matrix_size: Side length of the square matrices to multiply.

    Returns:
        A dict with TFLOPS statistics and a ``score`` field,
        ``{"error": ...}`` when the measurement fails, or ``None`` when
        CUDA is unavailable.
    """
    if not check_cuda_available():
        return None
    try:
        import torch
        device = torch.device('cuda')
        a = torch.randn(matrix_size, matrix_size, dtype=torch.float32, device=device)
        b = torch.randn(matrix_size, matrix_size, dtype=torch.float32, device=device)

        # Warm up so kernel selection/JIT costs are excluded from timing.
        for _ in range(5):
            _ = torch.mm(a, b)
        torch.cuda.synchronize()

        # perf_counter is monotonic, unlike time.time(), so wall-clock
        # adjustments cannot corrupt the measurement window.
        start_time = time.perf_counter()
        iterations = 0
        while time.perf_counter() - start_time < 3.0:
            _ = torch.mm(a, b)
            iterations += 1
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time

        # One n x n matmul costs 2*n^3 floating-point operations.
        flops_per_matmul = 2 * (matrix_size ** 3)
        total_flops = flops_per_matmul * iterations
        tflops = total_flops / elapsed / 1e12
        return {
            "test": "gpu_fp32",
            "description": f"GPU FP32 compute ({matrix_size}x{matrix_size} matmul)",
            "matrix_size": matrix_size,
            "duration_seconds": round(elapsed, 3),
            "iterations": iterations,
            "tflops": round(tflops, 3),
            "score": round(tflops * 10, 2),
        }
    except Exception as e:
        # Best-effort: surface the failure to the caller instead of raising.
        return {"error": str(e)}
def benchmark_gpu_fp16(matrix_size: int = 4096) -> Optional[Dict[str, Any]]:
    """Measure GPU FP16 throughput with square matrix multiplications.

    Args:
        matrix_size: Side length of the square matrices to multiply.

    Returns:
        A dict with TFLOPS statistics and a ``score`` field,
        ``{"error": ...}`` when the measurement fails, or ``None`` when
        CUDA is unavailable.
    """
    if not check_cuda_available():
        return None
    try:
        import torch
        device = torch.device('cuda')
        a = torch.randn(matrix_size, matrix_size, dtype=torch.float16, device=device)
        b = torch.randn(matrix_size, matrix_size, dtype=torch.float16, device=device)

        # Warm up so kernel selection/JIT costs are excluded from timing.
        for _ in range(5):
            _ = torch.mm(a, b)
        torch.cuda.synchronize()

        # perf_counter is monotonic, unlike time.time(), so wall-clock
        # adjustments cannot corrupt the measurement window.
        start_time = time.perf_counter()
        iterations = 0
        while time.perf_counter() - start_time < 3.0:
            _ = torch.mm(a, b)
            iterations += 1
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time

        # One n x n matmul costs 2*n^3 floating-point operations.
        flops_per_matmul = 2 * (matrix_size ** 3)
        total_flops = flops_per_matmul * iterations
        tflops = total_flops / elapsed / 1e12
        return {
            "test": "gpu_fp16",
            "description": f"GPU FP16 compute ({matrix_size}x{matrix_size} matmul)",
            "matrix_size": matrix_size,
            "duration_seconds": round(elapsed, 3),
            "iterations": iterations,
            "tflops": round(tflops, 3),
            "score": round(tflops * 5, 2),
        }
    except Exception as e:
        # Best-effort: surface the failure to the caller instead of raising.
        return {"error": str(e)}
def benchmark_gpu_tensor_cores(matrix_size: int = 4096) -> Optional[Dict[str, Any]]:
    """Measure mixed-precision (Tensor Core) throughput via FP16 matmuls.

    Requires a GPU with compute capability >= 7.0 (Volta or newer).

    Args:
        matrix_size: Side length of the square matrices to multiply.

    Returns:
        A dict with TFLOPS statistics and a ``score`` field,
        ``{"error": ...}`` when Tensor Cores are missing or the measurement
        fails, or ``None`` when CUDA is unavailable.
    """
    if not check_cuda_available():
        return None
    try:
        import torch
        # Tensor Cores shipped with compute capability 7.0 (Volta).
        if not hasattr(torch.cuda, 'amp') or torch.cuda.get_device_capability()[0] < 7:
            return {"error": "Tensor Cores not available (requires compute capability >= 7.0)"}
        device = torch.device('cuda')
        a = torch.randn(matrix_size, matrix_size, dtype=torch.float16, device=device)
        b = torch.randn(matrix_size, matrix_size, dtype=torch.float16, device=device)

        # Warm up under autocast so kernel selection is excluded from timing.
        # NOTE: torch.cuda.amp.autocast is deprecated in favor of
        # torch.amp.autocast("cuda") but kept here for older-torch compat.
        with torch.cuda.amp.autocast():
            for _ in range(5):
                _ = torch.mm(a, b)
        torch.cuda.synchronize()

        # perf_counter is monotonic, unlike time.time(), so wall-clock
        # adjustments cannot corrupt the measurement window.
        start_time = time.perf_counter()
        iterations = 0
        with torch.cuda.amp.autocast():
            while time.perf_counter() - start_time < 3.0:
                _ = torch.mm(a, b)
                iterations += 1
        torch.cuda.synchronize()
        elapsed = time.perf_counter() - start_time

        # One n x n matmul costs 2*n^3 floating-point operations.
        flops_per_matmul = 2 * (matrix_size ** 3)
        total_flops = flops_per_matmul * iterations
        tflops = total_flops / elapsed / 1e12
        return {
            "test": "gpu_tensor_cores",
            "description": f"GPU Tensor Cores mixed precision ({matrix_size}x{matrix_size})",
            "matrix_size": matrix_size,
            "duration_seconds": round(elapsed, 3),
            "iterations": iterations,
            "tflops": round(tflops, 3),
            "score": round(tflops * 3, 2),
        }
    except Exception as e:
        # Best-effort: surface the failure to the caller instead of raising.
        return {"error": str(e)}
def run_all_gpu_benchmarks() -> Optional[Dict[str, Any]]:
    """Run every GPU benchmark and aggregate the results.

    Returns:
        A dict keyed by benchmark name plus a ``total_score`` entry summing
        the individual scores (error results excluded), or ``None`` when no
        GPU is present or no benchmark produced a result.
    """
    if not check_gpu_available():
        return None

    # Table-driven dispatch: (result key, benchmark callable).
    suites = (
        ("memory_bandwidth", benchmark_gpu_memory_bandwidth),
        ("fp32", benchmark_gpu_fp32),
        ("fp16", benchmark_gpu_fp16),
        ("tensor_cores", benchmark_gpu_tensor_cores),
    )

    results: Dict[str, Any] = {}
    for key, bench in suites:
        outcome = bench()
        if outcome:
            results[key] = outcome

    if not results:
        return None

    # Aggregate score, skipping entries that reported an error.
    total = sum(
        entry.get("score", 0)
        for entry in results.values()
        if isinstance(entry, dict) and "error" not in entry
    )
    results["total_score"] = round(total, 2)
    return results