Spaces:
Configuration error
Configuration error
#!/usr/bin/env python3
"""
GPU Diagnostics Tool for Hugging Face Spaces
This script performs a comprehensive check of GPU availability and functionality.
"""
import os
import sys
import subprocess
import time
import json

# Banner so the diagnostics section is easy to spot in container logs.
print("=" * 80)
print("GPU DIAGNOSTICS TOOL")
print("=" * 80)

# Interpreter version first — identifies the runtime producing this log.
print(f"Python version: {sys.version}")
print("-" * 80)

# Dump the GPU-related environment variables; absent ones show as "Not set".
# NOTE: this list is also read by the web-interface handler further down.
print("ENVIRONMENT VARIABLES:")
gpu_related_vars = [
    "CUDA_VISIBLE_DEVICES",
    "NVIDIA_VISIBLE_DEVICES",
    "PYTORCH_CUDA_ALLOC_CONF",
    "HF_HOME",
]
for var_name in gpu_related_vars:
    print(f"{var_name}: {os.environ.get(var_name, 'Not set')}")
print("-" * 80)
# Probe the NVIDIA driver via nvidia-smi; a successful call means the
# container can see at least one GPU at the driver level.
print("CHECKING FOR NVIDIA-SMI:")
try:
    # timeout keeps the whole diagnostics run from hanging if the driver
    # is wedged (a real failure mode on shared GPU hosts).
    result = subprocess.run(
        ['nvidia-smi'],
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        timeout=30,
    )
    if result.returncode == 0:
        print("nvidia-smi is available and working!")
        print(result.stdout)
    else:
        print("nvidia-smi error:")
        print(result.stderr)
except FileNotFoundError:
    # Most common case on CPU-only images: the binary simply isn't there.
    print("nvidia-smi not found on PATH (no NVIDIA driver visible)")
except subprocess.TimeoutExpired:
    print("nvidia-smi timed out after 30 seconds")
except Exception as e:
    print(f"Error running nvidia-smi: {str(e)}")
print("-" * 80)
# Check PyTorch and CUDA: report versions, enumerate devices, and run a
# small matmul to confirm the device actually computes.
print("CHECKING PYTORCH AND CUDA:")
try:
    import torch

    print(f"PyTorch version: {torch.__version__}")
    cuda_ok = torch.cuda.is_available()
    print(f"CUDA available: {cuda_ok}")
    print(f"CUDA version: {torch.version.cuda if cuda_ok else 'Not available'}")
    if cuda_ok:
        print(f"CUDA device count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            print(f"CUDA Device {i}: {torch.cuda.get_device_name(i)}")
        print(f"Current CUDA device: {torch.cuda.current_device()}")

    # Try to create and operate on a CUDA tensor (CPU fallback otherwise).
    print("\nTesting CUDA tensor creation:")
    try:
        start_time = time.time()
        x = torch.rand(1000, 1000, device="cuda" if cuda_ok else "cpu")
        y = x @ x  # Matrix multiplication to test computation
        # BUG FIX: only synchronize when CUDA exists. torch.cuda.synchronize()
        # raises on CPU-only installs, which previously made the CPU fallback
        # path report a spurious error instead of its timing message.
        if cuda_ok:
            torch.cuda.synchronize()  # Wait for the operation to complete
        end_time = time.time()
        if cuda_ok:
            print(f"Successfully created and operated on a CUDA tensor in {end_time - start_time:.4f} seconds")
        else:
            print(f"Created and operated on a CPU tensor in {end_time - start_time:.4f} seconds (CUDA not available)")
    except Exception as e:
        print(f"Error in tensor creation/operation: {str(e)}")

    # Try to get more detailed CUDA info
    if cuda_ok:
        print("\nDetailed CUDA information:")
        print(f"CUDA capability: {torch.cuda.get_device_capability(0)}")
        print(f"Total GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
        print(f"CUDA arch list: {torch.cuda.get_arch_list() if hasattr(torch.cuda, 'get_arch_list') else 'Not available'}")
except ImportError:
    print("PyTorch is not installed")
print("-" * 80)
# Expose the same diagnostics through a small Gradio web UI, when gradio
# is installed; otherwise fall through and finish as a plain console run.
print("CREATING SIMPLE GPU TEST WEB INTERFACE...")
try:
    import gradio as gr

    def check_gpu():
        """Run the GPU probes on demand and return pretty-printed JSON."""
        report = {
            "python_version": sys.version,
            "environment_vars": {
                name: os.environ.get(name, "Not set") for name in gpu_related_vars
            },
            "torch_available": False,
            "cuda_available": False,
        }
        try:
            import torch

            report["torch_available"] = True
            report["torch_version"] = torch.__version__
            report["cuda_available"] = torch.cuda.is_available()
            if not torch.cuda.is_available():
                report["gpu_test_passed"] = False
            else:
                report["cuda_version"] = torch.version.cuda
                report["cuda_device_count"] = torch.cuda.device_count()
                report["cuda_device_name"] = torch.cuda.get_device_name(0)
                # Time a small matmul on the GPU to prove compute works.
                t_start = time.time()
                mat = torch.rand(1000, 1000, device="cuda")
                _ = mat @ mat
                torch.cuda.synchronize()
                t_end = time.time()
                report["tensor_test_time"] = f"{t_end - t_start:.4f} seconds"
                report["gpu_test_passed"] = True
        except Exception as exc:
            report["error"] = str(exc)
            report["gpu_test_passed"] = False
        return json.dumps(report, indent=2)

    demo = gr.Interface(
        fn=check_gpu,
        inputs=[],
        outputs="text",
        title="GPU Diagnostics",
        description="Click the button to run GPU diagnostics",
    )
    print("Starting Gradio web interface on port 7860...")
    demo.launch(server_name="0.0.0.0")
except ImportError:
    print("Gradio not installed, skipping web interface")
print("Raw GPU diagnostics complete.")
print("-" * 80)