import os
import subprocess
import time

import gradio as gr
import spaces
import torch
from huggingface_hub import HfApi, SpaceStage
from huggingface_hub.utils import RepositoryNotFoundError


@spaces.GPU
def check_gpu():
    results = []

    # Add timestamp
    results.append(f"Test run at: {time.strftime('%Y-%m-%d %H:%M:%S')}")

    # Check PyTorch CUDA availability
    results.append(f"PyTorch version: {torch.__version__}")
    results.append(f"CUDA available: {torch.cuda.is_available()}")

    if torch.cuda.is_available():
        results.append(f"CUDA version: {torch.version.cuda}")
        results.append(f"GPU count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            results.append(f"GPU {i}: {props.name}")
            results.append(f"  - Total memory: {props.total_memory / 1024**3:.2f} GB")
            results.append(f"  - Compute capability: {props.major}.{props.minor}")

        # Time a simple CUDA operation
        try:
            x = torch.rand(1000, 1000, device="cuda")
            y = torch.rand(1000, 1000, device="cuda")
            torch.cuda.synchronize()  # Ensure setup work has finished before timing
            start_time = time.time()
            z = torch.matmul(x, y)
            torch.cuda.synchronize()  # Wait for the kernel to complete
            end_time = time.time()
            results.append(f"Matrix multiplication test: {(end_time - start_time) * 1000:.2f} ms")
            results.append("CUDA operations working correctly ✅")
        except Exception as e:
            results.append(f"CUDA operation failed: {e}")

    # Try nvidia-smi
    try:
        nvidia_smi = subprocess.check_output(["nvidia-smi"]).decode()
        results.append("\nNVIDIA-SMI output:")
        results.append(nvidia_smi)
    except Exception as e:
        results.append(f"nvidia-smi error: {e}")

    # Check environment variables
    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "Not set")
    results.append(f"\nCUDA_VISIBLE_DEVICES: {cuda_visible_devices}")

    return "\n".join(results)


@spaces.GPU
def test_memory_allocation():
    try:
        # See how much GPU memory we can allocate
        max_memory = 0.0
        tensors = []
        result = "No allocations attempted"

        for size in [100, 500, 1000, 2000, 4000, 8000]:
            try:
                # Try to allocate a tensor of increasing size, keeping a
                # reference so earlier allocations are not freed
                tensors.append(torch.rand(size, size, device="cuda"))
                memory_allocated = torch.cuda.memory_allocated() / 1024**3  # GB
                max_memory = memory_allocated
                result = f"Successfully allocated {size}x{size} tensor. Total memory: {memory_allocated:.2f} GB"
            except Exception as e:
                result = f"Failed to allocate {size}x{size} tensor: {e}"
                break

        # Clean up so the test allocations don't linger
        del tensors
        torch.cuda.empty_cache()

        return f"Maximum GPU memory allocated: {max_memory:.2f} GB\n{result}"
    except Exception as e:
        return f"Memory test failed: {e}"


def get_space_status():
    try:
        # SPACE_ID is set automatically inside a running Space
        space_id = os.environ.get("SPACE_ID")
        if space_id is None:
            return "Not running in a Space"
        stage = HfApi().get_space_runtime(space_id).stage
        if stage == SpaceStage.RUNNING:
            status = "Space is running"
        elif stage == SpaceStage.BUILDING:
            status = "Space is building"
        else:
            status = f"Space is in stage: {stage}"
    except RepositoryNotFoundError:
        status = "Not running in a Space"
    except Exception as e:
        status = f"Error getting Space stage: {e}"
    return status


# Create the Gradio interface
with gr.Blocks(title="GPU Test") as demo:
    gr.Markdown("# GPU Availability Test")
    gr.Markdown("This app checks whether GPU/CUDA is available and working in this Hugging Face Space.")

    # Display Space status
    status_text = gr.Markdown(f"**Space Status**: {get_space_status()}")

    with gr.Tab("Basic GPU Test"):
        check_btn = gr.Button("Check GPU Status", variant="primary")
        output = gr.Textbox(label="Results", lines=20)
        check_btn.click(fn=check_gpu, outputs=output)

    with gr.Tab("Memory Test"):
        memory_btn = gr.Button("Test GPU Memory Allocation", variant="primary")
        memory_output = gr.Textbox(label="Memory Test Results", lines=5)
        memory_btn.click(fn=test_memory_allocation, outputs=memory_output)

    # Auto-run the check on page load
    demo.load(fn=check_gpu, outputs=output)

demo.launch()