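"""GPU availability test app for a Hugging Face Space.

Builds a small Gradio UI that reports CUDA status, times a simple
matmul on the GPU, probes how much GPU memory can be allocated, and
shows the Space's runtime stage.
"""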
import gradio as gr
import torch
import subprocess
import os
import time
from huggingface_hub import HfApi, SpaceStage
from huggingface_hub.utils import RepositoryNotFoundError
import spaces
@spaces.GPU
def check_gpu():
    results = []

    # Timestamp so repeated runs can be told apart
    results.append(f"Test run at: {time.strftime('%Y-%m-%d %H:%M:%S')}")

    # Check PyTorch CUDA availability
    results.append(f"PyTorch version: {torch.__version__}")
    results.append(f"CUDA available: {torch.cuda.is_available()}")

    if torch.cuda.is_available():
        results.append(f"CUDA version: {torch.version.cuda}")
        results.append(f"GPU count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            results.append(f"GPU {i}: {props.name}")
            results.append(f" - Total memory: {props.total_memory / 1024**3:.2f} GB")
            results.append(f" - Compute capability: {props.major}.{props.minor}")

        # Time a simple CUDA operation
        try:
            x = torch.rand(1000, 1000, device="cuda")
            y = torch.rand(1000, 1000, device="cuda")
            start_time = time.time()
            z = torch.matmul(x, y)
            torch.cuda.synchronize()  # Wait for the kernel to finish before timing
            end_time = time.time()
            results.append(f"Matrix multiplication test: {(end_time - start_time) * 1000:.2f} ms")
            results.append("CUDA operations working correctly ✅")
        except Exception as e:
            results.append(f"CUDA operation failed: {e}")

    # Try nvidia-smi
    try:
        nvidia_smi = subprocess.check_output(["nvidia-smi"]).decode()
        results.append("\nNVIDIA-SMI output:")
        results.append(nvidia_smi)
    except Exception as e:
        results.append(f"nvidia-smi error: {e}")

    # Check environment variables
    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "Not set")
    results.append(f"\nCUDA_VISIBLE_DEVICES: {cuda_visible_devices}")

    return "\n".join(results)
@spaces.GPU
def test_memory_allocation():
    try:
        # Probe how much GPU memory we can allocate
        max_memory = 0
        result = ""
        tensors = []
        for size in [100, 500, 1000, 2000, 4000, 8000]:
            try:
                # Allocate square float32 tensors of increasing size
                tensor = torch.rand(size, size, device="cuda")
                tensors.append(tensor)
                memory_allocated = torch.cuda.memory_allocated() / 1024**3  # bytes -> GB
                max_memory = memory_allocated
                result = f"Successfully allocated {size}x{size} tensor. Total memory: {memory_allocated:.2f} GB"
            except Exception as e:
                result = f"Failed to allocate {size}x{size} tensor: {e}"
                break

        # Clean up so the test tensors don't linger on the device
        del tensors
        torch.cuda.empty_cache()

        return f"Maximum GPU memory allocated: {max_memory:.2f} GB\n{result}"
    except Exception as e:
        return f"Memory test failed: {e}"
def get_space_status():
    try:
        # SPACE_ID is set automatically inside a running Space
        space_id = os.environ.get("SPACE_ID")
        if space_id is None:
            return "Not running in a Space"
        stage = HfApi().get_space_runtime(space_id).stage
        if stage == SpaceStage.RUNNING:
            status = "Space is running"
        elif stage == SpaceStage.BUILDING:
            status = "Space is building"
        else:
            status = f"Space is in stage: {stage}"
    except RepositoryNotFoundError:
        status = "Not running in a Space"
    except Exception as e:
        status = f"Error getting Space stage: {e}"
    return status
# Create the Gradio interface
with gr.Blocks(title="GPU Test") as demo:
    gr.Markdown("# GPU Availability Test")
    gr.Markdown("This app checks whether GPU/CUDA is available and working in this Hugging Face Space.")

    # Display the Space status
    status_text = gr.Markdown(f"**Space Status**: {get_space_status()}")

    with gr.Tab("Basic GPU Test"):
        check_btn = gr.Button("Check GPU Status", variant="primary")
        output = gr.Textbox(label="Results", lines=20)
        check_btn.click(fn=check_gpu, outputs=output)

    with gr.Tab("Memory Test"):
        memory_btn = gr.Button("Test GPU Memory Allocation", variant="primary")
        memory_output = gr.Textbox(label="Memory Test Results", lines=5)
        memory_btn.click(fn=test_memory_allocation, outputs=memory_output)

    # Auto-run the basic check on page load
    demo.load(fn=check_gpu, outputs=output)

demo.launch()