import gradio as gr
import torch
import subprocess
import os
import time
from huggingface_hub import HfApi, SpaceStage
from huggingface_hub.utils import RepositoryNotFoundError
import spaces

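# Note: @spaces.GPU is the ZeroGPU decorator from the `spaces` package. On ZeroGPU
# hardware it attaches a GPU for the duration of the decorated call; on Spaces with
# a dedicated GPU it has no effect, so the check below works on both.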
@spaces.GPU
def check_gpu():
    results = []
    
    # Add timestamp
    results.append(f"Test run at: {time.strftime('%Y-%m-%d %H:%M:%S')}")
    
    # Check PyTorch CUDA availability
    results.append(f"PyTorch version: {torch.__version__}")
    results.append(f"CUDA available: {torch.cuda.is_available()}")
    
    if torch.cuda.is_available():
        results.append(f"CUDA version: {torch.version.cuda}")
        results.append(f"GPU count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            results.append(f"GPU {i}: {props.name}")
            results.append(f"  - Total memory: {props.total_memory / 1024**3:.2f} GB")
            results.append(f"  - Compute capability: {props.major}.{props.minor}")
        
        # Test a simple CUDA operation
        try:
            x = torch.rand(1000, 1000, device="cuda")
            y = torch.rand(1000, 1000, device="cuda")
            start_time = time.time()
            z = torch.matmul(x, y)
            torch.cuda.synchronize()  # Wait for operation to complete
            end_time = time.time()
            results.append(f"Matrix multiplication test: {(end_time - start_time)*1000:.2f} ms")
            results.append("CUDA operations working correctly ✅")
        except Exception as e:
            results.append(f"CUDA operation failed: {e}")
    
    # Try nvidia-smi
    try:
        nvidia_smi = subprocess.check_output("nvidia-smi", shell=True).decode()
        results.append("\nNVIDIA-SMI output:")
        results.append(nvidia_smi)
    except Exception as e:
        results.append(f"nvidia-smi error: {e}")
    
    # Check environment variables
    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "Not set")
    results.append(f"\nCUDA_VISIBLE_DEVICES: {cuda_visible_devices}")
    
    return "\n".join(results)

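# Rough sizing note: a float32 tensor of shape (N, N) uses N*N*4 bytes, so even the
# largest allocation below (8000x8000) is only ~0.24 GB; increase the sizes if you
# want to probe more of the card's memory.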
@spaces.GPU
def test_memory_allocation():
    try:
        # See how much GPU memory we can allocate
        max_memory = 0
        tensors = []
        
        for size in [100, 500, 1000, 2000, 4000, 8000]:
            try:
                # Try to allocate a tensor of increasing size
                tensor = torch.rand(size, size, device="cuda")
                tensors.append(tensor)
                memory_allocated = torch.cuda.memory_allocated() / (1024**3)  # Convert to GB
                max_memory = memory_allocated
                result = f"Successfully allocated {size}x{size} tensor. Total memory: {memory_allocated:.2f} GB"
            except Exception as e:
                result = f"Failed to allocate {size}x{size} tensor: {e}"
                break
                
        # Clean up
        tensors = None
        torch.cuda.empty_cache()
        
        return f"Maximum GPU memory allocated: {max_memory:.2f} GB\n{result}"
    except Exception as e:
        return f"Memory test failed: {e}"

def get_space_status():
    # SPACE_ID is set automatically inside a running Hugging Face Space
    space_id = os.environ.get("SPACE_ID")
    if not space_id:
        return "Not running in a Space"
    try:
        stage = HfApi().get_space_runtime(space_id).stage
        if stage == SpaceStage.RUNNING:
            status = "Space is running"
        elif stage == SpaceStage.BUILDING:
            status = "Space is building"
        else:
            status = f"Space is in stage: {stage}"
    except RepositoryNotFoundError:
        status = "Space repository not found"
    except Exception as e:
        status = f"Error getting Space stage: {e}"
    
    return status

# Create the Gradio interface
with gr.Blocks(title="GPU Test") as demo:
    gr.Markdown("# GPU Availability Test")
    gr.Markdown("This app checks if GPU/CUDA is available and working in this Hugging Face Space")
    
    # Display Space status
    status_text = gr.Markdown(f"**Space Status**: {get_space_status()}")
    
    with gr.Tab("Basic GPU Test"):
        check_btn = gr.Button("Check GPU Status", variant="primary")
        output = gr.Textbox(label="Results", lines=20)
        check_btn.click(fn=check_gpu, outputs=output)
    
    with gr.Tab("Memory Test"):
        memory_btn = gr.Button("Test GPU Memory Allocation", variant="primary")
        memory_output = gr.Textbox(label="Memory Test Results", lines=5)
        memory_btn.click(fn=test_memory_allocation, outputs=memory_output)
    
    # Auto-run the check on page load
    demo.load(fn=check_gpu, outputs=output)

demo.launch()