import gradio as gr
import torch
import subprocess
import os
import time
from huggingface_hub import HfApi, SpaceStage
from huggingface_hub.utils import RepositoryNotFoundError
import spaces

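# Note: @spaces.GPU is the ZeroGPU decorator from the `spaces` package. On ZeroGPU
# hardware it attaches a GPU for the duration of the decorated call; on Spaces with
# a dedicated GPU it has no effect, so the check below works on both.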
@spaces.GPU
def check_gpu():
    results = []
    
    # Add timestamp
    results.append(f"Test run at: {time.strftime('%Y-%m-%d %H:%M:%S')}")
    
    # Check PyTorch CUDA availability
    results.append(f"PyTorch version: {torch.__version__}")
    results.append(f"CUDA available: {torch.cuda.is_available()}")
    
    if torch.cuda.is_available():
        results.append(f"CUDA version: {torch.version.cuda}")
        results.append(f"GPU count: {torch.cuda.device_count()}")
        for i in range(torch.cuda.device_count()):
            props = torch.cuda.get_device_properties(i)
            results.append(f"GPU {i}: {props.name}")
            results.append(f"  - Total memory: {props.total_memory / 1024**3:.2f} GB")
            results.append(f"  - Compute capability: {props.major}.{props.minor}")
        
        # Test a simple CUDA operation
        try:
            x = torch.rand(1000, 1000, device="cuda")
            y = torch.rand(1000, 1000, device="cuda")
            start_time = time.time()
            z = torch.matmul(x, y)
            torch.cuda.synchronize()  # Wait for operation to complete
            end_time = time.time()
            results.append(f"Matrix multiplication test: {(end_time - start_time)*1000:.2f} ms")
            results.append("CUDA operations working correctly ✅")
        except Exception as e:
            results.append(f"CUDA operation failed: {e}")
    
    # Try nvidia-smi
    try:
        nvidia_smi = subprocess.check_output("nvidia-smi", shell=True).decode()
        results.append("\nNVIDIA-SMI output:")
        results.append(nvidia_smi)
    except Exception as e:
        results.append(f"nvidia-smi error: {e}")
    
    # Check environment variables
    cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", "Not set")
    results.append(f"\nCUDA_VISIBLE_DEVICES: {cuda_visible_devices}")
    
    return "\n".join(results)

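# Rough sizing note: a float32 tensor of shape (N, N) uses N*N*4 bytes, so even the
# largest allocation below (8000x8000) is only ~0.24 GB; increase the sizes if you
# want to probe more of the card's memory.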
@spaces.GPU
def test_memory_allocation():
    try:
        # See how much GPU memory we can allocate
        max_memory = 0
        tensors = []
        
        for size in [100, 500, 1000, 2000, 4000, 8000]:
            try:
                # Try to allocate a tensor of increasing size
                tensor = torch.rand(size, size, device="cuda")
                tensors.append(tensor)
                memory_allocated = torch.cuda.memory_allocated() / (1024**3)  # Convert to GB
                max_memory = memory_allocated
                result = f"Successfully allocated {size}x{size} tensor. Total memory: {memory_allocated:.2f} GB"
            except Exception as e:
                result = f"Failed to allocate {size}x{size} tensor: {e}"
                break
                
        # Clean up
        tensors = None
        torch.cuda.empty_cache()
        
        return f"Maximum GPU memory allocated: {max_memory:.2f} GB\n{result}"
    except Exception as e:
        return f"Memory test failed: {e}"

def get_space_status():
    # SPACE_ID is set automatically inside a running Hugging Face Space
    space_id = os.environ.get("SPACE_ID")
    if not space_id:
        return "Not running in a Space"
    try:
        stage = HfApi().get_space_runtime(space_id).stage
        if stage == SpaceStage.RUNNING:
            status = "Space is running"
        elif stage == SpaceStage.BUILDING:
            status = "Space is building"
        else:
            status = f"Space is in stage: {stage}"
    except RepositoryNotFoundError:
        status = "Space repository not found"
    except Exception as e:
        status = f"Error getting Space stage: {e}"
    
    return status

# Create the Gradio interface
with gr.Blocks(title="GPU Test") as demo:
    gr.Markdown("# GPU Availability Test")
    gr.Markdown("This app checks if GPU/CUDA is available and working in this Hugging Face Space")
    
    # Display Space status
    status_text = gr.Markdown(f"**Space Status**: {get_space_status()}")
    
    with gr.Tab("Basic GPU Test"):
        check_btn = gr.Button("Check GPU Status", variant="primary")
        output = gr.Textbox(label="Results", lines=20)
        check_btn.click(fn=check_gpu, outputs=output)
    
    with gr.Tab("Memory Test"):
        memory_btn = gr.Button("Test GPU Memory Allocation", variant="primary")
        memory_output = gr.Textbox(label="Memory Test Results", lines=5)
        memory_btn.click(fn=test_memory_allocation, outputs=memory_output)
    
    # Auto-run the check on page load
    demo.load(fn=check_gpu, outputs=output)

demo.launch()