Spaces:

RajBhope
/

gpu-runtime-predictor

Sleeping

App Files Files Community

RajBhope committed 24 days ago

Commit

9ca80ec

verified ·

1 Parent(s): ee9c452

Upload app.py

Browse files

Files changed (1) hide show

app.py +542 -0

app.py ADDED Viewed

	@@ -0,0 +1,542 @@

+"""
+GPU Runtime Predictor - Gradio Space
+=====================================
+Paste your PyTorch/CUDA code, select GPUs from the catalog,
+and get predicted runtimes for each GPU.
+"""
+import gradio as gr
+import numpy as np
+import pandas as pd
+import torch
+import torch.nn as nn
+import json
+import re
+import pickle
+import os
+from huggingface_hub import hf_hub_download
+# ============================================================================
+# LOAD MODEL ARTIFACTS
+# ============================================================================
MODEL_REPO = "RajBhope/gpu-runtime-predictor"


def download_artifacts():
    """Fetch every model artifact from the Hub repository named by MODEL_REPO.

    Returns:
        dict mapping each artifact filename to the value returned by
        ``hf_hub_download`` for that file.
    """
    artifact_names = (
        'model_gbr.pkl', 'model_rf.pkl', 'model_nn.pt', 'scaler_X.pkl',
        'scaler_params.json', 'gpu_catalog.json', 'nn_config.json', 'metrics.json',
    )
    return {
        name: hf_hub_download(repo_id=MODEL_REPO, filename=name)
        for name in artifact_names
    }
def _load_pickle(path):
    """Unpickle and return the object stored at *path*."""
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file. The artifacts are whatever the downloaded Hub repo contains, so that
    # repo must stay trusted; prefer a non-executable format where possible.
    with open(path, 'rb') as fh:
        return pickle.load(fh)


def _load_json(path):
    """Parse and return the JSON document stored at *path*."""
    # Explicit UTF-8: otherwise open() decodes with the platform locale encoding.
    with open(path, 'r', encoding='utf-8') as fh:
        return json.load(fh)


print("Downloading model artifacts...")
artifact_paths = download_artifacts()
# Load models
model_gbr = _load_pickle(artifact_paths['model_gbr.pkl'])
model_rf = _load_pickle(artifact_paths['model_rf.pkl'])
scaler_X = _load_pickle(artifact_paths['scaler_X.pkl'])
# Load JSON side-car files
scaler_params = _load_json(artifact_paths['scaler_params.json'])
GPU_CATALOG = _load_json(artifact_paths['gpu_catalog.json'])
nn_config = _load_json(artifact_paths['nn_config.json'])
metrics = _load_json(artifact_paths['metrics.json'])
+# Load NN model
class RuntimeMLP(nn.Module):
    """MLP regressor: Linear -> LayerNorm -> GELU -> Dropout stages, then a 1-unit head.

    Args:
        input_dim: width of the input feature vector.
        hidden_dims: output width of each hidden stage, in order. Any iterable
            of ints works; the default is a tuple so no mutable object is
            shared between instances.
        dropout: dropout probability applied after each GELU.
    """

    def __init__(self, input_dim, hidden_dims=(512, 256, 128), dropout=0.15):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.LayerNorm(h_dim),
                nn.GELU(),
                nn.Dropout(dropout),
            ])
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, 1))
        # state_dict keys ("net.<index>.*") derive from this attribute name and
        # the layer order, so both must stay stable for saved weights to load.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run the network on *x* and squeeze the trailing size-1 output dimension."""
        return self.net(x).squeeze(-1)
# Rebuild the network from its saved constructor arguments, then restore weights.
model_nn = RuntimeMLP(**nn_config)
model_nn.load_state_dict(
    torch.load(
        artifact_paths['model_nn.pt'],
        map_location='cpu',
        weights_only=True,
    )
)
model_nn.eval()
# Keys read from each GPU_CATALOG entry, in the order they enter the feature vector.
GPU_FEATURE_COLS = [
    'cuda_cores',
    'tensor_cores',
    'memory_gb',
    'memory_bandwidth_gbps',
    'base_clock_mhz',
    'boost_clock_mhz',
    'sm_count',
    'fp32_tflops',
    'fp16_tflops',
    'tdp_watts',
    'compute_capability',
    'l2_cache_mb',
]
print("Models loaded!")
+# ============================================================================
+# CODE FEATURE EXTRACTION
+# ============================================================================
# Regex for each PyTorch operation family; every entry becomes one binary
# ``has_<name>`` feature.
_PYTORCH_OP_PATTERNS = {
    'matmul': r'torch\.matmul|torch\.mm|@',
    'conv': r'Conv[12]d|conv[12]d',
    'attention': r'attention|Attention|MultiheadAttention|softmax.*matmul',
    'linear': r'nn\.Linear|linear',
    'batchnorm': r'BatchNorm|batchnorm',
    'layernorm': r'LayerNorm|layernorm',
    'softmax': r'softmax|Softmax',
    'relu': r'relu|ReLU',
    'gelu': r'gelu|GELU',
    'sigmoid': r'sigmoid|Sigmoid',
    'tanh': r'tanh|Tanh',
    'dropout': r'Dropout|dropout',
    'embedding': r'Embedding|embedding',
    'pooling': r'Pool|pool|MaxPool|AvgPool',
    'fft': r'fft|FFT',
    'sort': r'torch\.sort',
    'backward': r'backward|grad',
    'loss': r'Loss|loss|CrossEntropy',
    'cat': r'torch\.cat|concatenate',
    'reshape': r'reshape|view|contiguous',
    'transpose': r'transpose|\.t\(\)|permute',
    'reduce': r'torch\.sum|torch\.mean|torch\.max|torch\.min|reduce',
}

# Parenthesised groups of exactly 2, 3 or 4 comma-separated integer literals.
_DIM_TUPLE_PATTERNS = (
    r'\((\d+),\s*(\d+)\)',
    r'\((\d+),\s*(\d+),\s*(\d+)\)',
    r'\((\d+),\s*(\d+),\s*(\d+),\s*(\d+)\)',
)


def _log1p_of_int(value):
    """Return log(1 + value) for a non-negative Python int of any magnitude."""
    import math  # local import keeps this block self-contained

    try:
        # Convert first: np.log1p raises TypeError for ints too large for a
        # 64-bit NumPy integer, which a product of big dimensions can be.
        return np.log1p(float(value))
    except OverflowError:
        # Past float range the "+ 1" is immaterial; math.log accepts any int.
        return np.float64(math.log(value))


def extract_code_features(code_text):
    """Extract numeric features from source code text.

    The result holds size statistics, numeric-literal statistics, one binary
    ``has_<op>`` flag per entry of ``_PYTORCH_OP_PATTERNS``, dtype/device
    flags, keyword counts, shape-tuple statistics and three summed scores.
    ``predict_runtime`` feeds these values to the models ordered by sorted key
    name, so adding, removing or renaming a key changes the model input.

    Args:
        code_text: source code to analyse.

    Returns:
        dict mapping feature name to a numeric value.
    """
    def flag(pattern):
        return 1 if re.search(pattern, code_text) else 0

    def count(pattern):
        return len(re.findall(pattern, code_text))

    features = {}

    # --- size statistics ---
    lines = code_text.strip().split('\n')
    features['num_lines'] = len(lines)
    features['num_chars'] = len(code_text)
    # str.split always yields at least one item, so the mean is well defined.
    features['avg_line_length'] = np.mean([len(line) for line in lines])
    features['num_tokens'] = count(r'[a-zA-Z_]\w*|[0-9]+\.?[0-9]*')

    # --- numeric literals ---
    nums = [float(n) for n in re.findall(r'\b(\d+\.?\d*)\b', code_text)]
    features['num_numeric_literals'] = len(nums)
    features['max_numeric'] = max(nums) if nums else 0
    features['min_numeric'] = min(nums) if nums else 0
    features['mean_numeric'] = np.mean(nums) if nums else 0
    features['sum_numeric_log'] = np.log1p(sum(nums)) if nums else 0
    large_nums = [n for n in nums if n >= 64]
    features['num_large_dims'] = len(large_nums)
    features['product_large_dims_log'] = (
        np.log1p(np.prod(large_nums[:5])) if large_nums else 0
    )

    # --- operation, dtype and device flags ---
    for op_name, pattern in _PYTORCH_OP_PATTERNS.items():
        features[f'has_{op_name}'] = flag(pattern)
    features['uses_float16'] = flag(r'float16|half|fp16')
    features['uses_float32'] = flag(r'float32|float(?!16)')
    features['uses_cuda'] = flag(r"'cuda'|\.cuda\(\)|device='cuda'")

    # --- keyword / call counts ---
    features['num_for_loops'] = count(r'\bfor\b')
    features['num_function_defs'] = count(r'\bdef\b')
    features['num_class_defs'] = count(r'\bclass\b')
    features['num_imports'] = count(r'\bimport\b')
    features['num_torch_calls'] = count(r'torch\.')
    features['num_nn_calls'] = count(r'nn\.')

    # --- shape-like integer tuples ---
    all_dims = []
    for pattern in _DIM_TUPLE_PATTERNS:
        for match in re.finditer(pattern, code_text):
            all_dims.extend(int(group) for group in match.groups())
    features['num_dim_specs'] = len(all_dims)
    features['max_dim'] = max(all_dims) if all_dims else 0
    features['total_elements_log'] = 0
    if all_dims:
        # Largest element count over every parenthesised integer group of
        # length >= 2 (any length, not only the 2-4 tuples counted above).
        for literal in re.findall(r'\([\d,\s]+\)', code_text):
            dims = [int(d) for d in re.findall(r'\d+', literal)]
            if len(dims) >= 2:
                element_count = 1
                for d in dims:
                    element_count *= d
                features['total_elements_log'] = max(
                    features['total_elements_log'], _log1p_of_int(element_count)
                )

    # --- summed scores ---
    features['compute_bound_score'] = (
        features['has_matmul'] + features['has_conv'] + features['has_linear']
    )
    features['memory_bound_score'] = (
        features['has_embedding'] + features['has_cat']
        + features['has_transpose'] + features['has_relu']
    )
    features['mixed_score'] = (
        features['has_attention'] + features['has_batchnorm'] + features['has_layernorm']
    )
    return features
def estimate_flops_and_memory(code_text):
    """Heuristically estimate FLOPs and memory traffic from integer literals.

    The first operation family found in the text (matmul, then Conv1d/Conv2d,
    attention, nn.Linear) selects the formula; its operands are the leading
    positive integer literals of the text in order of appearance. When no
    formula yields a non-zero FLOP count, the product of the (up to) four
    largest literals >= 32 is used instead.

    Args:
        code_text: source code to analyse.

    Returns:
        Tuple ``(flops, memory_bytes, dtype_bytes)``. ``dtype_bytes`` is 2 when
        the text mentions ``float16`` or ``half`` and 4 otherwise; both
        estimates are 0 when no usable literal is found.
    """
    nums = [
        value
        for value in map(int, re.findall(r'\b(\d+)\b', code_text))
        if value > 0
    ]
    dtype_bytes = 2 if re.search(r'float16|half', code_text) else 4
    flops = 0
    memory = 0

    if re.search(r'matmul|torch\.mm|@', code_text):
        # Matrix multiplication: first three literals >= 8 taken as M, K, N.
        dims = [n for n in nums if n >= 8]
        if len(dims) >= 3:
            M, K, N = dims[:3]
            flops = 2 * M * N * K
            memory = dtype_bytes * (M * K + K * N + M * N)
    elif re.search(r'Conv[12]d', code_text):
        # Convolution: first five positive literals taken as
        # batch, in_channels, out_channels, spatial size (H == W), kernel size.
        if len(nums) >= 5:
            batch, in_ch, out_ch, H, ks = nums[:5]
            W = H
            flops = 2 * batch * out_ch * H * W * in_ch * ks * ks
            memory = dtype_bytes * (
                batch * in_ch * H * W + out_ch * in_ch * ks * ks + batch * out_ch * H * W
            )
    elif re.search(r'attention|Attention', code_text):
        # Attention: first three literals >= 4 taken as batch, seq_len, hidden.
        dims = [n for n in nums if n >= 4]
        if len(dims) >= 3:
            batch, seq_len, hidden = dims[:3]
            flops = 4 * batch * seq_len * seq_len * hidden
            memory = dtype_bytes * batch * 3 * seq_len * hidden * 2
    elif re.search(r'nn\.Linear', code_text):
        # Linear: first two literals >= 8 are in/out features; a third, when
        # present, is the batch size (otherwise 1).
        dims = [n for n in nums if n >= 8]
        if len(dims) >= 2:
            in_f, out_f = dims[:2]
            batch = dims[2] if len(dims) > 2 else 1
            flops = 2 * batch * in_f * out_f
            memory = dtype_bytes * (batch * in_f + in_f * out_f + batch * out_f)

    # Generic fallback: estimate from the largest literals.
    if flops == 0:
        largest = sorted((n for n in nums if n >= 32), reverse=True)[:4]
        if largest:
            total_elements = 1
            for n in largest:
                total_elements *= n
            flops = total_elements * 2
            memory = dtype_bytes * total_elements * 2
    return flops, memory, dtype_bytes
def _predict_log_runtime(features_scaled, model_choice):
    """Return the selected model's raw prediction for one scaled feature row."""
    def nn_prediction():
        with torch.no_grad():
            return model_nn(torch.tensor(features_scaled, dtype=torch.float32)).item()

    if model_choice == "GBR":
        return model_gbr.predict(features_scaled)[0]
    if model_choice == "Random Forest":
        return model_rf.predict(features_scaled)[0]
    if model_choice == "Neural Net":
        return nn_prediction()
    # Any other value (including "Ensemble") blends the three models.
    pred_gbr = model_gbr.predict(features_scaled)[0]
    pred_rf = model_rf.predict(features_scaled)[0]
    pred_nn = nn_prediction()
    return 0.5 * pred_gbr + 0.3 * pred_rf + 0.2 * pred_nn


def predict_runtime(code_text, selected_gpus, model_choice="Ensemble"):
    """Predict runtime for code on selected GPUs.

    Args:
        code_text: source code to analyse; ``None`` or blank text is rejected.
        selected_gpus: iterable of GPU_CATALOG keys; unknown keys are skipped.
        model_choice: "GBR", "Random Forest" or "Neural Net"; any other value
            uses the weighted ensemble of all three.

    Returns:
        Tuple ``(summary_markdown, results_dataframe)``. On invalid input the
        first item is a warning message and the second is ``None``. Rows of
        the dataframe are sorted from fastest to slowest.
    """
    # The input may be None as well as an empty string; treat both as "no code".
    if not code_text or not code_text.strip():
        return "⚠️ Please paste some code.", None
    if not selected_gpus:
        return "⚠️ Please select at least one GPU.", None

    # Code features, ordered by sorted feature name
    code_feats = extract_code_features(code_text)
    code_feat_vec = [code_feats[name] for name in sorted(code_feats)]

    # FLOPs / memory estimates; identical for every GPU, so computed once
    flops, memory_bytes, dtype_bytes = estimate_flops_and_memory(code_text)
    arithmetic_intensity = flops / max(memory_bytes, 1)
    # float() first: the estimates are unbounded Python ints and np.log1p
    # raises TypeError for ints too large for a 64-bit NumPy integer.
    extra_feats = [
        np.log1p(float(flops)),
        np.log1p(float(memory_bytes)),
        arithmetic_intensity,
        dtype_bytes,
    ]

    results = []
    for gpu_key in selected_gpus:
        gpu_spec = GPU_CATALOG.get(gpu_key)
        if gpu_spec is None:
            continue
        gpu_feat_vec = [gpu_spec[col] for col in GPU_FEATURE_COLS]
        all_feats = np.array(
            code_feat_vec + gpu_feat_vec + extra_feats, dtype=np.float32
        ).reshape(1, -1)
        # Normalize, then zero out any non-finite values left after scaling
        all_feats_scaled = scaler_X.transform(all_feats)
        all_feats_scaled = np.nan_to_num(all_feats_scaled, nan=0.0, posinf=0.0, neginf=0.0)

        pred_log = _predict_log_runtime(all_feats_scaled, model_choice)
        # Prediction is inverted with expm1 and floored at 0.001 ms
        runtime_ms = max(np.expm1(pred_log), 0.001)
        results.append({
            'GPU': gpu_spec['name'],
            'Runtime (ms)': round(runtime_ms, 4),
            'FP32 TFLOPS': gpu_spec['fp32_tflops'],
            'Mem BW (GB/s)': gpu_spec['memory_bandwidth_gbps'],
            'VRAM (GB)': gpu_spec['memory_gb'],
            'Relative Speed': None,
        })
    if not results:
        return "⚠️ No valid GPUs selected.", None

    # Sort by runtime; the ratio column is runtime relative to the fastest GPU
    results.sort(key=lambda row: row['Runtime (ms)'])
    fastest = results[0]
    slowest = results[-1]
    for row in results:
        row['Relative Speed'] = f"{row['Runtime (ms)'] / fastest['Runtime (ms)']:.2f}x"
    df_results = pd.DataFrame(results)

    if arithmetic_intensity > 10:
        verdict = "🔥 **Compute-bound** workload — faster GPUs with more TFLOPS will help most"
    else:
        verdict = "💾 **Memory-bound** workload — GPUs with higher memory bandwidth will help most"
    summary = "".join([
        f"### 🏆 Fastest: **{fastest['GPU']}** ({fastest['Runtime (ms)']:.4f} ms)\n",
        f"### 🐢 Slowest: **{slowest['GPU']}** ({slowest['Runtime (ms)']:.4f} ms)\n",
        f"### ⚡ Speedup: **{slowest['Runtime (ms)']/fastest['Runtime (ms)']:.1f}x** (fastest vs slowest)\n\n",
        f"**Estimated FLOPs:** {flops:,.0f}\n\n",
        f"**Estimated Memory:** {memory_bytes:,.0f} bytes\n\n",
        f"**Arithmetic Intensity:** {arithmetic_intensity:.2f} FLOP/byte\n\n",
        verdict,
    ])
    return summary, df_results
+# ============================================================================
+# EXAMPLE CODES
+# ============================================================================
# Each example snippet lives in its own constant; EXAMPLE_CODES below maps the
# label shown in the UI to the snippet, in display order.
_MATMUL_EXAMPLE = """import torch
def matmul_kernel(A, B):
    # Matrix multiplication: (2048, 2048) x (2048, 2048) -> (2048, 2048)
    C = torch.matmul(A, B)
    return C
A = torch.randn(2048, 2048, dtype=torch.float32, device='cuda')
B = torch.randn(2048, 2048, dtype=torch.float32, device='cuda')
C = matmul_kernel(A, B)
torch.cuda.synchronize()"""

_SELF_ATTENTION_EXAMPLE = """import torch
import torch.nn.functional as F
def self_attention(Q, K, V, num_heads=16):
    B, S, D = Q.shape
    head_dim = D // num_heads
    Q = Q.view(B, S, num_heads, head_dim).transpose(1, 2)
    K = K.view(B, S, num_heads, head_dim).transpose(1, 2)
    V = V.view(B, S, num_heads, head_dim).transpose(1, 2)
    attn = torch.matmul(Q, K.transpose(-2, -1)) / (head_dim ** 0.5)
    attn = F.softmax(attn, dim=-1)
    out = torch.matmul(attn, V)
    return out.transpose(1, 2).contiguous().view(B, S, D)
hidden_dim = 1024
Q = torch.randn(8, 1024, hidden_dim, dtype=torch.float32, device='cuda')
K = torch.randn(8, 1024, hidden_dim, dtype=torch.float32, device='cuda')
V = torch.randn(8, 1024, hidden_dim, dtype=torch.float32, device='cuda')
out = self_attention(Q, K, V)
torch.cuda.synchronize()"""

_CONV2D_EXAMPLE = """import torch
import torch.nn as nn
def conv2d_forward(x, conv):
    # Conv2D: batch=16, in_channels=256, out_channels=512
    # Input: (16, 256, 56, 56), Kernel: 3x3
    return conv(x)
conv = nn.Conv2d(256, 512, kernel_size=3, padding=1).to('cuda')
x = torch.randn(16, 256, 56, 56, dtype=torch.float32, device='cuda')
out = conv2d_forward(x, conv)
torch.cuda.synchronize()"""

_TRANSFORMER_EXAMPLE = """import torch
import torch.nn as nn
class TransformerBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.attn = nn.MultiheadAttention(768, 12, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(768, 3072),
            nn.GELU(),
            nn.Linear(3072, 768)
        )
        self.ln1 = nn.LayerNorm(768)
        self.ln2 = nn.LayerNorm(768)
    def forward(self, x):
        attn_out, _ = self.attn(self.ln1(x), self.ln1(x), self.ln1(x))
        x = x + attn_out
        x = x + self.ff(self.ln2(x))
        return x
block = TransformerBlock().to('cuda')
x = torch.randn(8, 512, 768, dtype=torch.float32, device='cuda')
out = block(x)
torch.cuda.synchronize()"""

_ELEMENTWISE_GELU_EXAMPLE = """import torch
def elementwise_op(x):
    # Elementwise gelu on tensor of size 100000000
    return torch.nn.functional.gelu(x)
x = torch.randn(100000000, dtype=torch.float32, device='cuda')
out = elementwise_op(x)
torch.cuda.synchronize()"""

_LLM_LINEAR_EXAMPLE = """import torch
import torch.nn as nn
def linear_forward(x, linear):
    # Linear layer: (32, 4096) -> (32, 50257)
    return linear(x)
linear = nn.Linear(4096, 50257).to('cuda')
x = torch.randn(32, 4096, dtype=torch.float16, device='cuda')
out = linear_forward(x, linear)
torch.cuda.synchronize()"""

EXAMPLE_CODES = {
    "Matrix Multiplication (2048x2048)": _MATMUL_EXAMPLE,
    "Self-Attention (batch=8, seq=1024)": _SELF_ATTENTION_EXAMPLE,
    "Conv2D ResNet Block": _CONV2D_EXAMPLE,
    "Transformer Block": _TRANSFORMER_EXAMPLE,
    "Elementwise GELU (100M elements)": _ELEMENTWISE_GELU_EXAMPLE,
    "LLM Linear Layer (fp16, vocab=50257)": _LLM_LINEAR_EXAMPLE,
}
+# ============================================================================
+# GRADIO UI
+# ============================================================================
# Catalog keys in catalog order, plus a key -> display-name lookup for the UI.
gpu_choices = [gpu_key for gpu_key in GPU_CATALOG]
gpu_display_names = {
    gpu_key: gpu_spec['name'] for gpu_key, gpu_spec in GPU_CATALOG.items()
}
def load_example(example_name):
    """Return the snippet stored under *example_name*, or "" when there is none."""
    try:
        return EXAMPLE_CODES[example_name]
    except KeyError:
        return ""
# Static markdown shown above and below the interactive controls.
_HEADER_MARKDOWN = """
    # ⚡ GPU Runtime Predictor
    Predict how fast your PyTorch/CUDA code will run on different GPU hardware.
    Paste your code, select GPUs from the catalog, and get instant runtime estimates.
    **Model**: Ensemble of GBR + Random Forest + Neural Network | **R² = 0.993** | **12 GPUs** | **15 workload types**
    ---
    """
_FOOTER_MARKDOWN = """
    ---
    ### ℹ️ How It Works
    1. **Code Analysis**: Extracts 48 features from your code (tensor dimensions, operation types, complexity indicators)
    2. **GPU Encoding**: Uses 12 hardware specs for each GPU (CUDA cores, memory bandwidth, TFLOPS, etc.)
    3. **ML Prediction**: Ensemble predicts `log(runtime_ms)` → converted back to milliseconds
    **Powered by**: [Training Dataset](https://huggingface.co/datasets/RajBhope/gpu-runtime-prediction-dataset) | [Model](https://huggingface.co/RajBhope/gpu-runtime-predictor)
    *Runtimes are estimates based on a roofline performance model. Actual runtimes may vary based on driver version, CUDA toolkit, memory state, and other factors.*
    """

with gr.Blocks(title="GPU Runtime Predictor", theme=gr.themes.Soft()) as demo:
    gr.Markdown(_HEADER_MARKDOWN)

    # Inputs: code editor on the left, GPU / model selection on the right.
    with gr.Row():
        with gr.Column(scale=3):
            example_dropdown = gr.Dropdown(
                choices=list(EXAMPLE_CODES.keys()),
                label="📝 Load Example Code",
                value=None,
                interactive=True,
            )
            code_input = gr.Code(
                label="Your PyTorch/CUDA Code",
                language="python",
                lines=20,
                value=EXAMPLE_CODES["Matrix Multiplication (2048x2048)"],
            )
        with gr.Column(scale=2):
            gpu_selector = gr.CheckboxGroup(
                # (label, value) pairs: display name shown, catalog key returned
                choices=[(gpu_display_names[key], key) for key in gpu_choices],
                value=list(gpu_choices),  # every GPU ticked initially
                label="🖥️ Select GPUs to Compare",
            )
            model_selector = gr.Radio(
                choices=["Ensemble", "GBR", "Random Forest", "Neural Net"],
                value="Ensemble",
                label="🤖 Prediction Model",
            )
            predict_btn = gr.Button("⚡ Predict Runtime", variant="primary", size="lg")

    gr.Markdown("---")

    # Outputs: markdown summary, then the per-GPU table.
    with gr.Row():
        with gr.Column():
            summary_output = gr.Markdown(label="Summary")
    with gr.Row():
        results_table = gr.DataFrame(
            label="📊 Runtime Predictions (sorted fastest → slowest)",
            interactive=False,
        )

    gr.Markdown(_FOOTER_MARKDOWN)

    # Event handlers
    example_dropdown.change(fn=load_example, inputs=[example_dropdown], outputs=[code_input])
    predict_btn.click(
        fn=predict_runtime,
        inputs=[code_input, gpu_selector, model_selector],
        outputs=[summary_output, results_table],
    )

demo.launch()