| """ | |
| GPU Runtime Predictor - Gradio Space | |
| ===================================== | |
| Paste your PyTorch/CUDA code, select GPUs from the catalog, | |
| and get predicted runtimes for each GPU. | |
| """ | |

import json
import pickle
import re

import gradio as gr
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from huggingface_hub import hf_hub_download

# ============================================================================
# LOAD MODEL ARTIFACTS
# ============================================================================

MODEL_REPO = "RajBhope/gpu-runtime-predictor"


def download_artifacts():
    """Download all model artifacts from the Hub."""
    files = ['model_gbr.pkl', 'model_rf.pkl', 'model_nn.pt', 'scaler_X.pkl',
             'scaler_params.json', 'gpu_catalog.json', 'nn_config.json', 'metrics.json']
    paths = {}
    for f in files:
        paths[f] = hf_hub_download(repo_id=MODEL_REPO, filename=f)
    return paths
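
# Note: hf_hub_download caches files locally (by default under
# ~/.cache/huggingface), so Space restarts reuse already-downloaded artifacts.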
| print("Downloading model artifacts...") | |
| artifact_paths = download_artifacts() | |
| # Load models | |
| with open(artifact_paths['model_gbr.pkl'], 'rb') as f: | |
| model_gbr = pickle.load(f) | |
| with open(artifact_paths['model_rf.pkl'], 'rb') as f: | |
| model_rf = pickle.load(f) | |
| with open(artifact_paths['scaler_X.pkl'], 'rb') as f: | |
| scaler_X = pickle.load(f) | |
| with open(artifact_paths['scaler_params.json'], 'r') as f: | |
| scaler_params = json.load(f) | |
| with open(artifact_paths['gpu_catalog.json'], 'r') as f: | |
| GPU_CATALOG = json.load(f) | |
| with open(artifact_paths['nn_config.json'], 'r') as f: | |
| nn_config = json.load(f) | |
| with open(artifact_paths['metrics.json'], 'r') as f: | |
| metrics = json.load(f) | |
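# scaler_params and metrics are kept for reference; prediction below uses only
# scaler_X and the three models.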

# Load NN model
class RuntimeMLP(nn.Module):
    def __init__(self, input_dim, hidden_dims=[512, 256, 128], dropout=0.15):
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.LayerNorm(h_dim),
                nn.GELU(),
                nn.Dropout(dropout),
            ])
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, 1))
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        return self.net(x).squeeze(-1)


model_nn = RuntimeMLP(**nn_config)
model_nn.load_state_dict(torch.load(artifact_paths['model_nn.pt'], map_location='cpu', weights_only=True))
model_nn.eval()
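
# Optional sanity check (illustrative): RuntimeMLP requires 'input_dim', so
# nn_config is guaranteed to carry it:
#   with torch.no_grad():
#       _ = model_nn(torch.zeros(1, nn_config['input_dim']))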

GPU_FEATURE_COLS = [
    'cuda_cores', 'tensor_cores', 'memory_gb', 'memory_bandwidth_gbps',
    'base_clock_mhz', 'boost_clock_mhz', 'sm_count', 'fp32_tflops',
    'fp16_tflops', 'tdp_watts', 'compute_capability', 'l2_cache_mb',
]
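# Assumption: this column order matches the feature layout the scaler and
# models were trained with; GPU specs are concatenated positionally below.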
| print("Models loaded!") | |

# ============================================================================
# CODE FEATURE EXTRACTION
# ============================================================================

def extract_code_features(code_text):
    """Extract features from source code text."""
    features = {}
    lines = code_text.strip().split('\n')

    # Surface statistics
    features['num_lines'] = len(lines)
    features['num_chars'] = len(code_text)
    features['avg_line_length'] = np.mean([len(l) for l in lines]) if lines else 0

    tokens = re.findall(r'[a-zA-Z_]\w*|[0-9]+\.?[0-9]*', code_text)
    features['num_tokens'] = len(tokens)

    # Numeric literals (tensor dims, batch sizes, etc.)
    numbers = re.findall(r'\b(\d+\.?\d*)\b', code_text)
    nums = [float(n) for n in numbers if n]
    features['num_numeric_literals'] = len(nums)
    features['max_numeric'] = max(nums) if nums else 0
    features['min_numeric'] = min(nums) if nums else 0
    features['mean_numeric'] = np.mean(nums) if nums else 0
    features['sum_numeric_log'] = np.log1p(sum(nums)) if nums else 0

    large_nums = [n for n in nums if n >= 64]
    features['num_large_dims'] = len(large_nums)
    features['product_large_dims_log'] = np.log1p(np.prod(large_nums[:5])) if large_nums else 0

    # Operation fingerprints via regex
    pytorch_ops = {
        'matmul': r'torch\.matmul|torch\.mm|@',
        'conv': r'Conv[12]d|conv[12]d',
        'attention': r'attention|Attention|MultiheadAttention|softmax.*matmul',
        'linear': r'nn\.Linear|linear',
        'batchnorm': r'BatchNorm|batchnorm',
        'layernorm': r'LayerNorm|layernorm',
        'softmax': r'softmax|Softmax',
        'relu': r'relu|ReLU',
        'gelu': r'gelu|GELU',
        'sigmoid': r'sigmoid|Sigmoid',
        'tanh': r'tanh|Tanh',
        'dropout': r'Dropout|dropout',
        'embedding': r'Embedding|embedding',
        'pooling': r'Pool|pool|MaxPool|AvgPool',
        'fft': r'fft|FFT',
        'sort': r'torch\.sort',
        'backward': r'backward|grad',
        'loss': r'Loss|loss|CrossEntropy',
        'cat': r'torch\.cat|concatenate',
        'reshape': r'reshape|view|contiguous',
        'transpose': r'transpose|\.t\(\)|permute',
        'reduce': r'torch\.sum|torch\.mean|torch\.max|torch\.min|reduce',
    }
    for op_name, pattern in pytorch_ops.items():
        features[f'has_{op_name}'] = 1 if re.search(pattern, code_text) else 0

    # Dtype and device hints
    features['uses_float16'] = 1 if re.search(r'float16|half|fp16', code_text) else 0
    features['uses_float32'] = 1 if re.search(r'float32|float(?!16)', code_text) else 0
    features['uses_cuda'] = 1 if re.search(r"'cuda'|\.cuda\(\)|device='cuda'", code_text) else 0

    # Structural counts
    features['num_for_loops'] = len(re.findall(r'\bfor\b', code_text))
    features['num_function_defs'] = len(re.findall(r'\bdef\b', code_text))
    features['num_class_defs'] = len(re.findall(r'\bclass\b', code_text))
    features['num_imports'] = len(re.findall(r'\bimport\b', code_text))
    features['num_torch_calls'] = len(re.findall(r'torch\.', code_text))
    features['num_nn_calls'] = len(re.findall(r'nn\.', code_text))

    # Tensor shape tuples like (B, C, H, W)
    dim_patterns = [r'\((\d+),\s*(\d+)\)', r'\((\d+),\s*(\d+),\s*(\d+)\)', r'\((\d+),\s*(\d+),\s*(\d+),\s*(\d+)\)']
    all_dims = []
    for pattern in dim_patterns:
        for match in re.finditer(pattern, code_text):
            dims = [int(g) for g in match.groups()]
            all_dims.extend(dims)
    features['num_dim_specs'] = len(all_dims)
    features['max_dim'] = max(all_dims) if all_dims else 0

    # Log of the largest element count implied by any shape tuple
    features['total_elements_log'] = 0
    if all_dims:
        tuples = re.findall(r'\([\d,\s]+\)', code_text)
        for t in tuples:
            dims = [int(d) for d in re.findall(r'\d+', t)]
            if len(dims) >= 2:
                prod = 1
                for d in dims:
                    prod *= d
                features['total_elements_log'] = max(features['total_elements_log'], np.log1p(prod))

    # Rough compute- vs memory-bound indicators
    features['compute_bound_score'] = features.get('has_matmul', 0) + features.get('has_conv', 0) + features.get('has_linear', 0)
    features['memory_bound_score'] = features.get('has_embedding', 0) + features.get('has_cat', 0) + features.get('has_transpose', 0) + features.get('has_relu', 0)
    features['mixed_score'] = features.get('has_attention', 0) + features.get('has_batchnorm', 0) + features.get('has_layernorm', 0)

    return features
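
# Illustrative example (not part of the app flow): for a one-line matmul
# snippet, the extractor flags the op and picks up the large dimensions:
#   feats = extract_code_features("C = torch.matmul(A, B)  # shapes (1024, 1024)")
#   assert feats['has_matmul'] == 1 and feats['num_large_dims'] == 2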


def estimate_flops_and_memory(code_text):
    """Heuristic estimate of FLOPs and memory bytes from code."""
    numbers = re.findall(r'\b(\d+)\b', code_text)
    nums = [int(n) for n in numbers if int(n) > 0]

    # Detect dtype
    dtype_bytes = 2 if re.search(r'float16|half', code_text) else 4

    # Try to identify tensor dimensions for FLOPs estimation
    flops = 0
    memory = 0

    # Matrix multiplication: look for matmul patterns
    if re.search(r'matmul|torch\.mm|@', code_text):
        dims = [n for n in nums if n >= 8]
        if len(dims) >= 3:
            M, K, N = dims[0], dims[1], dims[2]
            flops = 2 * M * N * K
            memory = dtype_bytes * (M*K + K*N + M*N)
    # Conv2D
    elif re.search(r'Conv[12]d', code_text):
        dims = [n for n in nums if n >= 1]
        if len(dims) >= 5:
            batch, in_ch, out_ch = dims[0], dims[1], dims[2]
            H = W = dims[3]
            ks = dims[4]
            flops = 2 * batch * out_ch * H * W * in_ch * ks * ks
            memory = dtype_bytes * (batch*in_ch*H*W + out_ch*in_ch*ks*ks + batch*out_ch*H*W)
    # Attention
    elif re.search(r'attention|Attention', code_text):
        dims = [n for n in nums if n >= 4]
        if len(dims) >= 3:
            batch, seq_len, hidden = dims[0], dims[1], dims[2]
            flops = 4 * batch * seq_len * seq_len * hidden
            memory = dtype_bytes * batch * 3 * seq_len * hidden * 2
    # Linear
    elif re.search(r'nn\.Linear', code_text):
        dims = [n for n in nums if n >= 8]
        if len(dims) >= 2:
            in_f, out_f = dims[0], dims[1]
            batch = dims[2] if len(dims) > 2 else 1
            flops = 2 * batch * in_f * out_f
            memory = dtype_bytes * (batch * in_f + in_f * out_f + batch * out_f)

    # Generic fallback: estimate from tensor sizes
    if flops == 0:
        large_nums = sorted([n for n in nums if n >= 32], reverse=True)[:4]
        if large_nums:
            total_elements = 1
            for n in large_nums:
                total_elements *= n
            flops = total_elements * 2
            memory = dtype_bytes * total_elements * 2

    return flops, memory, dtype_bytes
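
# Worked example (illustrative): for a 2048 x 2048 fp32 matmul,
# dims = [2048, 2048, 2048], so:
#   flops  = 2 * 2048**3           ~ 1.72e10
#   memory = 4 * 3 * 2048**2 bytes ~ 5.0e7 (A, B, and C)
#   arithmetic intensity ~ 341 FLOP/byte -> strongly compute-bound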


def predict_runtime(code_text, selected_gpus, model_choice="Ensemble"):
    """Predict runtime for code on selected GPUs."""
    if not code_text.strip():
        return "⚠️ Please paste some code.", None
    if not selected_gpus:
        return "⚠️ Please select at least one GPU.", None

    # Extract code features
    code_feats = extract_code_features(code_text)
    code_feat_names = sorted(code_feats.keys())
    code_feat_vec = [code_feats[k] for k in code_feat_names]

    # Estimate FLOPs and memory
    flops, memory_bytes, dtype_bytes = estimate_flops_and_memory(code_text)
    arithmetic_intensity = flops / max(memory_bytes, 1)

    results = []
    for gpu_key in selected_gpus:
        gpu_spec = GPU_CATALOG.get(gpu_key)
        if gpu_spec is None:
            continue

        # GPU features
        gpu_feat_vec = [gpu_spec[col] for col in GPU_FEATURE_COLS]

        # Extra features
        extra_feats = [np.log1p(flops), np.log1p(memory_bytes), arithmetic_intensity, dtype_bytes]

        # Combine
        all_feats = np.array(code_feat_vec + gpu_feat_vec + extra_feats, dtype=np.float32).reshape(1, -1)

        # Normalize
        all_feats_scaled = scaler_X.transform(all_feats)
        all_feats_scaled = np.nan_to_num(all_feats_scaled, nan=0.0, posinf=0.0, neginf=0.0)

        # Predict
        if model_choice == "GBR":
            pred_log = model_gbr.predict(all_feats_scaled)[0]
        elif model_choice == "Random Forest":
            pred_log = model_rf.predict(all_feats_scaled)[0]
        elif model_choice == "Neural Net":
            with torch.no_grad():
                pred_log = model_nn(torch.tensor(all_feats_scaled, dtype=torch.float32)).item()
        else:  # Ensemble
            pred_gbr = model_gbr.predict(all_feats_scaled)[0]
            pred_rf = model_rf.predict(all_feats_scaled)[0]
            with torch.no_grad():
                pred_nn = model_nn(torch.tensor(all_feats_scaled, dtype=torch.float32)).item()
            pred_log = 0.5 * pred_gbr + 0.3 * pred_rf + 0.2 * pred_nn

        runtime_ms = np.expm1(pred_log)
        runtime_ms = max(runtime_ms, 0.001)

        results.append({
            'GPU': gpu_spec['name'],
            'Runtime (ms)': round(runtime_ms, 4),
            'FP32 TFLOPS': gpu_spec['fp32_tflops'],
            'Mem BW (GB/s)': gpu_spec['memory_bandwidth_gbps'],
            'VRAM (GB)': gpu_spec['memory_gb'],
            'Relative Speed': None,
        })

    if not results:
        return "⚠️ No valid GPUs selected.", None

    # Sort by runtime
    results.sort(key=lambda x: x['Runtime (ms)'])

    # Calculate relative speed (fastest = 1.0x)
    fastest = results[0]['Runtime (ms)']
    for r in results:
        r['Relative Speed'] = f"{r['Runtime (ms)'] / fastest:.2f}x"

    # Format output
    df_results = pd.DataFrame(results)

    # Summary text
    summary = f"### 🏆 Fastest: **{results[0]['GPU']}** ({results[0]['Runtime (ms)']:.4f} ms)\n"
    summary += f"### 🐢 Slowest: **{results[-1]['GPU']}** ({results[-1]['Runtime (ms)']:.4f} ms)\n"
    summary += f"### ⚡ Speedup: **{results[-1]['Runtime (ms)']/results[0]['Runtime (ms)']:.1f}x** (fastest vs slowest)\n\n"
    summary += f"**Estimated FLOPs:** {flops:,.0f}\n\n"
    summary += f"**Estimated Memory:** {memory_bytes:,.0f} bytes\n\n"
    summary += f"**Arithmetic Intensity:** {arithmetic_intensity:.2f} FLOP/byte\n\n"
    if arithmetic_intensity > 10:
        summary += "🔥 **Compute-bound** workload — faster GPUs with more TFLOPS will help most"
    else:
        summary += "💾 **Memory-bound** workload — GPUs with higher memory bandwidth will help most"

    return summary, df_results
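
# Example call (illustrative), bypassing the Gradio UI:
#   summary, df = predict_runtime("C = A @ B  # (4096, 4096) x (4096, 4096)",
#                                 list(GPU_CATALOG.keys())[:3])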

# ============================================================================
# EXAMPLE CODES
# ============================================================================

EXAMPLE_CODES = {
| "Matrix Multiplication (2048x2048)": """import torch | |
| def matmul_kernel(A, B): | |
| # Matrix multiplication: (2048, 2048) x (2048, 2048) -> (2048, 2048) | |
| C = torch.matmul(A, B) | |
| return C | |
| A = torch.randn(2048, 2048, dtype=torch.float32, device='cuda') | |
| B = torch.randn(2048, 2048, dtype=torch.float32, device='cuda') | |
| C = matmul_kernel(A, B) | |
| torch.cuda.synchronize()""", | |
| "Self-Attention (batch=8, seq=1024)": """import torch | |
| import torch.nn.functional as F | |
| def self_attention(Q, K, V, num_heads=16): | |
| B, S, D = Q.shape | |
| head_dim = D // num_heads | |
| Q = Q.view(B, S, num_heads, head_dim).transpose(1, 2) | |
| K = K.view(B, S, num_heads, head_dim).transpose(1, 2) | |
| V = V.view(B, S, num_heads, head_dim).transpose(1, 2) | |
| attn = torch.matmul(Q, K.transpose(-2, -1)) / (head_dim ** 0.5) | |
| attn = F.softmax(attn, dim=-1) | |
| out = torch.matmul(attn, V) | |
| return out.transpose(1, 2).contiguous().view(B, S, D) | |
| hidden_dim = 1024 | |
| Q = torch.randn(8, 1024, hidden_dim, dtype=torch.float32, device='cuda') | |
| K = torch.randn(8, 1024, hidden_dim, dtype=torch.float32, device='cuda') | |
| V = torch.randn(8, 1024, hidden_dim, dtype=torch.float32, device='cuda') | |
| out = self_attention(Q, K, V) | |
| torch.cuda.synchronize()""", | |
| "Conv2D ResNet Block": """import torch | |
| import torch.nn as nn | |
| def conv2d_forward(x, conv): | |
| # Conv2D: batch=16, in_channels=256, out_channels=512 | |
| # Input: (16, 256, 56, 56), Kernel: 3x3 | |
| return conv(x) | |
| conv = nn.Conv2d(256, 512, kernel_size=3, padding=1).to('cuda') | |
| x = torch.randn(16, 256, 56, 56, dtype=torch.float32, device='cuda') | |
| out = conv2d_forward(x, conv) | |
| torch.cuda.synchronize()""", | |
| "Transformer Block": """import torch | |
| import torch.nn as nn | |
| class TransformerBlock(nn.Module): | |
| def __init__(self): | |
| super().__init__() | |
| self.attn = nn.MultiheadAttention(768, 12, batch_first=True) | |
| self.ff = nn.Sequential( | |
| nn.Linear(768, 3072), | |
| nn.GELU(), | |
| nn.Linear(3072, 768) | |
| ) | |
| self.ln1 = nn.LayerNorm(768) | |
| self.ln2 = nn.LayerNorm(768) | |
| def forward(self, x): | |
| attn_out, _ = self.attn(self.ln1(x), self.ln1(x), self.ln1(x)) | |
| x = x + attn_out | |
| x = x + self.ff(self.ln2(x)) | |
| return x | |
| block = TransformerBlock().to('cuda') | |
| x = torch.randn(8, 512, 768, dtype=torch.float32, device='cuda') | |
| out = block(x) | |
| torch.cuda.synchronize()""", | |
| "Elementwise GELU (100M elements)": """import torch | |
| def elementwise_op(x): | |
| # Elementwise gelu on tensor of size 100000000 | |
| return torch.nn.functional.gelu(x) | |
| x = torch.randn(100000000, dtype=torch.float32, device='cuda') | |
| out = elementwise_op(x) | |
| torch.cuda.synchronize()""", | |
| "LLM Linear Layer (fp16, vocab=50257)": """import torch | |
| import torch.nn as nn | |
| def linear_forward(x, linear): | |
| # Linear layer: (32, 4096) -> (32, 50257) | |
| return linear(x) | |
| linear = nn.Linear(4096, 50257).to('cuda') | |
| x = torch.randn(32, 4096, dtype=torch.float16, device='cuda') | |
| out = linear_forward(x, linear) | |
| torch.cuda.synchronize()""", | |
| } | |

# ============================================================================
# GRADIO UI
# ============================================================================

gpu_choices = list(GPU_CATALOG.keys())
gpu_display_names = {k: v['name'] for k, v in GPU_CATALOG.items()}


def load_example(example_name):
    return EXAMPLE_CODES.get(example_name, "")


with gr.Blocks(
    title="GPU Runtime Predictor",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown("""
# ⚡ GPU Runtime Predictor

Predict how fast your PyTorch/CUDA code will run on different GPU hardware.
Paste your code, select GPUs from the catalog, and get instant runtime estimates.

**Model**: Ensemble of GBR + Random Forest + Neural Network | **R² = 0.993** | **12 GPUs** | **15 workload types**

---
""")

    with gr.Row():
        with gr.Column(scale=3):
            example_dropdown = gr.Dropdown(
                choices=list(EXAMPLE_CODES.keys()),
                label="📝 Load Example Code",
                value=None,
                interactive=True,
            )
            code_input = gr.Code(
                label="Your PyTorch/CUDA Code",
                language="python",
                lines=20,
                value=EXAMPLE_CODES["Matrix Multiplication (2048x2048)"],
            )
        with gr.Column(scale=2):
            gpu_selector = gr.CheckboxGroup(
                choices=[(gpu_display_names[k], k) for k in gpu_choices],
                value=list(GPU_CATALOG.keys()),
                label="🖥️ Select GPUs to Compare",
            )
            model_selector = gr.Radio(
                choices=["Ensemble", "GBR", "Random Forest", "Neural Net"],
                value="Ensemble",
                label="🤖 Prediction Model",
            )
            predict_btn = gr.Button("⚡ Predict Runtime", variant="primary", size="lg")

    gr.Markdown("---")

    with gr.Row():
        with gr.Column():
            summary_output = gr.Markdown(label="Summary")

    with gr.Row():
        results_table = gr.DataFrame(
            label="📊 Runtime Predictions (sorted fastest → slowest)",
            interactive=False,
        )

    gr.Markdown("""
---
### ℹ️ How It Works
1. **Code Analysis**: Extracts 48 features from your code (tensor dimensions, operation types, complexity indicators)
2. **GPU Encoding**: Uses 12 hardware specs for each GPU (CUDA cores, memory bandwidth, TFLOPS, etc.)
3. **ML Prediction**: Ensemble predicts `log(runtime_ms)` → converted back to milliseconds

**Powered by**: [Training Dataset](https://huggingface.co/datasets/RajBhope/gpu-runtime-prediction-dataset) | [Model](https://huggingface.co/RajBhope/gpu-runtime-predictor)

*Runtimes are estimates based on a roofline performance model. Actual runtimes may vary based on driver version, CUDA toolkit, memory state, and other factors.*
""")

    # Event handlers
    example_dropdown.change(
        fn=load_example,
        inputs=[example_dropdown],
        outputs=[code_input],
    )
    predict_btn.click(
        fn=predict_runtime,
        inputs=[code_input, gpu_selector, model_selector],
        outputs=[summary_output, results_table],
    )

demo.launch()