RajBhope committed on
Commit
9ca80ec
·
verified ·
1 Parent(s): ee9c452

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +542 -0
app.py ADDED
@@ -0,0 +1,542 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ GPU Runtime Predictor - Gradio Space
3
+ =====================================
4
+ Paste your PyTorch/CUDA code, select GPUs from the catalog,
5
+ and get predicted runtimes for each GPU.
6
+ """
7
+
8
+ import gradio as gr
9
+ import numpy as np
10
+ import pandas as pd
11
+ import torch
12
+ import torch.nn as nn
13
+ import json
14
+ import re
15
+ import pickle
16
+ import os
17
+ from huggingface_hub import hf_hub_download
18
+
19
+ # ============================================================================
20
+ # LOAD MODEL ARTIFACTS
21
+ # ============================================================================
22
+ MODEL_REPO = "RajBhope/gpu-runtime-predictor"
23
+
24
def download_artifacts():
    """Fetch every model artifact from the Hub repository.

    Returns:
        dict mapping each artifact filename to the local path returned by
        ``hf_hub_download`` for ``MODEL_REPO``.
    """
    artifact_names = (
        'model_gbr.pkl', 'model_rf.pkl', 'model_nn.pt', 'scaler_X.pkl',
        'scaler_params.json', 'gpu_catalog.json', 'nn_config.json', 'metrics.json',
    )
    return {
        name: hf_hub_download(repo_id=MODEL_REPO, filename=name)
        for name in artifact_names
    }
32
+
33
def _load_pickle(path):
    """Return the object unpickled from ``path``."""
    # NOTE(security): pickle.load can execute arbitrary code embedded in the
    # file, so this is only safe while the artifact repository is trusted.
    with open(path, 'rb') as handle:
        return pickle.load(handle)


def _load_json(path):
    """Return the parsed JSON document stored at ``path``."""
    with open(path, 'r') as handle:
        return json.load(handle)


print("Downloading model artifacts...")
artifact_paths = download_artifacts()

# Pickled estimators and the feature scaler.
model_gbr = _load_pickle(artifact_paths['model_gbr.pkl'])
model_rf = _load_pickle(artifact_paths['model_rf.pkl'])
scaler_X = _load_pickle(artifact_paths['scaler_X.pkl'])

# JSON side-car files: scaler parameters, GPU spec catalog, NN constructor
# arguments, and stored metrics.
scaler_params = _load_json(artifact_paths['scaler_params.json'])
GPU_CATALOG = _load_json(artifact_paths['gpu_catalog.json'])
nn_config = _load_json(artifact_paths['nn_config.json'])
metrics = _load_json(artifact_paths['metrics.json'])
57
+
58
+
59
+ # Load NN model
60
class RuntimeMLP(nn.Module):
    """Feed-forward regressor producing one scalar per input row.

    Each hidden stage is Linear -> LayerNorm -> GELU -> Dropout, followed by a
    final Linear layer with a single output unit.
    """

    def __init__(self, input_dim, hidden_dims=(512, 256, 128), dropout=0.15):
        """Build the layer stack.

        Args:
            input_dim: number of input features per row.
            hidden_dims: iterable of hidden layer widths, applied in order.
                The default is a tuple so it cannot be mutated across
                instances; lists are still accepted.
            dropout: dropout probability used after every hidden stage.
        """
        super().__init__()
        layers = []
        prev_dim = input_dim
        for h_dim in hidden_dims:
            layers.extend([
                nn.Linear(prev_dim, h_dim),
                nn.LayerNorm(h_dim),
                nn.GELU(),
                nn.Dropout(dropout),
            ])
            prev_dim = h_dim
        layers.append(nn.Linear(prev_dim, 1))
        # Kept as ``self.net`` so the saved state-dict keys ("net.<idx>...") still match.
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Return predictions with the trailing size-1 dimension squeezed away."""
        return self.net(x).squeeze(-1)
78
+
79
+
80
# Rebuild the network from its saved constructor arguments, then restore the
# trained weights on CPU.
model_nn = RuntimeMLP(**nn_config)
_nn_state_dict = torch.load(
    artifact_paths['model_nn.pt'],
    map_location='cpu',
    weights_only=True,
)
model_nn.load_state_dict(_nn_state_dict)
model_nn.eval()

# GPU spec fields read from each catalog entry, in the order they are placed
# into the feature vector.
GPU_FEATURE_COLS = [
    'cuda_cores',
    'tensor_cores',
    'memory_gb',
    'memory_bandwidth_gbps',
    'base_clock_mhz',
    'boost_clock_mhz',
    'sm_count',
    'fp32_tflops',
    'fp16_tflops',
    'tdp_watts',
    'compute_capability',
    'l2_cache_mb',
]

print("Models loaded!")
91
+
92
+
93
+ # ============================================================================
94
+ # CODE FEATURE EXTRACTION
95
+ # ============================================================================
96
+
97
# Regexes flagging the presence of each operation family in the source text.
_PYTORCH_OP_PATTERNS = {
    'matmul': r'torch\.matmul|torch\.mm|@',
    'conv': r'Conv[12]d|conv[12]d',
    'attention': r'attention|Attention|MultiheadAttention|softmax.*matmul',
    'linear': r'nn\.Linear|linear',
    'batchnorm': r'BatchNorm|batchnorm',
    'layernorm': r'LayerNorm|layernorm',
    'softmax': r'softmax|Softmax',
    'relu': r'relu|ReLU',
    'gelu': r'gelu|GELU',
    'sigmoid': r'sigmoid|Sigmoid',
    'tanh': r'tanh|Tanh',
    'dropout': r'Dropout|dropout',
    'embedding': r'Embedding|embedding',
    'pooling': r'Pool|pool|MaxPool|AvgPool',
    'fft': r'fft|FFT',
    'sort': r'torch\.sort',
    'backward': r'backward|grad',
    'loss': r'Loss|loss|CrossEntropy',
    'cat': r'torch\.cat|concatenate',
    'reshape': r'reshape|view|contiguous',
    'transpose': r'transpose|\.t\(\)|permute',
    'reduce': r'torch\.sum|torch\.mean|torch\.max|torch\.min|reduce',
}

# Parenthesised integer tuples with exactly 2, 3, or 4 entries.
_DIM_TUPLE_PATTERNS = (
    r'\((\d+),\s*(\d+)\)',
    r'\((\d+),\s*(\d+),\s*(\d+)\)',
    r'\((\d+),\s*(\d+),\s*(\d+),\s*(\d+)\)',
)


def _log1p_of_int(value):
    """Return ``log1p(value)`` for a non-negative int of any magnitude.

    ``np.log1p`` raises when handed a Python int that does not fit a 64-bit
    integer, so the value is converted to float first. Ints beyond float range
    are clamped to the largest finite float.
    """
    try:
        as_float = float(value)
    except OverflowError:
        as_float = float(np.finfo(np.float64).max)
    return np.log1p(as_float)


def extract_code_features(code_text):
    """Extract regex-based numeric features from source code text.

    Args:
        code_text: the source code as a single string.

    Returns:
        dict mapping feature name to a number: size statistics, numeric-literal
        statistics, ``has_<op>`` presence flags, dtype/device flags, construct
        counts, tensor-dimension statistics, and three aggregate scores.
    """
    features = {}

    # --- Size statistics -------------------------------------------------
    # str.split always yields at least one element, so the mean is well defined.
    lines = code_text.strip().split('\n')
    features['num_lines'] = len(lines)
    features['num_chars'] = len(code_text)
    features['avg_line_length'] = np.mean([len(line) for line in lines])

    tokens = re.findall(r'[a-zA-Z_]\w*|[0-9]+\.?[0-9]*', code_text)
    features['num_tokens'] = len(tokens)

    # --- Numeric literals ------------------------------------------------
    numbers = re.findall(r'\b(\d+\.?\d*)\b', code_text)
    nums = [float(n) for n in numbers if n]
    features['num_numeric_literals'] = len(nums)
    features['max_numeric'] = max(nums) if nums else 0
    features['min_numeric'] = min(nums) if nums else 0
    features['mean_numeric'] = np.mean(nums) if nums else 0
    features['sum_numeric_log'] = np.log1p(sum(nums)) if nums else 0

    large_nums = [n for n in nums if n >= 64]
    features['num_large_dims'] = len(large_nums)
    # Only the first five large literals enter the product.
    features['product_large_dims_log'] = np.log1p(np.prod(large_nums[:5])) if large_nums else 0

    # --- Operation presence flags ---------------------------------------
    for op_name, pattern in _PYTORCH_OP_PATTERNS.items():
        features[f'has_{op_name}'] = 1 if re.search(pattern, code_text) else 0

    features['uses_float16'] = 1 if re.search(r'float16|half|fp16', code_text) else 0
    features['uses_float32'] = 1 if re.search(r'float32|float(?!16)', code_text) else 0
    features['uses_cuda'] = 1 if re.search(r"'cuda'|\.cuda\(\)|device='cuda'", code_text) else 0

    # --- Construct counts -------------------------------------------------
    features['num_for_loops'] = len(re.findall(r'\bfor\b', code_text))
    features['num_function_defs'] = len(re.findall(r'\bdef\b', code_text))
    features['num_class_defs'] = len(re.findall(r'\bclass\b', code_text))
    features['num_imports'] = len(re.findall(r'\bimport\b', code_text))

    features['num_torch_calls'] = len(re.findall(r'torch\.', code_text))
    features['num_nn_calls'] = len(re.findall(r'nn\.', code_text))

    # --- Tensor dimension tuples -----------------------------------------
    all_dims = []
    for pattern in _DIM_TUPLE_PATTERNS:
        for match in re.finditer(pattern, code_text):
            all_dims.extend(int(group) for group in match.groups())

    features['num_dim_specs'] = len(all_dims)
    features['max_dim'] = max(all_dims) if all_dims else 0
    features['total_elements_log'] = 0
    if all_dims:
        # Largest log1p(element count) over every integer tuple with >= 2 entries.
        for tuple_text in re.findall(r'\([\d,\s]+\)', code_text):
            dims = [int(d) for d in re.findall(r'\d+', tuple_text)]
            if len(dims) >= 2:
                prod = 1
                for d in dims:
                    prod *= d
                # The product is an unbounded Python int; np.log1p(prod) would
                # raise once it exceeds the 64-bit range.
                features['total_elements_log'] = max(
                    features['total_elements_log'], _log1p_of_int(prod)
                )

    # --- Aggregate scores -------------------------------------------------
    features['compute_bound_score'] = (
        features['has_matmul'] + features['has_conv'] + features['has_linear']
    )
    features['memory_bound_score'] = (
        features['has_embedding'] + features['has_cat']
        + features['has_transpose'] + features['has_relu']
    )
    features['mixed_score'] = (
        features['has_attention'] + features['has_batchnorm'] + features['has_layernorm']
    )

    return features
186
+
187
+
188
def estimate_flops_and_memory(code_text):
    """Heuristically estimate FLOPs and memory traffic from source text.

    The positive integer literals in ``code_text`` are read in order of
    appearance and interpreted as tensor dimensions. Which formula applies is
    decided by the first matching operation pattern: matmul, Conv, attention,
    then ``nn.Linear``. If no formula produced a FLOP count, a generic fallback
    multiplies the (up to) four largest literals that are >= 32.

    Args:
        code_text: the source code as a single string.

    Returns:
        ``(flops, memory_bytes, dtype_bytes)`` where ``dtype_bytes`` is 2 when
        the text mentions ``float16``/``half`` and 4 otherwise.
    """
    nums = [n for n in map(int, re.findall(r'\b(\d+)\b', code_text)) if n > 0]

    # Detect dtype
    dtype_bytes = 2 if re.search(r'float16|half', code_text) else 4

    flops = 0
    memory = 0

    # Only the first matching branch is tried. If it matches but finds too few
    # literals, flops stays 0 and the generic fallback below takes over.
    if re.search(r'matmul|torch\.mm|@', code_text):
        dims = [n for n in nums if n >= 8]
        if len(dims) >= 3:
            M, K, N = dims[:3]
            flops = 2 * M * N * K
            memory = dtype_bytes * (M * K + K * N + M * N)

    elif re.search(r'Conv[12]d', code_text):
        dims = [n for n in nums if n >= 1]
        if len(dims) >= 5:
            # First five literals: batch, in channels, out channels, spatial
            # size (used for both H and W), kernel size.
            batch, in_ch, out_ch, H, ks = dims[:5]
            W = H
            flops = 2 * batch * out_ch * H * W * in_ch * ks * ks
            memory = dtype_bytes * (
                batch * in_ch * H * W + out_ch * in_ch * ks * ks + batch * out_ch * H * W
            )

    elif re.search(r'attention|Attention', code_text):
        dims = [n for n in nums if n >= 4]
        if len(dims) >= 3:
            batch, seq_len, hidden = dims[:3]
            flops = 4 * batch * seq_len * seq_len * hidden
            memory = dtype_bytes * batch * 3 * seq_len * hidden * 2

    elif re.search(r'nn\.Linear', code_text):
        dims = [n for n in nums if n >= 8]
        if len(dims) >= 2:
            in_f, out_f = dims[0], dims[1]
            # A third literal, when present, is taken as the batch size.
            batch = dims[2] if len(dims) > 2 else 1
            flops = 2 * batch * in_f * out_f
            memory = dtype_bytes * (batch * in_f + in_f * out_f + batch * out_f)

    # Generic fallback: estimate from the largest literals.
    if flops == 0:
        large_nums = sorted((n for n in nums if n >= 32), reverse=True)[:4]
        if large_nums:
            total_elements = 1
            for n in large_nums:
                total_elements *= n
            flops = total_elements * 2
            memory = dtype_bytes * total_elements * 2

    return flops, memory, dtype_bytes
246
+
247
+
248
def _safe_log1p(value):
    """Return ``log1p(value)`` for a non-negative number of any magnitude.

    ``np.log1p`` raises on Python ints that do not fit a 64-bit integer, so the
    value is converted to float first. Ints beyond float range are clamped to
    the largest finite float.
    """
    try:
        as_float = float(value)
    except OverflowError:
        as_float = float(np.finfo(np.float64).max)
    return np.log1p(as_float)


def _predict_log_runtime(feats_scaled, model_choice):
    """Return the selected model's raw prediction for one scaled feature row.

    Any ``model_choice`` other than "GBR", "Random Forest", or "Neural Net" is
    treated as the ensemble (0.5 * GBR + 0.3 * RF + 0.2 * NN).
    """
    if model_choice == "GBR":
        return model_gbr.predict(feats_scaled)[0]
    if model_choice == "Random Forest":
        return model_rf.predict(feats_scaled)[0]
    if model_choice == "Neural Net":
        with torch.no_grad():
            return model_nn(torch.tensor(feats_scaled, dtype=torch.float32)).item()

    # Ensemble
    pred_gbr = model_gbr.predict(feats_scaled)[0]
    pred_rf = model_rf.predict(feats_scaled)[0]
    with torch.no_grad():
        pred_nn = model_nn(torch.tensor(feats_scaled, dtype=torch.float32)).item()
    return 0.5 * pred_gbr + 0.3 * pred_rf + 0.2 * pred_nn


def predict_runtime(code_text, selected_gpus, model_choice="Ensemble"):
    """Predict runtime for code on selected GPUs.

    Args:
        code_text: source code to analyse; ``None`` or blank yields a warning.
        selected_gpus: iterable of ``GPU_CATALOG`` keys; unknown keys are skipped.
        model_choice: "Ensemble" (default), "GBR", "Random Forest", or "Neural Net".

    Returns:
        ``(summary_markdown, dataframe)``. On invalid input the first element is
        a warning message and the second is ``None``. Otherwise the DataFrame
        has one row per GPU, sorted from fastest to slowest.
    """
    # The input may be None as well as blank, so check before calling .strip().
    if not code_text or not code_text.strip():
        return "⚠️ Please paste some code.", None

    if not selected_gpus:
        return "⚠️ Please select at least one GPU.", None

    # Code features, ordered by sorted feature name.
    code_feats = extract_code_features(code_text)
    code_feat_vec = [code_feats[name] for name in sorted(code_feats)]

    # Estimate FLOPs and memory
    flops, memory_bytes, dtype_bytes = estimate_flops_and_memory(code_text)
    arithmetic_intensity = flops / max(memory_bytes, 1)

    # These do not depend on the GPU, so build them once. flops/memory_bytes
    # are unbounded Python ints, hence the overflow-safe log.
    extra_feats = [
        _safe_log1p(flops),
        _safe_log1p(memory_bytes),
        arithmetic_intensity,
        dtype_bytes,
    ]

    results = []

    for gpu_key in selected_gpus:
        gpu_spec = GPU_CATALOG.get(gpu_key)
        if gpu_spec is None:
            continue

        gpu_feat_vec = [gpu_spec[col] for col in GPU_FEATURE_COLS]

        # Feature layout: code features, GPU specs, extra features.
        all_feats = np.array(
            code_feat_vec + gpu_feat_vec + extra_feats, dtype=np.float32
        ).reshape(1, -1)

        # Normalize, then zero out any non-finite values left by the scaler.
        all_feats_scaled = scaler_X.transform(all_feats)
        all_feats_scaled = np.nan_to_num(all_feats_scaled, nan=0.0, posinf=0.0, neginf=0.0)

        pred_log = _predict_log_runtime(all_feats_scaled, model_choice)

        # Predictions are inverted with expm1 and floored at 0.001 ms, which
        # also keeps the ratios below away from division by zero.
        runtime_ms = np.expm1(pred_log)
        runtime_ms = max(runtime_ms, 0.001)

        results.append({
            'GPU': gpu_spec['name'],
            'Runtime (ms)': round(runtime_ms, 4),
            'FP32 TFLOPS': gpu_spec['fp32_tflops'],
            'Mem BW (GB/s)': gpu_spec['memory_bandwidth_gbps'],
            'VRAM (GB)': gpu_spec['memory_gb'],
            'Relative Speed': None,
        })

    if not results:
        return "⚠️ No valid GPUs selected.", None

    # Sort by runtime
    results.sort(key=lambda row: row['Runtime (ms)'])

    # Runtime relative to the fastest GPU (fastest = 1.00x).
    fastest = results[0]['Runtime (ms)']
    for row in results:
        row['Relative Speed'] = f"{row['Runtime (ms)'] / fastest:.2f}x"

    df_results = pd.DataFrame(results)

    # Summary text
    summary = f"### 🏆 Fastest: **{results[0]['GPU']}** ({results[0]['Runtime (ms)']:.4f} ms)\n"
    summary += f"### 🐢 Slowest: **{results[-1]['GPU']}** ({results[-1]['Runtime (ms)']:.4f} ms)\n"
    summary += f"### ⚡ Speedup: **{results[-1]['Runtime (ms)']/results[0]['Runtime (ms)']:.1f}x** (fastest vs slowest)\n\n"

    summary += f"**Estimated FLOPs:** {flops:,.0f}\n\n"
    summary += f"**Estimated Memory:** {memory_bytes:,.0f} bytes\n\n"
    summary += f"**Arithmetic Intensity:** {arithmetic_intensity:.2f} FLOP/byte\n\n"

    if arithmetic_intensity > 10:
        summary += "🔥 **Compute-bound** workload — faster GPUs with more TFLOPS will help most"
    else:
        summary += "💾 **Memory-bound** workload — GPUs with higher memory bandwidth will help most"

    return summary, df_results
341
+
342
+
343
+ # ============================================================================
344
+ # EXAMPLE CODES
345
+ # ============================================================================
346
+
347
# Ready-made snippets offered in the "Load Example Code" dropdown, keyed by
# their display label. A snippet's text is what gets analysed when the user
# runs a prediction, so its comments and numeric literals are part of the
# input the regex-based feature extractors see.
EXAMPLE_CODES = {
    "Matrix Multiplication (2048x2048)": """import torch

def matmul_kernel(A, B):
    # Matrix multiplication: (2048, 2048) x (2048, 2048) -> (2048, 2048)
    C = torch.matmul(A, B)
    return C

A = torch.randn(2048, 2048, dtype=torch.float32, device='cuda')
B = torch.randn(2048, 2048, dtype=torch.float32, device='cuda')
C = matmul_kernel(A, B)
torch.cuda.synchronize()""",

    "Self-Attention (batch=8, seq=1024)": """import torch
import torch.nn.functional as F

def self_attention(Q, K, V, num_heads=16):
    B, S, D = Q.shape
    head_dim = D // num_heads

    Q = Q.view(B, S, num_heads, head_dim).transpose(1, 2)
    K = K.view(B, S, num_heads, head_dim).transpose(1, 2)
    V = V.view(B, S, num_heads, head_dim).transpose(1, 2)

    attn = torch.matmul(Q, K.transpose(-2, -1)) / (head_dim ** 0.5)
    attn = F.softmax(attn, dim=-1)
    out = torch.matmul(attn, V)
    return out.transpose(1, 2).contiguous().view(B, S, D)

hidden_dim = 1024
Q = torch.randn(8, 1024, hidden_dim, dtype=torch.float32, device='cuda')
K = torch.randn(8, 1024, hidden_dim, dtype=torch.float32, device='cuda')
V = torch.randn(8, 1024, hidden_dim, dtype=torch.float32, device='cuda')
out = self_attention(Q, K, V)
torch.cuda.synchronize()""",

    "Conv2D ResNet Block": """import torch
import torch.nn as nn

def conv2d_forward(x, conv):
    # Conv2D: batch=16, in_channels=256, out_channels=512
    # Input: (16, 256, 56, 56), Kernel: 3x3
    return conv(x)

conv = nn.Conv2d(256, 512, kernel_size=3, padding=1).to('cuda')
x = torch.randn(16, 256, 56, 56, dtype=torch.float32, device='cuda')
out = conv2d_forward(x, conv)
torch.cuda.synchronize()""",

    "Transformer Block": """import torch
import torch.nn as nn

class TransformerBlock(nn.Module):
    def __init__(self):
        super().__init__()
        self.attn = nn.MultiheadAttention(768, 12, batch_first=True)
        self.ff = nn.Sequential(
            nn.Linear(768, 3072),
            nn.GELU(),
            nn.Linear(3072, 768)
        )
        self.ln1 = nn.LayerNorm(768)
        self.ln2 = nn.LayerNorm(768)

    def forward(self, x):
        attn_out, _ = self.attn(self.ln1(x), self.ln1(x), self.ln1(x))
        x = x + attn_out
        x = x + self.ff(self.ln2(x))
        return x

block = TransformerBlock().to('cuda')
x = torch.randn(8, 512, 768, dtype=torch.float32, device='cuda')
out = block(x)
torch.cuda.synchronize()""",

    "Elementwise GELU (100M elements)": """import torch

def elementwise_op(x):
    # Elementwise gelu on tensor of size 100000000
    return torch.nn.functional.gelu(x)

x = torch.randn(100000000, dtype=torch.float32, device='cuda')
out = elementwise_op(x)
torch.cuda.synchronize()""",

    "LLM Linear Layer (fp16, vocab=50257)": """import torch
import torch.nn as nn

def linear_forward(x, linear):
    # Linear layer: (32, 4096) -> (32, 50257)
    return linear(x)

linear = nn.Linear(4096, 50257).to('cuda')
x = torch.randn(32, 4096, dtype=torch.float16, device='cuda')
out = linear_forward(x, linear)
torch.cuda.synchronize()""",
}
444
+
445
+
446
+ # ============================================================================
447
+ # GRADIO UI
448
+ # ============================================================================
449
+
450
# Catalog keys are the checkbox values; each entry's 'name' is its display label.
gpu_choices = list(GPU_CATALOG)
gpu_display_names = {gpu_key: spec['name'] for gpu_key, spec in GPU_CATALOG.items()}
452
+
453
+
454
def load_example(example_name):
    """Return the example snippet stored under ``example_name``, or '' if there is none."""
    try:
        return EXAMPLE_CODES[example_name]
    except KeyError:
        return ""
456
+
457
+
458
# Page layout: intro banner, an input row (code editor on the left, GPU and
# model pickers on the right), the predict button, then the summary and the
# results table, followed by a "How It Works" footer.
with gr.Blocks(
    title="GPU Runtime Predictor",
    theme=gr.themes.Soft(),
) as demo:
    gr.Markdown("""
    # ⚡ GPU Runtime Predictor

    Predict how fast your PyTorch/CUDA code will run on different GPU hardware.
    Paste your code, select GPUs from the catalog, and get instant runtime estimates.

    **Model**: Ensemble of GBR + Random Forest + Neural Network | **R² = 0.993** | **12 GPUs** | **15 workload types**

    ---
    """)

    with gr.Row():
        with gr.Column(scale=3):
            # Choosing an entry replaces the editor contents (wired up below).
            example_dropdown = gr.Dropdown(
                choices=list(EXAMPLE_CODES.keys()),
                label="📝 Load Example Code",
                value=None,
                interactive=True,
            )

            # The editor starts pre-filled with the matrix-multiplication example.
            code_input = gr.Code(
                label="Your PyTorch/CUDA Code",
                language="python",
                lines=20,
                value=EXAMPLE_CODES["Matrix Multiplication (2048x2048)"],
            )

        with gr.Column(scale=2):
            # (label, value) pairs: the display name is shown, the catalog key
            # is what gets passed to predict_runtime. All GPUs start selected.
            gpu_selector = gr.CheckboxGroup(
                choices=[(gpu_display_names[k], k) for k in gpu_choices],
                value=list(GPU_CATALOG.keys()),
                label="🖥️ Select GPUs to Compare",
            )

            # These strings must match the model_choice values checked in predict_runtime.
            model_selector = gr.Radio(
                choices=["Ensemble", "GBR", "Random Forest", "Neural Net"],
                value="Ensemble",
                label="🤖 Prediction Model",
            )

            predict_btn = gr.Button("⚡ Predict Runtime", variant="primary", size="lg")

    gr.Markdown("---")

    with gr.Row():
        with gr.Column():
            summary_output = gr.Markdown(label="Summary")

    with gr.Row():
        results_table = gr.DataFrame(
            label="📊 Runtime Predictions (sorted fastest → slowest)",
            interactive=False,
        )

    gr.Markdown("""
    ---
    ### ℹ️ How It Works

    1. **Code Analysis**: Extracts 48 features from your code (tensor dimensions, operation types, complexity indicators)
    2. **GPU Encoding**: Uses 12 hardware specs for each GPU (CUDA cores, memory bandwidth, TFLOPS, etc.)
    3. **ML Prediction**: Ensemble predicts `log(runtime_ms)` → converted back to milliseconds

    **Powered by**: [Training Dataset](https://huggingface.co/datasets/RajBhope/gpu-runtime-prediction-dataset) | [Model](https://huggingface.co/RajBhope/gpu-runtime-predictor)

    *Runtimes are estimates based on a roofline performance model. Actual runtimes may vary based on driver version, CUDA toolkit, memory state, and other factors.*
    """)

    # Event handlers
    # Selecting an example loads its text into the code editor.
    example_dropdown.change(
        fn=load_example,
        inputs=[example_dropdown],
        outputs=[code_input],
    )

    # predict_runtime returns (summary markdown, DataFrame), matching the two outputs.
    predict_btn.click(
        fn=predict_runtime,
        inputs=[code_input, gpu_selector, model_selector],
        outputs=[summary_output, results_table],
    )

# Runs at import time: the script starts the app as soon as it is executed.
demo.launch()