tog committed
Commit 3846650 · 1 Parent(s): 3d49912

Initial commit

Files changed (7)
  1. README.md +13 -1
  2. app.py +519 -0
  3. hardware_data.yaml +80 -0
  4. models.yaml +71 -0
  5. pyproject.toml +20 -0
  6. requirements.txt +5 -0
  7. uv.lock +0 -0
README.md CHANGED
@@ -10,4 +10,17 @@ pinned: false
  license: apache-2.0
  ---

- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ # LLM GPU Sizer (Gradio)
+
+ This Space estimates:
+ - VRAM for model weights + KV cache (worst case at the configured concurrency)
+ - the number of GPUs required (with headroom)
+ - TTFT and ITL (anchor-based simulation)
+ - optionally, TTFT/ITL read from a running vLLM server's `/metrics` endpoint
+
+ ## Local dev (uv)
+ ```bash
+ uv venv
+ uv pip install -r requirements.txt
+ uv run python app.py
+ ```
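
Worked example of the sizing arithmetic this Space performs (a sketch mirroring the formulas in app.py below; numbers are for the Meta-Llama-3-70B preset at FP16 with 10 concurrent users and 2048 input / 512 output tokens):

```python
import math

# Weights: ~69.5e9 params (per app.py's parameter formulas) at 2 bytes (FP16)
weights = 69.5e9 * 2
# KV cache: 2 (K+V) * layers * kv_heads * head_dim * tokens * users * 2 bytes
kv = 2 * 80 * 8 * 128 * (2048 + 512) * 10 * 2
overhead = 0.20 * weights                  # flat 20% runtime overhead rule
total = weights + kv + overhead            # ~175 GB
gpus = math.ceil(total / (80 * 1024**3))   # H100-80GB capacity -> 3 GPUs
print(f"{total / 1e9:.0f} GB total -> {gpus}x 80GB GPUs")
```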
app.py ADDED
@@ -0,0 +1,519 @@
+ import gradio as gr
+ import yaml
+ import math
+ import matplotlib.pyplot as plt
+ import os
+ import json
+ from huggingface_hub import hf_hub_download
+
+ # --- Configuration & Constants ---
+ HARDWARE_FILE = "hardware_data.yaml"
+ MODELS_FILE = "models.yaml"
+
+ # Physics Constants
+ COMPUTE_EFFICIENCY = 0.45
+ MEMORY_EFFICIENCY = 0.70
+ INTERCONNECT_EFFICIENCY = 0.65
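+ # Derating assumptions applied to peak specs below: sustained compute ~45% of
+ # peak TFLOPs, memory ~70% of peak HBM bandwidth, interconnect ~65% of peak.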
+
+
+ # --- Data Loading ---
+ def load_hardware_data():
+     if not os.path.exists(HARDWARE_FILE):
+         return {}
+     with open(HARDWARE_FILE, "r") as f:
+         data = yaml.safe_load(f)
+     return {gpu["name"]: gpu for gpu in data["gpus"]}
+
+
+ def load_models_data():
+     if not os.path.exists(MODELS_FILE):
+         return {}
+     with open(MODELS_FILE, "r") as f:
+         data = yaml.safe_load(f) or {}
+     return data.get("models", {})
+
+
+ HARDWARE_DB = load_hardware_data()
+ MODELS_DB = load_models_data()
+
+
+ # --- Model Analysis ---
+ class ModelAnalyzer:
+     def __init__(self, repo_id, hf_token=None):
+         self.repo_id = repo_id
+         self.config = {}
+         self.error = None
+
+         if repo_id in MODELS_DB:
+             self.config = MODELS_DB[repo_id]
+         else:
+             try:
+                 token = hf_token.strip() if hf_token else None
+                 config_path = hf_hub_download(
+                     repo_id=repo_id, filename="config.json", token=token
+                 )
+                 with open(config_path, "r") as f:
+                     self.config = json.load(f)
+             except Exception as e:
+                 self.error = f"Failed to fetch model: {str(e)}"
+                 return
+
+         try:
+             self.hidden_size = self.config.get("hidden_size", 4096)
+             self.num_layers = self.config.get("num_hidden_layers", 32)
+             self.num_heads = self.config.get("num_attention_heads", 32)
+             self.num_kv_heads = self.config.get("num_key_value_heads", self.num_heads)
+             self.vocab_size = self.config.get("vocab_size", 32000)
+             self.max_context = self.config.get("max_position_embeddings", 4096)
+             self.intermediate_size = self.config.get(
+                 "intermediate_size", self.hidden_size * 4
+             )
+
+             self.is_moe = False
+             self.num_experts = 1
+             self.active_experts = 1
+
+             if "num_local_experts" in self.config:
+                 self.is_moe = True
+                 self.num_experts = self.config["num_local_experts"]
+                 self.active_experts = self.config.get("num_experts_per_tok", 2)
+             elif "notes" in self.config and "moe" in self.config["notes"]:
+                 moe_cfg = self.config["notes"]["moe"]
+                 self.is_moe = True
+                 self.num_experts = moe_cfg.get("num_local_experts", 8)
+                 self.active_experts = moe_cfg.get("num_experts_per_tok", 2)
+
+             self.calculate_params()
+         except Exception as e:
+             self.error = f"Error parsing config: {str(e)}"
+
+     def calculate_params(self):
+         self.params_embed = self.vocab_size * self.hidden_size
+         head_dim = self.hidden_size // self.num_heads
+         kv_dim = head_dim * self.num_kv_heads
+
+         self.params_attn = (
+             (self.hidden_size * self.hidden_size)
+             + (self.hidden_size * kv_dim)
+             + (self.hidden_size * kv_dim)
+             + (self.hidden_size * self.hidden_size)
+         )
+
+         dense_mlp = 3 * self.hidden_size * self.intermediate_size
+
+         if self.is_moe:
+             self.params_mlp_total = dense_mlp * self.num_experts
+             self.params_mlp_active = dense_mlp * self.active_experts
+         else:
+             self.params_mlp_total = dense_mlp
+             self.params_mlp_active = dense_mlp
+
+         self.params_norm = 2 * self.hidden_size
+         self.params_layer_total = (
+             self.params_attn + self.params_mlp_total + self.params_norm
+         )
+         self.params_layer_active = (
+             self.params_attn + self.params_mlp_active + self.params_norm
+         )
+
+         self.total_params = self.params_embed + (
+             self.num_layers * self.params_layer_total
+         )
+         self.active_params = self.params_embed + (
+             self.num_layers * self.params_layer_active
+         )
+
+
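+ # Illustrative check: the Meta-Llama-3-70B preset in models.yaml (hidden 8192,
+ # 80 layers, 64 heads / 8 KV heads, vocab 128256, intermediate 28672) yields
+ # ~69.5B total parameters under the formulas above, matching the model's name.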
+ # --- Calculation Engine ---
+ def calculate_dimensioning(
+     model_name_or_repo,
+     hf_token,
+     gpu_name,
+     connectivity_type,
+     concurrent_users,
+     context_in,
+     context_out,
+     quantization,
+ ):
+     analyzer = ModelAnalyzer(model_name_or_repo, hf_token)
+     if analyzer.error:
+         return error_result(analyzer.error)
+
+     if gpu_name not in HARDWARE_DB:
+         return error_result(f"GPU '{gpu_name}' not found in database.")
+
+     gpu_spec = HARDWARE_DB[gpu_name]
+
+     # --- Robust Bandwidth Lookup ---
+     nvlink_bw = gpu_spec.get("interconnect_bw_gb_s", 0)
+     pcie_bw = gpu_spec.get("pcie_bw_gb_s", 64)
+
+     if connectivity_type == "NVLink":
+         interconnect_bw = nvlink_bw
+         if interconnect_bw == 0:
+             return error_result(f"Error: {gpu_name} does not support NVLink.")
+     elif connectivity_type == "PCIe / Standard":
+         interconnect_bw = pcie_bw
+     else:  # Auto
+         interconnect_bw = nvlink_bw if nvlink_bw > 0 else pcie_bw
+
+     interconnect_bw_effective = interconnect_bw * INTERCONNECT_EFFICIENCY * 1e9
+
+     # --- Precision ---
+     fp4_supported = gpu_spec.get("fp4_supported", False)
+
+     if quantization == "FP16/BF16":
+         bytes_per_param = 2
+     elif quantization == "INT8":
+         bytes_per_param = 1
+     elif quantization == "FP4":
+         if not fp4_supported:
+             return error_result(f"Error: {gpu_name} does not support FP4.")
+         bytes_per_param = 0.5
+     else:
+         bytes_per_param = 2
+
+     # --- Memory Calculations ---
+     mem_weights = analyzer.total_params * bytes_per_param
+
+     head_dim = analyzer.hidden_size // analyzer.num_heads
+     total_tokens = context_in + context_out
+     # KV Cache: 2 (K+V) * layers * kv_heads * head_dim * tokens * batch * bytes (2 for FP16)
+     mem_kv = (
+         2
+         * analyzer.num_layers
+         * analyzer.num_kv_heads
+         * head_dim
+         * total_tokens
+         * concurrent_users
+         * 2
+     )
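+     # Example: Llama-3-70B (80 layers, 8 KV heads, head_dim 128) with 10 users
+     # at 2048 in + 512 out tokens: 2*80*8*128*2560*10*2 bytes ~= 8.4 GB of KV.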
+
+     # Overhead: Reverted to simple 20% rule
+     mem_overhead = mem_weights * 0.20
+
+     total_mem_required = mem_weights + mem_kv + mem_overhead
+     gpu_mem_capacity = gpu_spec["memory_gb"] * (1024**3)
+
+     num_gpus = math.ceil(total_mem_required / gpu_mem_capacity)
+
+     # --- Latency & Physics ---
+     compute_mode = "fp16_tflops_dense"
+     total_compute_flops = (
+         gpu_spec.get(compute_mode, 100) * 1e12 * num_gpus * COMPUTE_EFFICIENCY
+     )
+     if quantization == "FP4":
+         total_compute_flops *= 2.5
+
+     total_mem_bw = (
+         gpu_spec.get("bandwidth_gb_s", 1000) * 1e9 * num_gpus * MEMORY_EFFICIENCY
+     )
+
+     # TTFT (Prefill)
+     prefill_ops = 2 * analyzer.active_params * context_in * concurrent_users
+     time_compute_prefill = prefill_ops / total_compute_flops
+     # Move weights + write KV
+     time_mem_prefill = (
+         mem_weights + (mem_kv * (context_in / total_tokens))
+     ) / total_mem_bw
+     ttft = max(time_compute_prefill, time_mem_prefill) + (0.05 * num_gpus)
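+     # The 0.05 s/GPU term is a flat scheduling/launch-overhead assumption.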
+
+     # TPOT (Decode)
+     gen_ops = 2 * analyzer.active_params * concurrent_users
+     t_compute = gen_ops / total_compute_flops
+
+     # Load all weights + active KV
+     bytes_moved = mem_weights + mem_kv
+     t_memory = bytes_moved / total_mem_bw
+
+     # Comm (AllReduce)
+     if num_gpus > 1:
+         comm_data_per_layer = (
+             2 * analyzer.hidden_size * concurrent_users * bytes_per_param
+         )
+         total_comm_data = comm_data_per_layer * analyzer.num_layers
+         t_comm = total_comm_data / interconnect_bw_effective
+     else:
+         t_comm = 0
+
+     itl = max(t_compute, t_memory) + t_comm
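+     # Decode is modeled as the slower of compute and weight/KV movement per step,
+     # plus an all-reduce term whenever the model is sharded across GPUs.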
+
+     # --- Result Formatting ---
+     server_name = gpu_spec.get("recommended_server", "Contact Lenovo Support")
+     if num_gpus > 8:
+         server_name += " (Requires Multi-Node Clustering)"
+
+     warnings = []
+     if interconnect_bw < 100 and num_gpus > 1:
+         warnings.append(
+             "Warning: PCIe Bottleneck - High latency expected without NVLink."
+         )
+     if itl > 0.150:
+         warnings.append(
+             f"Warning: High Latency - ITL is {itl * 1000:.0f}ms (exceeds 150ms threshold)."
+         )
+     if analyzer.is_moe:
+         warnings.append(
+             f"Info: MoE Model - Using active params {analyzer.active_params / 1e9:.1f}B for compute estimates."
+         )
+
+     # Chart (Per GPU)
+     fig = create_mem_chart_per_gpu(
+         mem_weights, mem_kv, mem_overhead, gpu_mem_capacity, num_gpus
+     )
+
+     # Textual memory breakdown for accessibility (WCAG 1.1.1 - Text Alternatives)
+     w_per_gb = (mem_weights / num_gpus) / (1024**3)
+     k_per_gb = (mem_kv / num_gpus) / (1024**3)
+     o_per_gb = (mem_overhead / num_gpus) / (1024**3)
+     cap_gb = gpu_mem_capacity / (1024**3)
+     used_gb = w_per_gb + k_per_gb + o_per_gb
+     free_gb = max(0, cap_gb - used_gb)
+     total_used_pct = (used_gb / cap_gb * 100) if cap_gb > 0 else 0
+
+     mem_text_alt = (
+         f"Per-GPU Memory Breakdown: Weights {w_per_gb:.1f} GB ({w_per_gb / cap_gb * 100:.1f}%), "
+         f"KV Cache {k_per_gb:.1f} GB ({k_per_gb / cap_gb * 100:.1f}%), "
+         f"Overhead {o_per_gb:.1f} GB ({o_per_gb / cap_gb * 100:.1f}%), "
+         f"Free {free_gb:.1f} GB ({free_gb / cap_gb * 100:.1f}%). "
+         f"Total used: {used_gb:.1f} GB of {cap_gb:.0f} GB ({total_used_pct:.1f}%)."
+     )
+
+     return (
+         f"{analyzer.total_params / 1e9:.1f}B",
+         f"{total_mem_required / (1024**3):.1f} GB",
+         num_gpus,
+         f"{ttft * 1000:.0f} ms",
+         f"{itl * 1000:.0f} ms",
+         server_name,
+         "\n".join(warnings) if warnings else "No warnings.",
+         fig,
+         mem_text_alt,
+     )
+
+
+ def create_mem_chart_per_gpu(weights, kv, overhead, single_gpu_cap, num_gpus):
+     # Normalize to Per-GPU view
+     w_per = (weights / num_gpus) / (1024**3)
+     k_per = (kv / num_gpus) / (1024**3)
+     o_per = (overhead / num_gpus) / (1024**3)
+     cap_gb = single_gpu_cap / (1024**3)
+
+     used = w_per + k_per + o_per
+     free = max(0, cap_gb - used)
+
+     # WCAG AA compliant colors with high contrast
+     # Using colors that work well with both light and dark backgrounds
+     labels = ["Weights", "KV Cache", "Overhead", "Free (Per GPU)"]
+     sizes = [w_per, k_per, o_per, free]
+     # High contrast colors: blue, purple, orange, gray
+     colors = ["#2563eb", "#7c3aed", "#ea580c", "#6b7280"]
+
+     fig, ax = plt.subplots(figsize=(6, 6))
+
+     # Enhanced labels with both percentage and GB values for clarity
+     def make_autopct(values):
+         def my_autopct(pct):
+             total = sum(values)
+             val = pct * total / 100.0
+             return f"{pct:.1f}%\n({val:.1f} GB)" if val > 0.1 else ""
+
+         return my_autopct
+
+     wedges, texts, autotexts = ax.pie(
+         sizes,
+         labels=labels,
+         autopct=make_autopct(sizes),
+         colors=colors,
+         startangle=90,
+         textprops={"fontsize": 10, "weight": "bold"},
+     )
+
+     # Ensure text is readable (WCAG contrast)
+     for autotext in autotexts:
+         autotext.set_color("white")
+         autotext.set_weight("bold")
+
+     ax.set_title(
+         f"Per-GPU Memory Usage (Capacity: {cap_gb:.0f} GB)",
+         fontsize=12,
+         fontweight="bold",
+         pad=20,
+     )
+     ax.axis("equal")
+     plt.tight_layout()
+     plt.close(fig)
+     return fig
+
+
+ def error_result(msg):
+     empty_fig = plt.figure()
+     plt.close(empty_fig)
+     return (
+         "Error",
+         "Error",
+         0,
+         "-",
+         "-",
+         "Check Inputs",
+         f"Error: {msg}",
+         empty_fig,
+         "Memory breakdown not available due to calculation error.",
+     )
+
+
+ # --- UI Setup ---
+ with gr.Blocks(title="GPUguesstimator", theme=gr.themes.Soft()) as demo:
+     gr.Markdown(
+         """
+         # GPUguesstimator
+
+         Physics-based sizing tool for calculating VRAM requirements, compute capacity, and interconnect bottlenecks for Large Language Model inference.
+         """
+     )
+
+     with gr.Row():
+         with gr.Column():
+             gr.Markdown("## 1. Workload Configuration")
+             model_keys = list(MODELS_DB.keys())
+             model_dd = gr.Dropdown(
+                 choices=model_keys + ["Custom"],
+                 value=model_keys[0] if model_keys else "Custom",
+                 label="Model Preset",
+                 info="Select a preset model or choose Custom to enter a HuggingFace repository ID",
+             )
+             repo_input = gr.Textbox(
+                 label="HuggingFace Repository ID",
+                 value=model_keys[0] if model_keys else "",
+                 placeholder="e.g., meta-llama/Meta-Llama-3-70B-Instruct",
+                 info="Enter the HuggingFace model repository identifier",
+             )
+             hf_token = gr.Textbox(
+                 label="HuggingFace Token (Optional)",
+                 type="password",
+                 info="Required for accessing gated models. Leave empty for public models.",
+             )
+
+             users = gr.Slider(
+                 1,
+                 500,
+                 value=10,
+                 step=1,
+                 label="Concurrent Users",
+                 info="Number of simultaneous inference requests to handle",
+             )
+             ctx_in = gr.Slider(
+                 128,
+                 128000,
+                 value=2048,
+                 step=128,
+                 label="Input Context Length (Tokens)",
+                 info="Maximum number of input tokens per request",
+             )
+             ctx_out = gr.Slider(
+                 128,
+                 16384,
+                 value=512,
+                 step=128,
+                 label="Output Tokens (Generation Length)",
+                 info="Maximum number of tokens to generate per request",
+             )
+
+             gr.Markdown("## 2. Infrastructure Configuration")
+             gpu_keys = list(HARDWARE_DB.keys())
+             default_gpu = gpu_keys[0] if gpu_keys else "NVIDIA H100-80GB SXM5"
+
+             gpu_select = gr.Dropdown(
+                 choices=gpu_keys,
+                 value=default_gpu,
+                 label="GPU Model",
+                 info="Select the GPU model for inference",
+             )
+             conn_select = gr.Dropdown(
+                 choices=["Auto", "NVLink", "PCIe / Standard"],
+                 value="Auto",
+                 label="Interconnect Type",
+                 info="Auto uses GPU default, NVLink for high-bandwidth, PCIe for standard connections",
+             )
+             quant_select = gr.Dropdown(
+                 choices=["FP16/BF16", "INT8", "FP4"],
+                 value="FP16/BF16",
+                 label="Quantization Precision",
+                 info="Model weight precision: FP16/BF16 (standard), INT8 (8-bit), FP4 (4-bit, requires an FP4-capable GPU)",
+             )
+
+             btn = gr.Button("Calculate Sizing", variant="primary", size="lg")
+
+         with gr.Column():
+             gr.Markdown("## 3. Sizing Results")
+             with gr.Group():
+                 res_gpus = gr.Number(
+                     label="GPUs Required",
+                     precision=0,
+                     info="Minimum number of GPUs needed to fit the model and workload",
+                 )
+                 res_server = gr.Textbox(
+                     label="Recommended Lenovo Server",
+                     info="Suggested Lenovo server configuration",
+                 )
+                 res_vram = gr.Textbox(
+                     label="Total VRAM Required",
+                     info="Total video memory needed across all GPUs",
+                 )
+                 res_params = gr.Textbox(
+                     label="Model Parameters",
+                     info="Total number of model parameters in billions",
+                 )
+             with gr.Row():
+                 res_ttft = gr.Textbox(
+                     label="TTFT - Time to First Token",
+                     info="Prefill latency: time to process input and generate first token",
+                 )
+                 res_itl = gr.Textbox(
+                     label="ITL - Inter-Token Latency",
+                     info="Generation speed: time between each generated token",
+                 )
+             res_warnings = gr.Textbox(
+                 label="Analysis Notes and Warnings",
+                 lines=4,
+                 info="Important notes, warnings, and recommendations about the configuration",
+             )
+             plot_output = gr.Plot(label="Per-GPU Memory Breakdown Chart")
+             mem_text_alt = gr.Textbox(
+                 label="Memory Breakdown (Text Description)",
+                 info="Textual description of memory allocation for screen readers and accessibility",
+                 lines=2,
+             )
+
+     def update_repo(choice):
+         return choice if choice != "Custom" else ""
+
+     model_dd.change(update_repo, model_dd, repo_input)
+
+     btn.click(
+         calculate_dimensioning,
+         inputs=[
+             repo_input,
+             hf_token,
+             gpu_select,
+             conn_select,
+             users,
+             ctx_in,
+             ctx_out,
+             quant_select,
+         ],
+         outputs=[
+             res_params,
+             res_vram,
+             res_gpus,
+             res_ttft,
+             res_itl,
+             res_server,
+             res_warnings,
+             plot_output,
+             mem_text_alt,
+         ],
+     )
+
+ if __name__ == "__main__":
+     demo.launch()
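
For a quick check outside the UI, the sizing engine can be driven directly; a hypothetical smoke test (assumes the presets shipped in this commit):

```python
# Hypothetical smoke test: exercise the sizing engine without launching Gradio.
from app import calculate_dimensioning

params, vram, num_gpus, ttft, itl, server, notes, fig, alt_text = calculate_dimensioning(
    "meta-llama/Meta-Llama-3-70B-Instruct",  # preset resolved from models.yaml
    None,                  # HF token not needed for presets
    "NVIDIA H100-80GB SXM5",
    "Auto",                # picks NVLink when the GPU has it
    10,                    # concurrent users
    2048,                  # input tokens
    512,                   # output tokens
    "FP16/BF16",
)
print(params, vram, num_gpus, ttft, itl, server)
```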
hardware_data.yaml ADDED
@@ -0,0 +1,80 @@
+ gpus:
+   - name: "NVIDIA A100-80GB SXM"
+     memory_gb: 80
+     bandwidth_gb_s: 2039
+     fp16_tflops_dense: 312
+     interconnect_bw_gb_s: 600
+     pcie_bw_gb_s: 64
+     fp4_supported: false
+     recommended_server: "Lenovo ThinkSystem SR670 V2 / SR675 V3"
+     cost_tier: "High"
+
+   - name: "NVIDIA A100-80GB PCIe"
+     memory_gb: 80
+     bandwidth_gb_s: 1935
+     fp16_tflops_dense: 312
+     interconnect_bw_gb_s: 0
+     pcie_bw_gb_s: 64
+     fp4_supported: false
+     recommended_server: "Lenovo ThinkSystem SR650 V3 / SR670 V2"
+     cost_tier: "Medium-High"
+
+   - name: "NVIDIA H100-80GB SXM5"
+     memory_gb: 80
+     bandwidth_gb_s: 3350
+     fp16_tflops_dense: 989
+     interconnect_bw_gb_s: 900
+     pcie_bw_gb_s: 128
+     fp4_supported: true
+     recommended_server: "Lenovo ThinkSystem SR675 V3 / SR680a V3"
+     cost_tier: "Premium"
+
+   - name: "NVIDIA H100 NVL (PCIe Pair)"
+     memory_gb: 94
+     bandwidth_gb_s: 3900
+     fp16_tflops_dense: 835
+     interconnect_bw_gb_s: 600
+     pcie_bw_gb_s: 128
+     fp4_supported: true
+     recommended_server: "Lenovo ThinkSystem SR675 V3"
+     cost_tier: "Premium"
+
+   - name: "NVIDIA H200-141GB SXM"
+     memory_gb: 141
+     bandwidth_gb_s: 4800
+     fp16_tflops_dense: 989
+     interconnect_bw_gb_s: 900
+     pcie_bw_gb_s: 128
+     fp4_supported: true
+     recommended_server: "Lenovo ThinkSystem SR675 V3 / SR680a V3"
+     cost_tier: "Premium+"
+
+   - name: "NVIDIA RTX 6000 Ada"
+     memory_gb: 48
+     bandwidth_gb_s: 960
+     fp16_tflops_dense: 91
+     interconnect_bw_gb_s: 0
+     pcie_bw_gb_s: 64
+     fp4_supported: false
+     recommended_server: "Lenovo ThinkStation PX / ThinkSystem SR650 V3"
+     cost_tier: "Entry-Ent"
+
+   - name: "NVIDIA B200 (Blackwell)"
+     memory_gb: 192
+     bandwidth_gb_s: 8000
+     fp16_tflops_dense: 2250
+     interconnect_bw_gb_s: 1800
+     pcie_bw_gb_s: 128
+     fp4_supported: true
+     recommended_server: "Lenovo ThinkSystem SR685a V3"
+     cost_tier: "Next-Gen"
+
+   - name: "NVIDIA GB200 (Grace Blackwell Superchip)"
+     memory_gb: 384
+     bandwidth_gb_s: 16000
+     fp16_tflops_dense: 5000
+     interconnect_bw_gb_s: 3600
+     pcie_bw_gb_s: 256
+     fp4_supported: true
+     recommended_server: "Lenovo ThinkSystem SR780a V3 (Liquid Cooled)"
+     cost_tier: "Hyperscale"
models.yaml ADDED
@@ -0,0 +1,71 @@
+ models:
+   "meta-llama/Meta-Llama-3-70B-Instruct":
+     hidden_size: 8192
+     num_hidden_layers: 80
+     num_attention_heads: 64
+     num_key_value_heads: 8
+     vocab_size: 128256
+     max_position_embeddings: 8192
+     intermediate_size: 28672
+
+   "meta-llama/Meta-Llama-3-8B-Instruct":
+     hidden_size: 4096
+     num_hidden_layers: 32
+     num_attention_heads: 32
+     num_key_value_heads: 8
+     vocab_size: 128256
+     max_position_embeddings: 8192
+     intermediate_size: 14336
+
+   "mistralai/Mixtral-8x7B-Instruct-v0.1":
+     hidden_size: 4096
+     num_hidden_layers: 32
+     num_attention_heads: 32
+     num_key_value_heads: 8
+     vocab_size: 32000
+     max_position_embeddings: 32768
+     intermediate_size: 14336
+     notes:
+       moe:
+         num_local_experts: 8
+         num_experts_per_tok: 2
+
+   "mistralai/Mistral-7B-Instruct-v0.3":
+     hidden_size: 4096
+     num_hidden_layers: 32
+     num_attention_heads: 32
+     num_key_value_heads: 8
+     vocab_size: 32768
+     max_position_embeddings: 32768
+     intermediate_size: 14336
+
+   "google/gemma-7b":
+     hidden_size: 3072
+     num_hidden_layers: 28
+     num_attention_heads: 16
+     num_key_value_heads: 16
+     vocab_size: 256000
+     max_position_embeddings: 8192
+     intermediate_size: 24576
+
+   "openai/gpt-oss-20b":
+     hidden_size: 2880
+     num_hidden_layers: 24
+     num_attention_heads: 64
+     num_key_value_heads: 8
+     vocab_size: 201088
+     max_position_embeddings: 131072
+     intermediate_size: 2880
+     num_local_experts: 32
+     num_experts_per_tok: 4
+
+   "openai/gpt-oss-120b":
+     hidden_size: 2880
+     num_hidden_layers: 36
+     num_attention_heads: 64
+     num_key_value_heads: 8
+     vocab_size: 201088
+     max_position_embeddings: 131072
+     intermediate_size: 2880
+     num_local_experts: 128
+     num_experts_per_tok: 4
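
As a sanity check on the MoE entries, plugging the gpt-oss-20b values into app.py's parameter formulas reproduces the advertised scale (back-of-envelope, using the app's approximations such as head_dim = hidden // heads):

```python
# Back-of-envelope check of the gpt-oss-20b entry against app.py's formulas.
h, layers, heads, kv_heads = 2880, 24, 64, 8
vocab, inter, experts, active = 201088, 2880, 32, 4

kv_dim = (h // heads) * kv_heads          # app.py's head_dim approximation
attn = 2 * h * h + 2 * h * kv_dim         # Q/O plus K/V projections
mlp = 3 * h * inter                       # gate/up/down, per expert
norm = 2 * h

total = vocab * h + layers * (attn + mlp * experts + norm)
active_p = vocab * h + layers * (attn + mlp * active + norm)
print(f"total ~ {total / 1e9:.1f}B, active ~ {active_p / 1e9:.1f}B")  # ~20.1B / ~3.4B
```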
pyproject.toml ADDED
@@ -0,0 +1,20 @@
+ [project]
+ name = "gpuguesstimator"
+ version = "0.1.0"
+ description = "GPUguesstimator — a KV-cache–aware GPU sizing simulator for LLM serving (vLLM-friendly)"
+ readme = "README.md"
+ requires-python = ">=3.10"
+ dependencies = [
+     "gradio>=4.0.0",
+     "pyyaml>=6.0.1",
+     "huggingface_hub>=0.22.0",
+     "requests>=2.31.0",
+     "matplotlib>=3.7.0",
+ ]
+
+ [project.optional-dependencies]
+ dev = [
+     "ruff>=0.5.0",
+ ]
+
+ [tool.uv]
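
Because uv.lock is checked in alongside pyproject.toml, the environment can also be reproduced with uv's project workflow (an alternative to the requirements.txt commands in the README):

```bash
uv sync              # create .venv from pyproject.toml + uv.lock
uv run python app.py
```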
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio>=4.0.0
+ pyyaml>=6.0.1
+ huggingface_hub>=0.22.0
+ requests>=2.31.0
+ matplotlib>=3.7.0
uv.lock ADDED
The diff for this file is too large to render. See raw diff