tog committed on
Commit
1968f0b
·
1 Parent(s): 83ef6b4

feat: add RAG support, improve parameter calculation, and enhance UI

Browse files

- Add RAG pipeline support with embedding and reranker model selection (WIP)
- Improve parameter calculation using HuggingFace API safetensors metadata
- Enhance MoE detection and active parameter calculation
- Add configurable GPU memory overhead percentage slider
- Improve network/interconnect logic with PCIe bottleneck handling
- Add Qwen3-VL models (235B and 30B variants) to models.yaml
- Reorder hardware_data.yaml by cost_tier for better UX
- Remove legend from donut chart (hover tooltips provide info)
- Update memory breakdown to include RAG models category
- Fix text_config handling for vision-language models

Files changed (3) hide show
  1. app.py +312 -123
  2. hardware_data.yaml +42 -22
  3. models.yaml +52 -0
app.py CHANGED
@@ -5,7 +5,7 @@ import matplotlib.pyplot as plt
5
  import plotly.graph_objects as go
6
  import os
7
  import json
8
- from huggingface_hub import hf_hub_download
9
 
10
  # --- Configuration & Constants ---
11
  HARDWARE_FILE = "hardware_data.yaml"
@@ -16,6 +16,27 @@ COMPUTE_EFFICIENCY = 0.45
16
  MEMORY_EFFICIENCY = 0.70
17
  INTERCONNECT_EFFICIENCY = 0.65
18
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
  # --- Data Loading ---
21
  def load_hardware_data():
@@ -44,7 +65,18 @@ class ModelAnalyzer:
44
  self.repo_id = repo_id
45
  self.config = {}
46
  self.error = None
 
47
 
 
 
 
 
 
 
 
 
 
 
48
  if repo_id in MODELS_DB:
49
  self.config = MODELS_DB[repo_id]
50
  else:
@@ -60,69 +92,108 @@ class ModelAnalyzer:
60
  return
61
 
62
  try:
63
- self.hidden_size = self.config.get("hidden_size", 4096)
64
- self.num_layers = self.config.get("num_hidden_layers", 32)
65
- self.num_heads = self.config.get("num_attention_heads", 32)
66
- self.num_kv_heads = self.config.get("num_key_value_heads", self.num_heads)
67
- self.vocab_size = self.config.get("vocab_size", 32000)
68
- self.max_context = self.config.get("max_position_embeddings", 4096)
69
- self.intermediate_size = self.config.get(
 
 
 
 
 
 
 
 
70
  "intermediate_size", self.hidden_size * 4
71
  )
72
 
 
73
  self.is_moe = False
74
  self.num_experts = 1
75
  self.active_experts = 1
76
 
77
- if "num_local_experts" in self.config:
78
- self.is_moe = True
79
- self.num_experts = self.config["num_local_experts"]
80
- self.active_experts = self.config.get("num_experts_per_tok", 2)
81
- elif "notes" in self.config and "moe" in self.config["notes"]:
82
- moe_cfg = self.config["notes"]["moe"]
83
- self.is_moe = True
84
- self.num_experts = moe_cfg.get("num_local_experts", 8)
85
- self.active_experts = moe_cfg.get("num_experts_per_tok", 2)
86
 
 
87
  self.calculate_params()
 
88
  except Exception as e:
89
  self.error = f"Error parsing config: {str(e)}"
90
 
91
- def calculate_params(self):
92
- self.params_embed = self.vocab_size * self.hidden_size
93
- head_dim = self.hidden_size // self.num_heads
94
- kv_dim = head_dim * self.num_kv_heads
95
-
96
- self.params_attn = (
97
- (self.hidden_size * self.hidden_size)
98
- + (self.hidden_size * kv_dim)
99
- + (self.hidden_size * kv_dim)
100
- + (self.hidden_size * self.hidden_size)
101
- )
102
 
103
- dense_mlp = 3 * self.hidden_size * self.intermediate_size
 
 
 
 
 
104
 
105
  if self.is_moe:
106
- self.params_mlp_total = dense_mlp * self.num_experts
107
- self.params_mlp_active = dense_mlp * self.active_experts
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
108
  else:
109
- self.params_mlp_total = dense_mlp
110
- self.params_mlp_active = dense_mlp
 
 
 
 
 
 
 
 
 
111
 
112
- self.params_norm = 2 * self.hidden_size
113
- self.params_layer_total = (
114
- self.params_attn + self.params_mlp_total + self.params_norm
115
- )
116
- self.params_layer_active = (
117
- self.params_attn + self.params_mlp_active + self.params_norm
118
- )
119
 
120
- self.total_params = self.params_embed + (
121
- self.num_layers * self.params_layer_total
122
- )
123
- self.active_params = self.params_embed + (
124
- self.num_layers * self.params_layer_active
125
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
126
 
127
 
128
  # --- Calculation Engine ---
@@ -135,6 +206,10 @@ def calculate_dimensioning(
135
  context_in,
136
  context_out,
137
  quantization,
 
 
 
 
138
  ):
139
  analyzer = ModelAnalyzer(model_name_or_repo, hf_token)
140
  if analyzer.error:
@@ -145,20 +220,24 @@ def calculate_dimensioning(
145
 
146
  gpu_spec = HARDWARE_DB[gpu_name]
147
 
148
- # --- Robust Bandwidth Lookup ---
149
  nvlink_bw = gpu_spec.get("interconnect_bw_gb_s", 0)
150
  pcie_bw = gpu_spec.get("pcie_bw_gb_s", 64)
 
151
 
152
  if connectivity_type == "NVLink":
153
- interconnect_bw = nvlink_bw
154
- if interconnect_bw == 0:
155
  return error_result(f"Error: {gpu_name} does not support NVLink.")
 
 
156
  elif connectivity_type == "PCIe / Standard":
157
- interconnect_bw = pcie_bw
 
158
  else: # Auto
159
- interconnect_bw = nvlink_bw if nvlink_bw > 0 else pcie_bw
160
-
161
- interconnect_bw_effective = interconnect_bw * INTERCONNECT_EFFICIENCY * 1e9
 
162
 
163
  # --- Precision ---
164
  fp4_supported = gpu_spec.get("fp4_supported", False)
@@ -174,70 +253,115 @@ def calculate_dimensioning(
174
  else:
175
  bytes_per_param = 2
176
 
177
- # --- Memory Calculations ---
 
 
178
  mem_weights = analyzer.total_params * bytes_per_param
179
 
 
 
 
 
 
 
 
 
 
 
180
  head_dim = analyzer.hidden_size // analyzer.num_heads
181
  total_tokens = context_in + context_out
182
- # KV Cache: 2 (K+V) * layers * kv_heads * head_dim * tokens * batch * bytes(2 for FP16)
183
- mem_kv = (
 
 
184
  2
185
  * analyzer.num_layers
186
  * analyzer.num_kv_heads
187
  * head_dim
188
  * total_tokens
189
- * concurrent_users
190
- * 2
191
  )
192
 
193
- # Overhead: Reverted to simple 20% rule
194
- mem_overhead = mem_weights * 0.20
195
 
196
- total_mem_required = mem_weights + mem_kv + mem_overhead
197
- gpu_mem_capacity = gpu_spec["memory_gb"] * (1024**3)
198
 
 
 
 
 
 
199
  num_gpus = math.ceil(total_mem_required / gpu_mem_capacity)
200
 
201
- # --- Latency & Physics ---
202
  compute_mode = "fp16_tflops_dense"
203
- total_compute_flops = (
204
- gpu_spec.get(compute_mode, 100) * 1e12 * num_gpus * COMPUTE_EFFICIENCY
205
  )
206
  if quantization == "FP4":
207
- total_compute_flops *= 2.5
208
 
209
- total_mem_bw = (
210
- gpu_spec.get("bandwidth_gb_s", 1000) * 1e9 * num_gpus * MEMORY_EFFICIENCY
211
  )
212
 
213
- # TTFT (Prefill)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
214
  prefill_ops = 2 * analyzer.active_params * context_in * concurrent_users
215
- time_compute_prefill = prefill_ops / total_compute_flops
216
- # Move weights + write KV
217
- time_mem_prefill = (
218
- mem_weights + (mem_kv * (context_in / total_tokens))
219
- ) / total_mem_bw
220
- ttft = max(time_compute_prefill, time_mem_prefill) + (0.05 * num_gpus)
221
-
222
- # TPOT (Decode)
223
- gen_ops = 2 * analyzer.active_params * concurrent_users
224
- t_compute = gen_ops / total_compute_flops
225
 
226
- # Load all weights + active KV
227
- bytes_moved = mem_weights + mem_kv
228
- t_memory = bytes_moved / total_mem_bw
229
 
230
- # Comm (AllReduce)
231
- if num_gpus > 1:
232
- comm_data_per_layer = (
233
- 2 * analyzer.hidden_size * concurrent_users * bytes_per_param
234
- )
235
- total_comm_data = comm_data_per_layer * analyzer.num_layers
236
- t_comm = total_comm_data / interconnect_bw_effective
237
- else:
238
- t_comm = 0
239
-
240
- itl = max(t_compute, t_memory) + t_comm
241
 
242
  # --- Result Formatting ---
243
  server_name = gpu_spec.get("recommended_server", "Contact Lenovo Support")
@@ -245,49 +369,66 @@ def calculate_dimensioning(
245
  server_name += " (Requires Multi-Node Clustering)"
246
 
247
  warnings = []
248
- if interconnect_bw < 100 and num_gpus > 1:
249
  warnings.append(
250
- "Warning: PCIe Bottleneck - High latency expected without NVLink."
251
  )
252
  if itl > 0.150:
253
  warnings.append(
254
- f"Warning: High Latency - ITL is {itl * 1000:.0f}ms (exceeds 150ms threshold)."
 
 
 
 
255
  )
256
  if analyzer.is_moe:
257
  warnings.append(
258
- f"Info: MoE Model - Using active params {analyzer.active_params / 1e9:.1f}B for compute estimates."
 
 
 
 
259
  )
260
 
261
  # Chart (Per GPU)
 
262
  fig = create_mem_chart_per_gpu(
263
- mem_weights, mem_kv, mem_overhead, gpu_mem_capacity, num_gpus
 
 
 
 
 
264
  )
265
 
266
  # Textual memory breakdown for accessibility (WCAG 1.1.1 - Text Alternatives)
267
  w_per_gb = (mem_weights / num_gpus) / (1024**3)
268
- k_per_gb = (mem_kv / num_gpus) / (1024**3)
269
- o_per_gb = (mem_overhead / num_gpus) / (1024**3)
 
270
  cap_gb = gpu_mem_capacity / (1024**3)
271
- used_gb = w_per_gb + k_per_gb + o_per_gb
272
  free_gb = max(0, cap_gb - used_gb)
273
  total_used_pct = (used_gb / cap_gb * 100) if cap_gb > 0 else 0
274
 
275
  # Calculate percentages for display
276
  w_pct = (w_per_gb / cap_gb * 100) if cap_gb > 0 else 0
277
- k_pct = (k_per_gb / cap_gb * 100) if cap_gb > 0 else 0
 
278
  o_pct = (o_per_gb / cap_gb * 100) if cap_gb > 0 else 0
279
  free_pct = (free_gb / cap_gb * 100) if cap_gb > 0 else 0
280
 
281
  mem_text_alt = (
282
  f"Per-GPU Memory Breakdown (Total Capacity: {cap_gb:.0f} GB):\n"
283
  f"• Weights: {w_per_gb:.1f} GB ({w_pct:.1f}%) - Model parameters stored in memory. Fixed size based on model architecture and quantization.\n"
284
- f"• KV Cache: {k_per_gb:.1f} GB ({k_pct:.1f}%) - Attention key-value cache for all tokens. Grows with number of concurrent users, input context length, and output tokens.\n"
285
- f"• Overhead: {o_per_gb:.1f} GB ({o_pct:.1f}%) - Activation buffers, CUDA context, and memory fragmentation. Typically 20% of weights size.\n"
 
286
  f"• Free: {free_gb:.1f} GB ({free_pct:.1f}%) - Available memory headroom for additional operations."
287
  )
288
 
289
  return (
290
- f"{analyzer.total_params / 1e9:.1f}B",
291
  f"{total_mem_required / (1024**3):.1f} GB",
292
  num_gpus,
293
  f"{ttft * 1000:.0f} ms",
@@ -299,44 +440,62 @@ def calculate_dimensioning(
299
  )
300
 
301
 
302
- def create_mem_chart_per_gpu(weights, kv, overhead, single_gpu_cap, num_gpus):
 
 
303
  # Normalize to Per-GPU view
304
  w_per = (weights / num_gpus) / (1024**3)
305
- k_per = (kv / num_gpus) / (1024**3)
 
306
  o_per = (overhead / num_gpus) / (1024**3)
307
  cap_gb = single_gpu_cap / (1024**3)
308
 
309
- used = w_per + k_per + o_per
310
  free = max(0, cap_gb - used)
311
 
312
  # Modern, accessible color palette (WCAG AA compliant)
313
- # Using a professional palette with good contrast
314
- labels = ["Weights", "KV Cache", "Overhead", "Free (Per GPU)"]
315
- values = [w_per, k_per, o_per, free]
 
 
 
 
 
316
 
317
- # Professional color palette: Blue, Orange, Green, Gray
318
- # High contrast and visually distinct
319
- colors = ["#4A90E2", "#F5A623", "#7ED321", "#BDC3C7"]
 
 
 
 
 
320
 
321
  # Calculate percentages for hover text
322
- total = sum(values)
323
- percentages = [(v / total * 100) if total > 0 else 0 for v in values]
 
 
 
324
 
325
  # Create hover text with detailed information
 
 
326
  hover_texts = [
327
- f"{labels[i]}<br>"
328
- f"Value: {values[i]:.1f} GB<br>"
329
  f"Percentage: {percentages[i]:.1f}%<br>"
330
  f"Capacity: {cap_gb:.0f} GB"
331
- for i in range(len(labels))
332
  ]
333
 
334
  # Create donut chart using plotly
335
  fig = go.Figure(
336
  data=[
337
  go.Pie(
338
- labels=labels,
339
- values=values,
340
  hole=0.5, # Creates the donut (hole in the middle)
341
  marker=dict(colors=colors, line=dict(color="#FFFFFF", width=2)),
342
  textinfo="label+percent",
@@ -355,8 +514,7 @@ def create_mem_chart_per_gpu(weights, kv, overhead, single_gpu_cap, num_gpus):
355
  "xanchor": "center",
356
  "font": {"size": 16, "family": "Arial, sans-serif"},
357
  },
358
- showlegend=True,
359
- legend=dict(orientation="v", yanchor="middle", y=0.5, x=1.15),
360
  font=dict(family="Arial, sans-serif", size=12),
361
  margin=dict(l=20, r=20, t=50, b=20),
362
  height=500,
@@ -461,6 +619,25 @@ with gr.Blocks(title="GPUguesstimator") as demo:
461
  info="Maximum number of tokens to generate per request",
462
  )
463
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
464
  gr.Markdown("## Infrastructure Configuration")
465
  gpu_keys = list(HARDWARE_DB.keys())
466
  default_gpu = gpu_keys[0] if gpu_keys else "NVIDIA H100-80GB SXM5"
@@ -483,6 +660,14 @@ with gr.Blocks(title="GPUguesstimator") as demo:
483
  label="Quantization Precision",
484
  info="Model weight precision: FP16/BF16 (standard), INT8 (8-bit), FP4 (4-bit, requires Blackwell)",
485
  )
 
 
 
 
 
 
 
 
486
 
487
  btn = gr.Button("Calculate Sizing", variant="primary", size="lg")
488
 
@@ -543,6 +728,10 @@ with gr.Blocks(title="GPUguesstimator") as demo:
543
  ctx_in,
544
  ctx_out,
545
  quant_select,
 
 
 
 
546
  ],
547
  outputs=[
548
  res_params,
 
5
  import plotly.graph_objects as go
6
  import os
7
  import json
8
+ from huggingface_hub import hf_hub_download, HfApi
9
 
10
  # --- Configuration & Constants ---
11
  HARDWARE_FILE = "hardware_data.yaml"
 
16
  MEMORY_EFFICIENCY = 0.70
17
  INTERCONNECT_EFFICIENCY = 0.65
18
 
19
+ # Defaults
20
+ ACTIVATION_MEMORY_BUFFER_GB = 0.5
21
+ DEFAULT_GPU_OVERHEAD_PCT = 20
22
+
23
+ # Embedding Models VRAM Est. (Weights + Runtime Buffer)
24
+ EMBEDDING_MODELS = {
25
+ "External/API (No Local VRAM)": 0.0,
26
+ "Mini (All-MiniLM-L6) ~0.2GB": 0.2,
27
+ "Standard (MPNet-Base/BGE-Base) ~0.6GB": 0.6,
28
+ "Large (BGE-M3/GTE-Large) ~2.5GB": 2.5,
29
+ "LLM-Based (E5-Mistral-7B) ~16GB": 16.0,
30
+ }
31
+
32
+ # Reranker Models VRAM Est. (Weights + Batch Processing Buffer)
33
+ RERANKER_MODELS = {
34
+ "None (Skip Reranking)": 0.0,
35
+ "Small (BGE-Reranker-Base) ~0.5GB": 0.5,
36
+ "Large (BGE-Reranker-Large) ~1.5GB": 1.5,
37
+ "LLM-Based (BGE-Reranker-v2-Gemma) ~10GB": 10.0,
38
+ }
39
+
40
 
41
  # --- Data Loading ---
42
  def load_hardware_data():
 
65
  self.repo_id = repo_id
66
  self.config = {}
67
  self.error = None
68
+ self.api = HfApi(token=hf_token.strip() if hf_token else None)
69
 
70
+ # 1. Try to get Model Info (Total Params) from API first
71
+ self.total_params_safetensors = None
72
+ try:
73
+ model_info = self.api.model_info(repo_id)
74
+ if hasattr(model_info, "safetensors") and model_info.safetensors and "total" in model_info.safetensors:
75
+ self.total_params_safetensors = model_info.safetensors["total"]
76
+ except Exception:
77
+ pass # Fallback to config parsing
78
+
79
+ # 2. Load Config
80
  if repo_id in MODELS_DB:
81
  self.config = MODELS_DB[repo_id]
82
  else:
 
92
  return
93
 
94
  try:
95
+ # Handle nested configs (common in multimodal)
96
+ if "text_config" in self.config:
97
+ self.llm_config = self.config["text_config"]
98
+ elif "llm_config" in self.config:
99
+ self.llm_config = self.config["llm_config"]
100
+ else:
101
+ self.llm_config = self.config
102
+
103
+ self.hidden_size = self.llm_config.get("hidden_size", 4096)
104
+ self.num_layers = self.llm_config.get("num_hidden_layers", 32)
105
+ self.num_heads = self.llm_config.get("num_attention_heads", 32)
106
+ self.num_kv_heads = self.llm_config.get("num_key_value_heads", self.num_heads)
107
+ self.vocab_size = self.llm_config.get("vocab_size", 32000)
108
+ self.max_context = self.llm_config.get("max_position_embeddings", 4096)
109
+ self.intermediate_size = self.llm_config.get(
110
  "intermediate_size", self.hidden_size * 4
111
  )
112
 
113
+ # MoE detection
114
  self.is_moe = False
115
  self.num_experts = 1
116
  self.active_experts = 1
117
 
118
+ # Check for MoE config patterns
119
+ self._detect_moe()
 
 
 
 
 
 
 
120
 
121
+ # Calculate Parameters
122
  self.calculate_params()
123
+
124
  except Exception as e:
125
  self.error = f"Error parsing config: {str(e)}"
126
 
127
+ def _detect_moe(self):
128
+ archs = self.config.get("architectures", [])
129
+ keys = set(self.config.keys()) | set(self.llm_config.keys())
 
 
 
 
 
 
 
 
130
 
131
+ if (
132
+ any("moe" in a.lower() for a in archs)
133
+ or any("moe" in k.lower() for k in keys)
134
+ or any("expert" in k.lower() for k in keys)
135
+ ):
136
+ self.is_moe = True
137
 
138
  if self.is_moe:
139
+ self.num_experts = (
140
+ self.llm_config.get("num_local_experts")
141
+ or self.llm_config.get("num_experts")
142
+ or self.llm_config.get("n_routed_experts")
143
+ or 8
144
+ )
145
+ self.active_experts = (
146
+ self.llm_config.get("num_experts_per_tok")
147
+ or self.llm_config.get("num_experts_per_token")
148
+ or 2
149
+ )
150
+ elif "notes" in self.config and "moe" in self.config["notes"]:
151
+ moe_cfg = self.config["notes"]["moe"]
152
+ self.is_moe = True
153
+ self.num_experts = moe_cfg.get("num_local_experts", 8)
154
+ self.active_experts = moe_cfg.get("num_experts_per_tok", 2)
155
+
156
+ def calculate_params(self):
157
+ # If we got exact params from safetensors, use that
158
+ if self.total_params_safetensors:
159
+ self.total_params = self.total_params_safetensors
160
  else:
161
+ # Fallback calculation
162
+ self.params_embed = self.vocab_size * self.hidden_size
163
+ head_dim = self.hidden_size // self.num_heads
164
+ kv_dim = head_dim * self.num_kv_heads
165
+
166
+ self.params_attn = (
167
+ (self.hidden_size * self.hidden_size)
168
+ + (self.hidden_size * kv_dim) * 2
169
+ + (self.hidden_size * self.hidden_size)
170
+ )
171
+ dense_mlp = 3 * self.hidden_size * self.intermediate_size
172
 
173
+ if self.is_moe:
174
+ mlp_total = dense_mlp * self.num_experts
175
+ else:
176
+ mlp_total = dense_mlp
 
 
 
177
 
178
+ self.params_norm = 2 * self.hidden_size
179
+ self.params_layer_total = (
180
+ self.params_attn + mlp_total + self.params_norm
181
+ )
182
+ self.total_params = self.params_embed + (
183
+ self.num_layers * self.params_layer_total
184
+ )
185
+
186
+ # Active Params Calculation (using improved heuristic for MoE)
187
+ if self.is_moe:
188
+ expert_param_fraction = 0.8 # 80% of params are in experts
189
+ always_active = self.total_params * (1 - expert_param_fraction)
190
+ expert_params = self.total_params * expert_param_fraction
191
+ expert_ratio = self.active_experts / self.num_experts
192
+ self.active_params = int(
193
+ always_active + (expert_params * expert_ratio)
194
+ )
195
+ else:
196
+ self.active_params = self.total_params
197
 
198
 
199
  # --- Calculation Engine ---
 
206
  context_in,
207
  context_out,
208
  quantization,
209
+ gpu_overhead_pct,
210
+ rag_enabled,
211
+ rag_model_key,
212
+ reranker_model_key,
213
  ):
214
  analyzer = ModelAnalyzer(model_name_or_repo, hf_token)
215
  if analyzer.error:
 
220
 
221
  gpu_spec = HARDWARE_DB[gpu_name]
222
 
223
+ # 2. Interconnect & Bandwidth Logic
224
  nvlink_bw = gpu_spec.get("interconnect_bw_gb_s", 0)
225
  pcie_bw = gpu_spec.get("pcie_bw_gb_s", 64)
226
+ gpu_has_nvlink = nvlink_bw > 0
227
 
228
  if connectivity_type == "NVLink":
229
+ if not gpu_has_nvlink:
 
230
  return error_result(f"Error: {gpu_name} does not support NVLink.")
231
+ using_nvlink = True
232
+ interconnect_bw_effective = nvlink_bw * INTERCONNECT_EFFICIENCY * 1e9
233
  elif connectivity_type == "PCIe / Standard":
234
+ using_nvlink = False
235
+ interconnect_bw_effective = pcie_bw * 1e9 # PCIe usually raw
236
  else: # Auto
237
+ using_nvlink = gpu_has_nvlink
238
+ interconnect_bw_effective = (
239
+ (nvlink_bw if using_nvlink else pcie_bw) * 1e9
240
+ )
241
 
242
  # --- Precision ---
243
  fp4_supported = gpu_spec.get("fp4_supported", False)
 
253
  else:
254
  bytes_per_param = 2
255
 
256
+ # --- MEMORY CALCULATION ---
257
+
258
+ # Static Footprint
259
  mem_weights = analyzer.total_params * bytes_per_param
260
 
261
+ # RAG Memory (Embedding + Reranker)
262
+ mem_rag = 0
263
+ if rag_enabled:
264
+ embed_gb = EMBEDDING_MODELS.get(rag_model_key, 0.6)
265
+ rerank_gb = RERANKER_MODELS.get(reranker_model_key, 0.5)
266
+ mem_rag = (embed_gb + rerank_gb) * (1024**3)
267
+
268
+ static_footprint = mem_weights + mem_rag
269
+
270
+ # Dynamic Footprint (KV + Activation per user)
271
  head_dim = analyzer.hidden_size // analyzer.num_heads
272
  total_tokens = context_in + context_out
273
+
274
+ # KV Cache
275
+ kv_bytes = 2
276
+ mem_kv_per_user = (
277
  2
278
  * analyzer.num_layers
279
  * analyzer.num_kv_heads
280
  * head_dim
281
  * total_tokens
282
+ * kv_bytes
 
283
  )
284
 
285
+ # Activation buffer
286
+ mem_act_per_user = ACTIVATION_MEMORY_BUFFER_GB * 1024**3
287
 
288
+ dynamic_per_user = mem_kv_per_user + mem_act_per_user
289
+ total_dynamic = dynamic_per_user * concurrent_users
290
 
291
+ # Total & Overhead
292
+ raw_total_mem = static_footprint + total_dynamic
293
+ total_mem_required = raw_total_mem * (1 + gpu_overhead_pct / 100)
294
+
295
+ gpu_mem_capacity = gpu_spec["memory_gb"] * (1024**3)
296
  num_gpus = math.ceil(total_mem_required / gpu_mem_capacity)
297
 
298
+ # --- LATENCY CALCULATION ---
299
  compute_mode = "fp16_tflops_dense"
300
+ single_gpu_flops = (
301
+ gpu_spec.get(compute_mode, 100) * 1e12 * COMPUTE_EFFICIENCY
302
  )
303
  if quantization == "FP4":
304
+ single_gpu_flops *= 2.5
305
 
306
+ single_gpu_bw = (
307
+ gpu_spec.get("bandwidth_gb_s", 1000) * 1e9 * MEMORY_EFFICIENCY
308
  )
309
 
310
+ if num_gpus == 1:
311
+ effective_flops = single_gpu_flops
312
+ effective_mem_bw = single_gpu_bw
313
+ ttft_penalty = 2.0
314
+ itl_penalty = 1.0
315
+ elif using_nvlink:
316
+ effective_flops = single_gpu_flops * num_gpus
317
+ effective_mem_bw = single_gpu_bw * num_gpus
318
+ ttft_penalty = 2.0
319
+ itl_penalty = 1.0
320
+ else:
321
+ # PCIe Bottleneck Logic
322
+ effective_flops = single_gpu_flops * num_gpus
323
+ effective_mem_bw = single_gpu_bw # Capped at single card
324
+ n = num_gpus
325
+ ttft_penalty = 1.2 * n * n - n
326
+ itl_penalty = n
327
+
328
+ # TTFT (Prefill) + RAG Latency
329
+
330
+ # 1. RAG Processing (Embedding + Reranking)
331
+ t_rag_processing = 0
332
+ if rag_enabled:
333
+ # Base Embedding Latency (Encode Query)
334
+ if "Mini" in rag_model_key:
335
+ t_rag_processing += 0.02
336
+ elif "Large" in rag_model_key:
337
+ t_rag_processing += 0.05
338
+ elif "LLM" in rag_model_key:
339
+ t_rag_processing += 0.15
340
+ else:
341
+ t_rag_processing += 0.03
342
+
343
+ # Reranking Latency (Process Documents)
344
+ if "None" not in reranker_model_key:
345
+ if "Small" in reranker_model_key:
346
+ t_rag_processing += 0.15 # 150ms
347
+ elif "Large" in reranker_model_key:
348
+ t_rag_processing += 0.35 # 350ms
349
+ elif "LLM" in reranker_model_key:
350
+ t_rag_processing += 0.80 # 800ms
351
+
352
+ # 2. LLM Compute Time
353
  prefill_ops = 2 * analyzer.active_params * context_in * concurrent_users
354
+ t_compute_prefill = (prefill_ops / effective_flops) * ttft_penalty
355
+ t_mem_prefill = mem_weights / effective_mem_bw
 
 
 
 
 
 
 
 
356
 
357
+ ttft = max(t_compute_prefill, t_mem_prefill) + t_rag_processing
 
 
358
 
359
+ # ITL (Decode)
360
+ gen_ops = 2 * analyzer.active_params * concurrent_users
361
+ t_compute_gen = (gen_ops / effective_flops) * itl_penalty
362
+ bytes_per_step = mem_weights + (total_dynamic / concurrent_users)
363
+ t_mem_gen = (bytes_per_step / effective_mem_bw) * itl_penalty
364
+ itl = max(t_compute_gen, t_mem_gen)
 
 
 
 
 
365
 
366
  # --- Result Formatting ---
367
  server_name = gpu_spec.get("recommended_server", "Contact Lenovo Support")
 
369
  server_name += " (Requires Multi-Node Clustering)"
370
 
371
  warnings = []
372
+ if not using_nvlink and num_gpus > 1:
373
  warnings.append(
374
+ f"⚠️ No NVLink: Effective Bandwidth capped at {gpu_spec['bandwidth_gb_s']} GB/s. High latency penalty."
375
  )
376
  if itl > 0.150:
377
  warnings.append(
378
+ f"⚠️ High Latency: ITL is {itl * 1000:.0f}ms (>150ms)."
379
+ )
380
+ if t_rag_processing > 0.5:
381
+ warnings.append(
382
+ f"⚠️ High RAG Latency: Reranking is adding {t_rag_processing * 1000:.0f}ms to TTFT."
383
  )
384
  if analyzer.is_moe:
385
  warnings.append(
386
+ f"ℹ️ MoE Model: Active params {analyzer.active_params / 1e9:.1f}B used for compute."
387
+ )
388
+ if rag_enabled:
389
+ warnings.append(
390
+ f"ℹ️ RAG Enabled: Allocating {mem_rag / (1024**3):.1f}GB for Models (Embed+Rerank)."
391
  )
392
 
393
  # Chart (Per GPU)
394
+ overhead_bytes = raw_total_mem * (gpu_overhead_pct / 100)
395
  fig = create_mem_chart_per_gpu(
396
+ mem_weights,
397
+ mem_rag,
398
+ total_dynamic,
399
+ overhead_bytes,
400
+ gpu_mem_capacity,
401
+ num_gpus,
402
  )
403
 
404
  # Textual memory breakdown for accessibility (WCAG 1.1.1 - Text Alternatives)
405
  w_per_gb = (mem_weights / num_gpus) / (1024**3)
406
+ r_per_gb = (mem_rag / num_gpus) / (1024**3)
407
+ d_per_gb = (total_dynamic / num_gpus) / (1024**3)
408
+ o_per_gb = (overhead_bytes / num_gpus) / (1024**3)
409
  cap_gb = gpu_mem_capacity / (1024**3)
410
+ used_gb = w_per_gb + r_per_gb + d_per_gb + o_per_gb
411
  free_gb = max(0, cap_gb - used_gb)
412
  total_used_pct = (used_gb / cap_gb * 100) if cap_gb > 0 else 0
413
 
414
  # Calculate percentages for display
415
  w_pct = (w_per_gb / cap_gb * 100) if cap_gb > 0 else 0
416
+ r_pct = (r_per_gb / cap_gb * 100) if cap_gb > 0 else 0
417
+ d_pct = (d_per_gb / cap_gb * 100) if cap_gb > 0 else 0
418
  o_pct = (o_per_gb / cap_gb * 100) if cap_gb > 0 else 0
419
  free_pct = (free_gb / cap_gb * 100) if cap_gb > 0 else 0
420
 
421
  mem_text_alt = (
422
  f"Per-GPU Memory Breakdown (Total Capacity: {cap_gb:.0f} GB):\n"
423
  f"• Weights: {w_per_gb:.1f} GB ({w_pct:.1f}%) - Model parameters stored in memory. Fixed size based on model architecture and quantization.\n"
424
+ f"• RAG Models: {r_per_gb:.1f} GB ({r_pct:.1f}%) - Embedding and reranker models. Only allocated if RAG is enabled.\n"
425
+ f"• Dynamic (KV+Act): {d_per_gb:.1f} GB ({d_pct:.1f}%) - KV cache and activation buffers. Grows with concurrent users, input context length, and output tokens.\n"
426
+ f"• Overhead: {o_per_gb:.1f} GB ({o_pct:.1f}%) - CUDA context, memory fragmentation, and system buffers. Configurable percentage of total memory.\n"
427
  f"• Free: {free_gb:.1f} GB ({free_pct:.1f}%) - Available memory headroom for additional operations."
428
  )
429
 
430
  return (
431
+ f"{analyzer.total_params / 1e9:.1f}B (Active: {analyzer.active_params / 1e9:.1f}B)",
432
  f"{total_mem_required / (1024**3):.1f} GB",
433
  num_gpus,
434
  f"{ttft * 1000:.0f} ms",
 
440
  )
441
 
442
 
443
+ def create_mem_chart_per_gpu(
444
+ weights, rag, dynamic, overhead, single_gpu_cap, num_gpus
445
+ ):
446
  # Normalize to Per-GPU view
447
  w_per = (weights / num_gpus) / (1024**3)
448
+ r_per = (rag / num_gpus) / (1024**3)
449
+ d_per = (dynamic / num_gpus) / (1024**3)
450
  o_per = (overhead / num_gpus) / (1024**3)
451
  cap_gb = single_gpu_cap / (1024**3)
452
 
453
+ used = w_per + r_per + d_per + o_per
454
  free = max(0, cap_gb - used)
455
 
456
  # Modern, accessible color palette (WCAG AA compliant)
457
+ labels = ["Weights", "RAG Models", "Dynamic (KV+Act)", "Overhead", "Free (Per GPU)"]
458
+ values = [w_per, r_per, d_per, o_per, free]
459
+
460
+ # Filter out zero values for cleaner chart
461
+ clean_labels = []
462
+ clean_values = []
463
+ colors_full = ["#4A90E2", "#10b981", "#8b5cf6", "#f59e0b", "#BDC3C7"]
464
+ clean_colors = []
465
 
466
+ for i, val in enumerate(values):
467
+ if val > 0.05: # Only show if > 50MB
468
+ clean_labels.append(labels[i])
469
+ clean_values.append(val)
470
+ clean_colors.append(colors_full[i])
471
+
472
+ # Professional color palette: Blue, Green, Purple, Orange, Gray
473
+ colors = clean_colors if clean_colors else colors_full[: len(clean_values)]
474
 
475
  # Calculate percentages for hover text
476
+ total = sum(clean_values) if clean_values else sum(values)
477
+ percentages = [
478
+ (v / total * 100) if total > 0 else 0
479
+ for v in (clean_values if clean_values else values)
480
+ ]
481
 
482
  # Create hover text with detailed information
483
+ display_labels = clean_labels if clean_labels else labels
484
+ display_values = clean_values if clean_values else values
485
  hover_texts = [
486
+ f"{display_labels[i]}<br>"
487
+ f"Value: {display_values[i]:.1f} GB<br>"
488
  f"Percentage: {percentages[i]:.1f}%<br>"
489
  f"Capacity: {cap_gb:.0f} GB"
490
+ for i in range(len(display_labels))
491
  ]
492
 
493
  # Create donut chart using plotly
494
  fig = go.Figure(
495
  data=[
496
  go.Pie(
497
+ labels=display_labels,
498
+ values=display_values,
499
  hole=0.5, # Creates the donut (hole in the middle)
500
  marker=dict(colors=colors, line=dict(color="#FFFFFF", width=2)),
501
  textinfo="label+percent",
 
514
  "xanchor": "center",
515
  "font": {"size": 16, "family": "Arial, sans-serif"},
516
  },
517
+ showlegend=False,
 
518
  font=dict(family="Arial, sans-serif", size=12),
519
  margin=dict(l=20, r=20, t=50, b=20),
520
  height=500,
 
619
  info="Maximum number of tokens to generate per request",
620
  )
621
 
622
+ with gr.Group():
623
+ gr.Markdown("#### Retrieval Augmented Generation (RAG)")
624
+ rag_chk = gr.Checkbox(
625
+ label="Enable RAG Pipeline", value=False
626
+ )
627
+ with gr.Row():
628
+ rag_model_dd = gr.Dropdown(
629
+ choices=list(EMBEDDING_MODELS.keys()),
630
+ value="Standard (MPNet-Base/BGE-Base) ~0.6GB",
631
+ label="Embedding Model",
632
+ interactive=True,
633
+ )
634
+ rerank_model_dd = gr.Dropdown(
635
+ choices=list(RERANKER_MODELS.keys()),
636
+ value="None (Skip Reranking)",
637
+ label="Reranker Model",
638
+ interactive=True,
639
+ )
640
+
641
  gr.Markdown("## Infrastructure Configuration")
642
  gpu_keys = list(HARDWARE_DB.keys())
643
  default_gpu = gpu_keys[0] if gpu_keys else "NVIDIA H100-80GB SXM5"
 
660
  label="Quantization Precision",
661
  info="Model weight precision: FP16/BF16 (standard), INT8 (8-bit), FP4 (4-bit, requires Blackwell)",
662
  )
663
+ overhead_slider = gr.Slider(
664
+ 0,
665
+ 50,
666
+ value=20,
667
+ step=5,
668
+ label="GPU Memory Overhead %",
669
+ info="Additional memory overhead percentage for CUDA context, fragmentation, and system buffers",
670
+ )
671
 
672
  btn = gr.Button("Calculate Sizing", variant="primary", size="lg")
673
 
 
728
  ctx_in,
729
  ctx_out,
730
  quant_select,
731
+ overhead_slider,
732
+ rag_chk,
733
+ rag_model_dd,
734
+ rerank_model_dd,
735
  ],
736
  outputs=[
737
  res_params,
hardware_data.yaml CHANGED
@@ -1,13 +1,33 @@
1
  gpus:
2
- - name: "NVIDIA A100-80GB SXM"
3
- memory_gb: 80
4
- bandwidth_gb_s: 2039
5
- fp16_tflops_dense: 312
6
- interconnect_bw_gb_s: 600
7
  pcie_bw_gb_s: 64
8
  fp4_supported: false
9
- recommended_server: "Lenovo ThinkSystem SR670 V2 / SR675 V3"
10
- cost_tier: "High"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
11
 
12
  - name: "NVIDIA A100-80GB PCIe"
13
  memory_gb: 80
@@ -17,7 +37,17 @@ gpus:
17
  pcie_bw_gb_s: 64
18
  fp4_supported: false
19
  recommended_server: "Lenovo ThinkSystem SR650 V3 / SR670 V2"
20
- cost_tier: "Medium-High"
 
 
 
 
 
 
 
 
 
 
21
 
22
  - name: "NVIDIA H100-80GB SXM5"
23
  memory_gb: 80
@@ -27,7 +57,7 @@ gpus:
27
  pcie_bw_gb_s: 128
28
  fp4_supported: true
29
  recommended_server: "Lenovo ThinkSystem SR675 V3 / SR680a V3"
30
- cost_tier: "Premium"
31
 
32
  - name: "NVIDIA H100 NVL (PCIe Pair)"
33
  memory_gb: 94
@@ -37,7 +67,7 @@ gpus:
37
  pcie_bw_gb_s: 128
38
  fp4_supported: true
39
  recommended_server: "Lenovo ThinkSystem SR675 V3"
40
- cost_tier: "Premium"
41
 
42
  - name: "NVIDIA H200-141GB SXM"
43
  memory_gb: 141
@@ -47,17 +77,7 @@ gpus:
47
  pcie_bw_gb_s: 128
48
  fp4_supported: true
49
  recommended_server: "Lenovo ThinkSystem SR675 V3 / SR680a V3"
50
- cost_tier: "Premium+"
51
-
52
- - name: "NVIDIA RTX 6000 Ada"
53
- memory_gb: 48
54
- bandwidth_gb_s: 960
55
- fp16_tflops_dense: 91
56
- interconnect_bw_gb_s: 0
57
- pcie_bw_gb_s: 64
58
- fp4_supported: false
59
- recommended_server: "Lenovo ThinkStation PX / ThinkSystem SR650 V3"
60
- cost_tier: "Entry-Ent"
61
 
62
  - name: "NVIDIA B200 (Blackwell)"
63
  memory_gb: 192
@@ -77,4 +97,4 @@ gpus:
77
  pcie_bw_gb_s: 256
78
  fp4_supported: true
79
  recommended_server: "Lenovo ThinkSystem SR780a V3 (Liquid Cooled)"
80
- cost_tier: "Hyperscale"
 
1
  gpus:
2
+ - name: "NVIDIA L4-24GB"
3
+ memory_gb: 24
4
+ bandwidth_gb_s: 300
5
+ fp16_tflops_dense: 30
6
+ interconnect_bw_gb_s: 0
7
  pcie_bw_gb_s: 64
8
  fp4_supported: false
9
+ recommended_server: "Lenovo ThinkSystem SR650 V3 / ThinkEdge SE350"
10
+ cost_tier: "Entry"
11
+
12
+ - name: "NVIDIA RTX 6000 Ada"
13
+ memory_gb: 48
14
+ bandwidth_gb_s: 960
15
+ fp16_tflops_dense: 91
16
+ interconnect_bw_gb_s: 0
17
+ pcie_bw_gb_s: 64
18
+ fp4_supported: false
19
+ recommended_server: "Lenovo ThinkStation PX / ThinkSystem SR650 V3"
20
+ cost_tier: "Mid-Range"
21
+
22
+ - name: "NVIDIA L40S-48GB"
23
+ memory_gb: 48
24
+ bandwidth_gb_s: 864
25
+ fp16_tflops_dense: 362
26
+ interconnect_bw_gb_s: 0
27
+ pcie_bw_gb_s: 64
28
+ fp4_supported: true
29
+ recommended_server: "Lenovo ThinkSystem SR675 V3 / SR650 V3"
30
+ cost_tier: "Mid-Range"
31
 
32
  - name: "NVIDIA A100-80GB PCIe"
33
  memory_gb: 80
 
37
  pcie_bw_gb_s: 64
38
  fp4_supported: false
39
  recommended_server: "Lenovo ThinkSystem SR650 V3 / SR670 V2"
40
+ cost_tier: "Mid-Range"
41
+
42
+ - name: "NVIDIA A100-80GB SXM"
43
+ memory_gb: 80
44
+ bandwidth_gb_s: 2039
45
+ fp16_tflops_dense: 312
46
+ interconnect_bw_gb_s: 600
47
+ pcie_bw_gb_s: 64
48
+ fp4_supported: false
49
+ recommended_server: "Lenovo ThinkSystem SR670 V2 / SR675 V3"
50
+ cost_tier: "High-Performance"
51
 
52
  - name: "NVIDIA H100-80GB SXM5"
53
  memory_gb: 80
 
57
  pcie_bw_gb_s: 128
58
  fp4_supported: true
59
  recommended_server: "Lenovo ThinkSystem SR675 V3 / SR680a V3"
60
+ cost_tier: "High-Performance"
61
 
62
  - name: "NVIDIA H100 NVL (PCIe Pair)"
63
  memory_gb: 94
 
67
  pcie_bw_gb_s: 128
68
  fp4_supported: true
69
  recommended_server: "Lenovo ThinkSystem SR675 V3"
70
+ cost_tier: "High-Performance"
71
 
72
  - name: "NVIDIA H200-141GB SXM"
73
  memory_gb: 141
 
77
  pcie_bw_gb_s: 128
78
  fp4_supported: true
79
  recommended_server: "Lenovo ThinkSystem SR675 V3 / SR680a V3"
80
+ cost_tier: "High-Performance"
 
 
 
 
 
 
 
 
 
 
81
 
82
  - name: "NVIDIA B200 (Blackwell)"
83
  memory_gb: 192
 
97
  pcie_bw_gb_s: 256
98
  fp4_supported: true
99
  recommended_server: "Lenovo ThinkSystem SR780a V3 (Liquid Cooled)"
100
+ cost_tier: "Next-Gen"
models.yaml CHANGED
@@ -69,3 +69,55 @@ models:
69
  intermediate_size: 2880
70
  num_local_experts: 128
71
  num_experts_per_tok: 4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
69
  intermediate_size: 2880
70
  num_local_experts: 128
71
  num_experts_per_tok: 4
72
+
73
+ "Qwen/Qwen3-VL-235B-A22B-Thinking":
74
+ text_config:
75
+ hidden_size: 8192
76
+ num_hidden_layers: 96
77
+ num_attention_heads: 64
78
+ num_key_value_heads: 8
79
+ vocab_size: 151936
80
+ max_position_embeddings: 262144
81
+ intermediate_size: 24576
82
+ torch_dtype: bfloat16
83
+ notes:
84
+ moe:
85
+ num_local_experts: 512
86
+ num_experts_per_tok: 10
87
+
88
+ "Qwen/Qwen3-VL-235B-A22B-Instruct":
89
+ text_config:
90
+ hidden_size: 8192
91
+ num_hidden_layers: 96
92
+ num_attention_heads: 64
93
+ num_key_value_heads: 8
94
+ vocab_size: 151936
95
+ max_position_embeddings: 262144
96
+ intermediate_size: 24576
97
+ torch_dtype: bfloat16
98
+ notes:
99
+ moe:
100
+ num_local_experts: 512
101
+ num_experts_per_tok: 10
102
+
103
+ "Qwen/Qwen3-VL-30B-A3B-Thinking":
104
+ text_config:
105
+ hidden_size: 6144
106
+ num_hidden_layers: 80
107
+ num_attention_heads: 48
108
+ num_key_value_heads: 8
109
+ vocab_size: 151936
110
+ max_position_embeddings: 262144
111
+ intermediate_size: 16384
112
+ torch_dtype: bfloat16
113
+
114
+ "Qwen/Qwen3-VL-30B-A3B-Instruct":
115
+ text_config:
116
+ hidden_size: 6144
117
+ num_hidden_layers: 80
118
+ num_attention_heads: 48
119
+ num_key_value_heads: 8
120
+ vocab_size: 151936
121
+ max_position_embeddings: 262144
122
+ intermediate_size: 16384
123
+ torch_dtype: bfloat16