# GPUguesstimator - Hugging Face Space app.
# (Removed non-Python page-scrape residue: "Spaces:" / "Sleeping" status banner.)
| import gradio as gr | |
| import yaml | |
| import math | |
| import matplotlib.pyplot as plt | |
| import plotly.graph_objects as go | |
| import os | |
| import json | |
| from huggingface_hub import hf_hub_download, HfApi | |
# --- Configuration & Constants ---
# YAML files bundled alongside the app; the loaders below tolerate their absence.
HARDWARE_FILE = "hardware_data.yaml"
MODELS_FILE = "models.yaml"
# Physics Constants: fraction of peak hardware throughput realistically
# achieved during serving (tensor compute, HBM bandwidth, GPU interconnect).
COMPUTE_EFFICIENCY = 0.45
MEMORY_EFFICIENCY = 0.70
INTERCONNECT_EFFICIENCY = 0.65
# Defaults
ACTIVATION_MEMORY_BUFFER_GB = 0.5  # per-user activation scratch buffer
DEFAULT_GPU_OVERHEAD_PCT = 20  # CUDA context / fragmentation headroom (%)
# Embedding Models VRAM Est. (Weights + Runtime Buffer)
# Keys double as Dropdown labels in the UI; values are GB of VRAM reserved.
EMBEDDING_MODELS = {
    "External/API (No Local VRAM)": 0.0,
    "Mini (All-MiniLM-L6) ~0.2GB": 0.2,
    "Standard (MPNet-Base/BGE-Base) ~0.6GB": 0.6,
    "Large (BGE-M3/GTE-Large) ~2.5GB": 2.5,
    "LLM-Based (E5-Mistral-7B) ~16GB": 16.0,
}
# Reranker Models VRAM Est. (Weights + Batch Processing Buffer)
RERANKER_MODELS = {
    "None (Skip Reranking)": 0.0,
    "Small (BGE-Reranker-Base) ~0.5GB": 0.5,
    "Large (BGE-Reranker-Large) ~1.5GB": 1.5,
    "LLM-Based (BGE-Reranker-v2-Gemma) ~10GB": 10.0,
}
| # --- Data Loading --- | |
def load_hardware_data():
    """Return a mapping of GPU name -> spec dict from HARDWARE_FILE.

    Returns {} when the file is absent, empty, or lacks a ``gpus`` list.
    (``yaml.safe_load`` returns ``None`` for an empty document, so guard with
    ``or {}`` and ``.get`` -- consistent with ``load_models_data``.)
    """
    if not os.path.exists(HARDWARE_FILE):
        return {}
    with open(HARDWARE_FILE, "r") as f:
        data = yaml.safe_load(f) or {}
    return {gpu["name"]: gpu for gpu in data.get("gpus", [])}
def load_models_data():
    """Return the model-preset mapping from MODELS_FILE ({} when unavailable).

    An empty YAML document (``safe_load`` -> ``None``) also yields {}.
    """
    if os.path.exists(MODELS_FILE):
        with open(MODELS_FILE, "r") as fh:
            parsed = yaml.safe_load(fh) or {}
        return parsed.get("models", {})
    return {}
# Load both databases once at import time; each loader returns {} when its
# backing YAML file is missing, so the UI can still start (with empty choices).
HARDWARE_DB = load_hardware_data()
MODELS_DB = load_models_data()
| # --- Model Analysis --- | |
| class ModelAnalyzer: | |
| def __init__(self, repo_id, hf_token=None): | |
| self.repo_id = repo_id | |
| self.config = {} | |
| self.error = None | |
| self.api = HfApi(token=hf_token.strip() if hf_token else None) | |
| # 1. Try to get Model Info (Total Params) from API first | |
| self.total_params_safetensors = None | |
| try: | |
| model_info = self.api.model_info(repo_id) | |
| if hasattr(model_info, "safetensors") and model_info.safetensors and "total" in model_info.safetensors: | |
| self.total_params_safetensors = model_info.safetensors["total"] | |
| except Exception: | |
| pass # Fallback to config parsing | |
| # 2. Load Config | |
| if repo_id in MODELS_DB: | |
| self.config = MODELS_DB[repo_id] | |
| else: | |
| try: | |
| token = hf_token.strip() if hf_token else None | |
| config_path = hf_hub_download( | |
| repo_id=repo_id, filename="config.json", token=token | |
| ) | |
| with open(config_path, "r") as f: | |
| self.config = json.load(f) | |
| except Exception as e: | |
| self.error = f"Failed to fetch model: {str(e)}" | |
| return | |
| try: | |
| # Handle nested configs (common in multimodal) | |
| if "text_config" in self.config: | |
| self.llm_config = self.config["text_config"] | |
| elif "llm_config" in self.config: | |
| self.llm_config = self.config["llm_config"] | |
| else: | |
| self.llm_config = self.config | |
| self.hidden_size = self.llm_config.get("hidden_size", 4096) | |
| self.num_layers = self.llm_config.get("num_hidden_layers", 32) | |
| self.num_heads = self.llm_config.get("num_attention_heads", 32) | |
| self.num_kv_heads = self.llm_config.get("num_key_value_heads", self.num_heads) | |
| self.vocab_size = self.llm_config.get("vocab_size", 32000) | |
| self.max_context = self.llm_config.get("max_position_embeddings", 4096) | |
| self.intermediate_size = self.llm_config.get( | |
| "intermediate_size", self.hidden_size * 4 | |
| ) | |
| # MoE detection | |
| self.is_moe = False | |
| self.num_experts = 1 | |
| self.active_experts = 1 | |
| # Check for MoE config patterns | |
| self._detect_moe() | |
| # Calculate Parameters | |
| self.calculate_params() | |
| except Exception as e: | |
| self.error = f"Error parsing config: {str(e)}" | |
| def _detect_moe(self): | |
| archs = self.config.get("architectures", []) | |
| keys = set(self.config.keys()) | set(self.llm_config.keys()) | |
| if ( | |
| any("moe" in a.lower() for a in archs) | |
| or any("moe" in k.lower() for k in keys) | |
| or any("expert" in k.lower() for k in keys) | |
| ): | |
| self.is_moe = True | |
| if self.is_moe: | |
| self.num_experts = ( | |
| self.llm_config.get("num_local_experts") | |
| or self.llm_config.get("num_experts") | |
| or self.llm_config.get("n_routed_experts") | |
| or 8 | |
| ) | |
| self.active_experts = ( | |
| self.llm_config.get("num_experts_per_tok") | |
| or self.llm_config.get("num_experts_per_token") | |
| or 2 | |
| ) | |
| elif "notes" in self.config and "moe" in self.config["notes"]: | |
| moe_cfg = self.config["notes"]["moe"] | |
| self.is_moe = True | |
| self.num_experts = moe_cfg.get("num_local_experts", 8) | |
| self.active_experts = moe_cfg.get("num_experts_per_tok", 2) | |
| def calculate_params(self): | |
| # If we got exact params from safetensors, use that | |
| if self.total_params_safetensors: | |
| self.total_params = self.total_params_safetensors | |
| else: | |
| # Fallback calculation | |
| self.params_embed = self.vocab_size * self.hidden_size | |
| head_dim = self.hidden_size // self.num_heads | |
| kv_dim = head_dim * self.num_kv_heads | |
| self.params_attn = ( | |
| (self.hidden_size * self.hidden_size) | |
| + (self.hidden_size * kv_dim) * 2 | |
| + (self.hidden_size * self.hidden_size) | |
| ) | |
| dense_mlp = 3 * self.hidden_size * self.intermediate_size | |
| if self.is_moe: | |
| mlp_total = dense_mlp * self.num_experts | |
| else: | |
| mlp_total = dense_mlp | |
| self.params_norm = 2 * self.hidden_size | |
| self.params_layer_total = ( | |
| self.params_attn + mlp_total + self.params_norm | |
| ) | |
| self.total_params = self.params_embed + ( | |
| self.num_layers * self.params_layer_total | |
| ) | |
| # Active Params Calculation (using improved heuristic for MoE) | |
| if self.is_moe: | |
| expert_param_fraction = 0.8 # 80% of params are in experts | |
| always_active = self.total_params * (1 - expert_param_fraction) | |
| expert_params = self.total_params * expert_param_fraction | |
| expert_ratio = self.active_experts / self.num_experts | |
| self.active_params = int( | |
| always_active + (expert_params * expert_ratio) | |
| ) | |
| else: | |
| self.active_params = self.total_params | |
| # --- Calculation Engine --- | |
def calculate_dimensioning(
    model_name_or_repo,
    hf_token,
    gpu_name,
    connectivity_type,
    concurrent_users,
    context_in,
    context_out,
    quantization,
    gpu_overhead_pct,
    rag_enabled,
    rag_model_key,
    reranker_model_key,
):
    """Estimate GPU count, VRAM, and latency for an LLM serving workload.

    Returns the 9-tuple bound to the UI outputs: (params text, total VRAM
    text, num GPUs, TTFT text, ITL text, server name, warnings text, memory
    chart figure, text-alternative memory breakdown). Any validation failure
    returns ``error_result(msg)``, which has the same shape.
    """
    # 1. Model analysis & hardware lookup
    analyzer = ModelAnalyzer(model_name_or_repo, hf_token)
    if analyzer.error:
        return error_result(analyzer.error)
    if gpu_name not in HARDWARE_DB:
        return error_result(f"GPU '{gpu_name}' not found in database.")
    gpu_spec = HARDWARE_DB[gpu_name]
    # 2. Interconnect logic.
    # NOTE(review): the original also computed an effective interconnect
    # bandwidth here, but it was never read anywhere -- the latency model
    # below uses penalty factors instead. Only the NVLink-availability
    # decision is behavior-bearing, so just that is kept.
    nvlink_bw = gpu_spec.get("interconnect_bw_gb_s", 0)
    gpu_has_nvlink = nvlink_bw > 0
    if connectivity_type == "NVLink":
        if not gpu_has_nvlink:
            return error_result(f"Error: {gpu_name} does not support NVLink.")
        using_nvlink = True
    elif connectivity_type == "PCIe / Standard":
        using_nvlink = False
    else:  # Auto: use NVLink whenever the GPU offers it
        using_nvlink = gpu_has_nvlink
    # --- Precision ---
    fp4_supported = gpu_spec.get("fp4_supported", False)
    if quantization == "FP16/BF16":
        bytes_per_param = 2
    elif quantization == "INT8":
        bytes_per_param = 1
    elif quantization == "FP4":
        if not fp4_supported:
            return error_result(f"Error: {gpu_name} does not support FP4.")
        bytes_per_param = 0.5
    else:
        # Unknown selection: assume half precision.
        bytes_per_param = 2
    # --- MEMORY CALCULATION ---
    # Static footprint: model weights (+ optional RAG models).
    mem_weights = analyzer.total_params * bytes_per_param
    mem_rag = 0
    if rag_enabled:
        embed_gb = EMBEDDING_MODELS.get(rag_model_key, 0.6)
        rerank_gb = RERANKER_MODELS.get(reranker_model_key, 0.5)
        mem_rag = (embed_gb + rerank_gb) * (1024**3)
    static_footprint = mem_weights + mem_rag
    # Dynamic footprint: KV cache + activation buffer, per concurrent user.
    head_dim = analyzer.hidden_size // analyzer.num_heads
    total_tokens = context_in + context_out
    # KV cache stays in FP16 (2 bytes) regardless of weight quantization.
    kv_bytes = 2
    mem_kv_per_user = (
        2  # K and V
        * analyzer.num_layers
        * analyzer.num_kv_heads
        * head_dim
        * total_tokens
        * kv_bytes
    )
    mem_act_per_user = ACTIVATION_MEMORY_BUFFER_GB * 1024**3
    dynamic_per_user = mem_kv_per_user + mem_act_per_user
    total_dynamic = dynamic_per_user * concurrent_users
    # Total with configurable overhead; GPU count rounds up to fit.
    raw_total_mem = static_footprint + total_dynamic
    total_mem_required = raw_total_mem * (1 + gpu_overhead_pct / 100)
    gpu_mem_capacity = gpu_spec["memory_gb"] * (1024**3)
    num_gpus = math.ceil(total_mem_required / gpu_mem_capacity)
    # --- LATENCY CALCULATION ---
    compute_mode = "fp16_tflops_dense"
    single_gpu_flops = (
        gpu_spec.get(compute_mode, 100) * 1e12 * COMPUTE_EFFICIENCY
    )
    if quantization == "FP4":
        # FP4 tensor cores roughly 2.5x the dense FP16 rate.
        single_gpu_flops *= 2.5
    single_gpu_bw = (
        gpu_spec.get("bandwidth_gb_s", 1000) * 1e9 * MEMORY_EFFICIENCY
    )
    if num_gpus == 1:
        effective_flops = single_gpu_flops
        effective_mem_bw = single_gpu_bw
        ttft_penalty = 2.0
        itl_penalty = 1.0
    elif using_nvlink:
        # NVLink scales compute and bandwidth near-linearly.
        effective_flops = single_gpu_flops * num_gpus
        effective_mem_bw = single_gpu_bw * num_gpus
        ttft_penalty = 2.0
        itl_penalty = 1.0
    else:
        # PCIe bottleneck: bandwidth capped at a single card, and communication
        # penalties grow with GPU count.
        effective_flops = single_gpu_flops * num_gpus
        effective_mem_bw = single_gpu_bw  # Capped at single card
        n = num_gpus
        ttft_penalty = 1.2 * n * n - n
        itl_penalty = n
    # TTFT (Prefill) + RAG latency
    # 1. RAG pre-processing: embed the query, then optionally rerank documents.
    #    Latencies are fixed heuristics keyed off the dropdown label text.
    t_rag_processing = 0
    if rag_enabled:
        if "Mini" in rag_model_key:
            t_rag_processing += 0.02
        elif "Large" in rag_model_key:
            t_rag_processing += 0.05
        elif "LLM" in rag_model_key:
            t_rag_processing += 0.15
        else:
            t_rag_processing += 0.03
        if "None" not in reranker_model_key:
            if "Small" in reranker_model_key:
                t_rag_processing += 0.15  # 150ms
            elif "Large" in reranker_model_key:
                t_rag_processing += 0.35  # 350ms
            elif "LLM" in reranker_model_key:
                t_rag_processing += 0.80  # 800ms
    # 2. LLM prefill: roofline max of compute time and weight-read time.
    prefill_ops = 2 * analyzer.active_params * context_in * concurrent_users
    t_compute_prefill = (prefill_ops / effective_flops) * ttft_penalty
    t_mem_prefill = mem_weights / effective_mem_bw
    ttft = max(t_compute_prefill, t_mem_prefill) + t_rag_processing
    # ITL (decode): per-token roofline; each step re-reads weights + one
    # user's dynamic state.
    gen_ops = 2 * analyzer.active_params * concurrent_users
    t_compute_gen = (gen_ops / effective_flops) * itl_penalty
    bytes_per_step = mem_weights + (total_dynamic / concurrent_users)
    t_mem_gen = (bytes_per_step / effective_mem_bw) * itl_penalty
    itl = max(t_compute_gen, t_mem_gen)
    # --- Result Formatting ---
    server_name = gpu_spec.get("recommended_server", "Contact Lenovo Support")
    if num_gpus > 8:
        server_name += " (Requires Multi-Node Clustering)"
    warnings = []
    if not using_nvlink and num_gpus > 1:
        warnings.append(
            f"⚠️ No NVLink: Effective Bandwidth capped at {gpu_spec['bandwidth_gb_s']} GB/s. High latency penalty."
        )
    if itl > 0.150:
        warnings.append(
            f"⚠️ High Latency: ITL is {itl * 1000:.0f}ms (>150ms)."
        )
    if t_rag_processing > 0.5:
        warnings.append(
            f"⚠️ High RAG Latency: Reranking is adding {t_rag_processing * 1000:.0f}ms to TTFT."
        )
    if analyzer.is_moe:
        warnings.append(
            f"ℹ️ MoE Model: Active params {analyzer.active_params / 1e9:.1f}B used for compute."
        )
    if rag_enabled:
        warnings.append(
            f"ℹ️ RAG Enabled: Allocating {mem_rag / (1024**3):.1f}GB for Models (Embed+Rerank)."
        )
    # Chart (Per GPU)
    overhead_bytes = raw_total_mem * (gpu_overhead_pct / 100)
    fig = create_mem_chart_per_gpu(
        mem_weights,
        mem_rag,
        total_dynamic,
        overhead_bytes,
        gpu_mem_capacity,
        num_gpus,
    )
    # Textual memory breakdown for accessibility (WCAG 1.1.1 - Text Alternatives)
    w_per_gb = (mem_weights / num_gpus) / (1024**3)
    r_per_gb = (mem_rag / num_gpus) / (1024**3)
    d_per_gb = (total_dynamic / num_gpus) / (1024**3)
    o_per_gb = (overhead_bytes / num_gpus) / (1024**3)
    cap_gb = gpu_mem_capacity / (1024**3)
    used_gb = w_per_gb + r_per_gb + d_per_gb + o_per_gb
    free_gb = max(0, cap_gb - used_gb)
    total_used_pct = (used_gb / cap_gb * 100) if cap_gb > 0 else 0
    # Calculate percentages for display
    w_pct = (w_per_gb / cap_gb * 100) if cap_gb > 0 else 0
    r_pct = (r_per_gb / cap_gb * 100) if cap_gb > 0 else 0
    d_pct = (d_per_gb / cap_gb * 100) if cap_gb > 0 else 0
    o_pct = (o_per_gb / cap_gb * 100) if cap_gb > 0 else 0
    free_pct = (free_gb / cap_gb * 100) if cap_gb > 0 else 0
    mem_text_alt = (
        f"Per-GPU Memory Breakdown (Total Capacity: {cap_gb:.0f} GB):\n"
        f"• Weights: {w_per_gb:.1f} GB ({w_pct:.1f}%) - Model parameters stored in memory. Fixed size based on model architecture and quantization.\n"
        f"• RAG Models: {r_per_gb:.1f} GB ({r_pct:.1f}%) - Embedding and reranker models. Only allocated if RAG is enabled.\n"
        f"• Dynamic (KV+Act): {d_per_gb:.1f} GB ({d_pct:.1f}%) - KV cache and activation buffers. Grows with concurrent users, input context length, and output tokens.\n"
        f"• Overhead: {o_per_gb:.1f} GB ({o_pct:.1f}%) - CUDA context, memory fragmentation, and system buffers. Configurable percentage of total memory.\n"
        f"• Free: {free_gb:.1f} GB ({free_pct:.1f}%) - Available memory headroom for additional operations."
    )
    return (
        f"{analyzer.total_params / 1e9:.1f}B (Active: {analyzer.active_params / 1e9:.1f}B)",
        f"{total_mem_required / (1024**3):.1f} GB",
        num_gpus,
        f"{ttft * 1000:.0f} ms",
        f"{itl * 1000:.0f} ms",
        server_name,
        "\n".join(warnings) if warnings else "No warnings.",
        fig,
        mem_text_alt,
    )
def create_mem_chart_per_gpu(
    weights, rag, dynamic, overhead, single_gpu_cap, num_gpus
):
    """Build a per-GPU donut chart of memory usage.

    Args are bytes (``weights``/``rag``/``dynamic``/``overhead`` are cluster
    totals, divided evenly across ``num_gpus``; ``single_gpu_cap`` is one
    card's capacity). Returns a plotly Figure.
    """
    # Normalize to a per-GPU view, in GB.
    w_per = (weights / num_gpus) / (1024**3)
    r_per = (rag / num_gpus) / (1024**3)
    d_per = (dynamic / num_gpus) / (1024**3)
    o_per = (overhead / num_gpus) / (1024**3)
    cap_gb = single_gpu_cap / (1024**3)
    used = w_per + r_per + d_per + o_per
    free = max(0, cap_gb - used)
    labels = ["Weights", "RAG Models", "Dynamic (KV+Act)", "Overhead", "Free (Per GPU)"]
    values = [w_per, r_per, d_per, o_per, free]
    # Professional, WCAG-AA-friendly palette: Blue, Green, Purple, Orange, Gray.
    colors_full = ["#4A90E2", "#10b981", "#8b5cf6", "#f59e0b", "#BDC3C7"]
    # Filter out negligible segments (< 50MB) for a cleaner chart, keeping
    # labels, values, and colors aligned index-for-index.
    clean_labels = []
    clean_values = []
    clean_colors = []
    for i, val in enumerate(values):
        if val > 0.05:  # Only show if > 50MB
            clean_labels.append(labels[i])
            clean_values.append(val)
            clean_colors.append(colors_full[i])
    # Fallback: if everything filtered out we display the full label/value
    # lists, so use the full palette too. (The original sliced to
    # ``colors_full[: len(clean_values)]`` -> an empty list, leaving the
    # displayed 5 segments with default colors.)
    colors = clean_colors if clean_colors else colors_full
    # Percentages for hover text, aligned with whichever lists are displayed.
    total = sum(clean_values) if clean_values else sum(values)
    percentages = [
        (v / total * 100) if total > 0 else 0
        for v in (clean_values if clean_values else values)
    ]
    display_labels = clean_labels if clean_labels else labels
    display_values = clean_values if clean_values else values
    hover_texts = [
        f"{display_labels[i]}<br>"
        f"Value: {display_values[i]:.1f} GB<br>"
        f"Percentage: {percentages[i]:.1f}%<br>"
        f"Capacity: {cap_gb:.0f} GB"
        for i in range(len(display_labels))
    ]
    # Donut chart (pie with a center hole).
    fig = go.Figure(
        data=[
            go.Pie(
                labels=display_labels,
                values=display_values,
                hole=0.5,
                marker=dict(colors=colors, line=dict(color="#FFFFFF", width=2)),
                textinfo="label+percent",
                textposition="outside",
                hovertemplate="%{hovertext}<extra></extra>",
                hovertext=hover_texts,
            )
        ]
    )
    fig.update_layout(
        title={
            "text": f"Per-GPU Memory Usage (Capacity: {cap_gb:.0f} GB)",
            "x": 0.5,
            "xanchor": "center",
            "font": {"size": 16, "family": "Arial, sans-serif"},
        },
        showlegend=False,
        font=dict(family="Arial, sans-serif", size=12),
        margin=dict(l=20, r=20, t=50, b=20),
        height=500,
    )
    return fig
def error_result(msg):
    """Return the standard 9-tuple of UI outputs representing a failed run.

    Shape matches calculate_dimensioning's success return, with placeholder
    values and an empty annotated chart, so Gradio outputs stay consistent.
    """
    placeholder = go.Figure()
    placeholder.add_annotation(
        text="Error: Unable to generate chart",
        xref="paper",
        yref="paper",
        x=0.5,
        y=0.5,
        showarrow=False,
        font=dict(size=14),
    )
    placeholder.update_layout(
        title="Memory Breakdown", height=500, showlegend=False
    )
    return (
        "Error",
        "Error",
        0,
        "-",
        "-",
        "Check Inputs",
        f"Error: {msg}",
        placeholder,
        "Memory breakdown not available due to calculation error.",
    )
# --- UI Setup ---
# Custom CSS for better font rendering: force a native system font stack and
# font smoothing across all elements (intended for gr.Blocks(css=...)).
custom_css = """
* {
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', sans-serif !important;
    -webkit-font-smoothing: antialiased;
    -moz-osx-font-smoothing: grayscale;
}
"""
# Build the Gradio UI. Theme and CSS belong on the gr.Blocks() constructor
# (Blocks.launch() does not accept them), so they are wired here.
with gr.Blocks(
    title="GPUguesstimator", theme=gr.themes.Soft(), css=custom_css
) as demo:
    gr.Markdown(
        """
        # GPUguesstimator
        Physics-based sizing tool for calculating VRAM requirements, compute capacity, and interconnect bottlenecks for Large Language Model inference.
        """
    )
    with gr.Row():
        # Left column: workload + infrastructure inputs.
        with gr.Column():
            gr.Markdown("## Workload Configuration")
            model_keys = list(MODELS_DB.keys())
            model_dd = gr.Dropdown(
                choices=model_keys + ["Custom"],
                value=model_keys[0] if model_keys else "Custom",
                label="Model Preset",
                info="Select a preset model or choose Custom to enter a HuggingFace repository ID",
            )
            repo_input = gr.Textbox(
                label="HuggingFace Repository ID",
                value=model_keys[0] if model_keys else "",
                placeholder="e.g., meta-llama/Meta-Llama-3-70B-Instruct",
                info="Enter the HuggingFace model repository identifier",
            )
            hf_token = gr.Textbox(
                label="HuggingFace Token (Optional)",
                type="password",
                info="Required for accessing gated models. Leave empty for public models.",
            )
            users = gr.Slider(
                1,
                500,
                value=50,
                step=1,
                label="Concurrent Users",
                info="Number of simultaneous inference requests to handle",
            )
            ctx_in = gr.Slider(
                128,
                128000,
                value=1024,
                step=128,
                label="Input Context Length (Tokens)",
                info="Maximum number of input tokens per request",
            )
            ctx_out = gr.Slider(
                128,
                16384,
                value=256,
                step=128,
                label="Output Tokens (Generation Length)",
                info="Maximum number of tokens to generate per request",
            )
            with gr.Group():
                gr.Markdown("#### Retrieval Augmented Generation (RAG)")
                rag_chk = gr.Checkbox(
                    label="Enable RAG Pipeline", value=False
                )
                with gr.Row():
                    rag_model_dd = gr.Dropdown(
                        choices=list(EMBEDDING_MODELS.keys()),
                        value="Standard (MPNet-Base/BGE-Base) ~0.6GB",
                        label="Embedding Model",
                        interactive=True,
                    )
                    rerank_model_dd = gr.Dropdown(
                        choices=list(RERANKER_MODELS.keys()),
                        value="None (Skip Reranking)",
                        label="Reranker Model",
                        interactive=True,
                    )
            gr.Markdown("## Infrastructure Configuration")
            gpu_keys = list(HARDWARE_DB.keys())
            default_gpu = gpu_keys[0] if gpu_keys else "NVIDIA H100-80GB SXM5"
            gpu_select = gr.Dropdown(
                choices=gpu_keys,
                value=default_gpu,
                label="GPU Model",
                info="Select the GPU model for inference",
            )
            conn_select = gr.Dropdown(
                choices=["Auto", "NVLink", "PCIe / Standard"],
                value="Auto",
                label="Interconnect Type",
                info="Auto uses GPU default, NVLink for high-bandwidth, PCIe for standard connections",
            )
            quant_select = gr.Dropdown(
                choices=["FP16/BF16", "INT8", "FP4"],
                value="FP16/BF16",
                label="Quantization Precision",
                info="Model weight precision: FP16/BF16 (standard), INT8 (8-bit), FP4 (4-bit, requires Blackwell)",
            )
            overhead_slider = gr.Slider(
                0,
                50,
                value=20,
                step=5,
                label="GPU Memory Overhead %",
                info="Additional memory overhead percentage for CUDA context, fragmentation, and system buffers",
            )
            btn = gr.Button("Calculate Sizing", variant="primary", size="lg")
        # Right column: computed results.
        with gr.Column():
            gr.Markdown("## Sizing Results")
            with gr.Group():
                res_gpus = gr.Number(
                    label="GPUs Required",
                    precision=0,
                    info="Minimum number of GPUs needed to fit the model and workload",
                )
                res_server = gr.Textbox(
                    label="Recommended Lenovo Server",
                    info="Suggested Lenovo server configuration",
                )
                res_vram = gr.Textbox(
                    label="Total VRAM Required",
                    info="Total video memory needed across all GPUs",
                )
                res_params = gr.Textbox(
                    label="Model Parameters",
                    info="Total number of model parameters in billions",
                )
                with gr.Row():
                    res_ttft = gr.Textbox(
                        label="TTFT - Time to First Token (Prefill latency)",
                        info="time to process input and generate first token",
                    )
                    res_itl = gr.Textbox(
                        label="ITL - Inter-Token Latency",
                        info="time between each generated token",
                    )
                res_warnings = gr.Textbox(
                    label="Analysis Notes and Warnings",
                    lines=4,
                    info="Important notes, warnings, and recommendations about the configuration",
                )
                plot_output = gr.Plot(label="Per-GPU Memory Breakdown Chart")
                mem_text_alt = gr.Textbox(
                    label="Memory Breakdown (Text Description)",
                    info="Textual description of memory allocation for screen readers and accessibility",
                    lines=6,
                )

    def update_repo(choice):
        # Picking a preset fills the repo box; "Custom" clears it for typing.
        return choice if choice != "Custom" else ""

    model_dd.change(update_repo, model_dd, repo_input)
    btn.click(
        calculate_dimensioning,
        inputs=[
            repo_input,
            hf_token,
            gpu_select,
            conn_select,
            users,
            ctx_in,
            ctx_out,
            quant_select,
            overhead_slider,
            rag_chk,
            rag_model_dd,
            rerank_model_dd,
        ],
        outputs=[
            res_params,
            res_vram,
            res_gpus,
            res_ttft,
            res_itl,
            res_server,
            res_warnings,
            plot_output,
            mem_text_alt,
        ],
    )
if __name__ == "__main__":
    # NOTE(review): Blocks.launch() accepts neither `theme` nor `css`; passing
    # them (as the original did) raises TypeError on startup. They are
    # gr.Blocks(...) constructor arguments instead.
    demo.launch()