import gradio as gr
import yaml
import math
import matplotlib.pyplot as plt
import plotly.graph_objects as go
import os
import json
from huggingface_hub import hf_hub_download, HfApi

# --- Configuration & Constants ---
HARDWARE_FILE = "hardware_data.yaml"
MODELS_FILE = "models.yaml"

# Physics Constants
COMPUTE_EFFICIENCY = 0.45
MEMORY_EFFICIENCY = 0.70
INTERCONNECT_EFFICIENCY = 0.65

# Defaults
ACTIVATION_MEMORY_BUFFER_GB = 0.5
DEFAULT_GPU_OVERHEAD_PCT = 20

# Embedding Models VRAM Est. (Weights + Runtime Buffer)
EMBEDDING_MODELS = {
    "External/API (No Local VRAM)": 0.0,
    "Mini (All-MiniLM-L6) ~0.2GB": 0.2,
    "Standard (MPNet-Base/BGE-Base) ~0.6GB": 0.6,
    "Large (BGE-M3/GTE-Large) ~2.5GB": 2.5,
    "LLM-Based (E5-Mistral-7B) ~16GB": 16.0,
}

# Reranker Models VRAM Est. (Weights + Batch Processing Buffer)
RERANKER_MODELS = {
    "None (Skip Reranking)": 0.0,
    "Small (BGE-Reranker-Base) ~0.5GB": 0.5,
    "Large (BGE-Reranker-Large) ~1.5GB": 1.5,
    "LLM-Based (BGE-Reranker-v2-Gemma) ~10GB": 10.0,
}


# --- Data Loading ---
def load_hardware_data():
    if not os.path.exists(HARDWARE_FILE):
        return {}
    with open(HARDWARE_FILE, "r") as f:
        data = yaml.safe_load(f)
    return {gpu["name"]: gpu for gpu in data["gpus"]}


def load_models_data():
    if not os.path.exists(MODELS_FILE):
        return {}
    with open(MODELS_FILE, "r") as f:
        data = yaml.safe_load(f) or {}
    return data.get("models", {})


HARDWARE_DB = load_hardware_data()
MODELS_DB = load_models_data()


# --- Model Analysis ---
class ModelAnalyzer:
    def __init__(self, repo_id, hf_token=None):
        self.repo_id = repo_id
        self.config = {}
        self.error = None
        self.api = HfApi(token=hf_token.strip() if hf_token else None)

        # 1. Try to get Model Info (Total Params) from API first
        self.total_params_safetensors = None
        try:
            model_info = self.api.model_info(repo_id)
            if hasattr(model_info, "safetensors") and model_info.safetensors and "total" in model_info.safetensors:
                self.total_params_safetensors = model_info.safetensors["total"]
        except Exception:
            pass  # Fallback to config parsing

        # 2. Load Config
        if repo_id in MODELS_DB:
            self.config = MODELS_DB[repo_id]
        else:
            try:
                token = hf_token.strip() if hf_token else None
                config_path = hf_hub_download(
                    repo_id=repo_id, filename="config.json", token=token
                )
                with open(config_path, "r") as f:
                    self.config = json.load(f)
            except Exception as e:
                self.error = f"Failed to fetch model: {str(e)}"
                return

        try:
            # Handle nested configs (common in multimodal)
            if "text_config" in self.config:
                self.llm_config = self.config["text_config"]
            elif "llm_config" in self.config:
                self.llm_config = self.config["llm_config"]
            else:
                self.llm_config = self.config

            self.hidden_size = self.llm_config.get("hidden_size", 4096)
            self.num_layers = self.llm_config.get("num_hidden_layers", 32)
            self.num_heads = self.llm_config.get("num_attention_heads", 32)
            self.num_kv_heads = self.llm_config.get("num_key_value_heads", self.num_heads)
            self.vocab_size = self.llm_config.get("vocab_size", 32000)
            self.max_context = self.llm_config.get("max_position_embeddings", 4096)
            self.intermediate_size = self.llm_config.get(
                "intermediate_size", self.hidden_size * 4
            )

            # MoE detection
            self.is_moe = False
            self.num_experts = 1
            self.active_experts = 1

            # Check for MoE config patterns
            self._detect_moe()

            # Calculate Parameters
            self.calculate_params()

        except Exception as e:
            self.error = f"Error parsing config: {str(e)}"

    def _detect_moe(self):
        archs = self.config.get("architectures", [])
        keys = set(self.config.keys()) | set(self.llm_config.keys())

        if (
            any("moe" in a.lower() for a in archs)
            or any("moe" in k.lower() for k in keys)
            or any("expert" in k.lower() for k in keys)
        ):
            self.is_moe = True

        if self.is_moe:
            self.num_experts = (
                self.llm_config.get("num_local_experts")
                or self.llm_config.get("num_experts")
                or self.llm_config.get("n_routed_experts")
                or 8
            )
            self.active_experts = (
                self.llm_config.get("num_experts_per_tok")
                or self.llm_config.get("num_experts_per_token")
                or 2
            )
        elif "notes" in self.config and "moe" in self.config["notes"]:
            moe_cfg = self.config["notes"]["moe"]
            self.is_moe = True
            self.num_experts = moe_cfg.get("num_local_experts", 8)
            self.active_experts = moe_cfg.get("num_experts_per_tok", 2)

    def calculate_params(self):
        # If we got exact params from safetensors, use that
        if self.total_params_safetensors:
            self.total_params = self.total_params_safetensors
        else:
            # Fallback calculation
            self.params_embed = self.vocab_size * self.hidden_size
            head_dim = self.hidden_size // self.num_heads
            kv_dim = head_dim * self.num_kv_heads

            self.params_attn = (
                (self.hidden_size * self.hidden_size)
                + (self.hidden_size * kv_dim) * 2
                + (self.hidden_size * self.hidden_size)
            )
            dense_mlp = 3 * self.hidden_size * self.intermediate_size

            if self.is_moe:
                mlp_total = dense_mlp * self.num_experts
            else:
                mlp_total = dense_mlp

            self.params_norm = 2 * self.hidden_size
            self.params_layer_total = (
                self.params_attn + mlp_total + self.params_norm
            )
            self.total_params = self.params_embed + (
                self.num_layers * self.params_layer_total
            )

        # Active Params Calculation (using improved heuristic for MoE)
        if self.is_moe:
            expert_param_fraction = 0.8  # 80% of params are in experts
            always_active = self.total_params * (1 - expert_param_fraction)
            expert_params = self.total_params * expert_param_fraction
            expert_ratio = self.active_experts / self.num_experts
            self.active_params = int(
                always_active + (expert_params * expert_ratio)
            )
        else:
            self.active_params = self.total_params


# --- Calculation Engine ---
def calculate_dimensioning(
    model_name_or_repo,
    hf_token,
    gpu_name,
    connectivity_type,
    concurrent_users,
    context_in,
    context_out,
    quantization,
    gpu_overhead_pct,
    rag_enabled,
    rag_model_key,
    reranker_model_key,
):
    analyzer = ModelAnalyzer(model_name_or_repo, hf_token)
    if analyzer.error:
        return error_result(analyzer.error)

    if gpu_name not in HARDWARE_DB:
        return error_result(f"GPU '{gpu_name}' not found in database.")

    gpu_spec = HARDWARE_DB[gpu_name]

    # 2. Interconnect & Bandwidth Logic
    nvlink_bw = gpu_spec.get("interconnect_bw_gb_s", 0)
    pcie_bw = gpu_spec.get("pcie_bw_gb_s", 64)
    gpu_has_nvlink = nvlink_bw > 0

    if connectivity_type == "NVLink":
        if not gpu_has_nvlink:
            return error_result(f"Error: {gpu_name} does not support NVLink.")
        using_nvlink = True
        interconnect_bw_effective = nvlink_bw * INTERCONNECT_EFFICIENCY * 1e9
    elif connectivity_type == "PCIe / Standard":
        using_nvlink = False
        interconnect_bw_effective = pcie_bw * 1e9  # PCIe usually raw
    else:  # Auto
        using_nvlink = gpu_has_nvlink
        interconnect_bw_effective = (
            (nvlink_bw if using_nvlink else pcie_bw) * 1e9
        )

    # --- Precision ---
    fp4_supported = gpu_spec.get("fp4_supported", False)

    if quantization == "FP16/BF16":
        bytes_per_param = 2
    elif quantization == "INT8":
        bytes_per_param = 1
    elif quantization == "FP4":
        if not fp4_supported:
            return error_result(f"Error: {gpu_name} does not support FP4.")
        bytes_per_param = 0.5
    else:
        bytes_per_param = 2

    # --- MEMORY CALCULATION ---

    # Static Footprint
    mem_weights = analyzer.total_params * bytes_per_param

    # RAG Memory (Embedding + Reranker)
    mem_rag = 0
    if rag_enabled:
        embed_gb = EMBEDDING_MODELS.get(rag_model_key, 0.6)
        rerank_gb = RERANKER_MODELS.get(reranker_model_key, 0.5)
        mem_rag = (embed_gb + rerank_gb) * (1024**3)

    static_footprint = mem_weights + mem_rag

    # Dynamic Footprint (KV + Activation per user)
    head_dim = analyzer.hidden_size // analyzer.num_heads
    total_tokens = context_in + context_out

    # KV Cache
    kv_bytes = 2
    mem_kv_per_user = (
        2
        * analyzer.num_layers
        * analyzer.num_kv_heads
        * head_dim
        * total_tokens
        * kv_bytes
    )

    # Activation buffer
    mem_act_per_user = ACTIVATION_MEMORY_BUFFER_GB * 1024**3

    dynamic_per_user = mem_kv_per_user + mem_act_per_user
    total_dynamic = dynamic_per_user * concurrent_users

    # Total & Overhead
    raw_total_mem = static_footprint + total_dynamic
    total_mem_required = raw_total_mem * (1 + gpu_overhead_pct / 100)

    gpu_mem_capacity = gpu_spec["memory_gb"] * (1024**3)
    num_gpus = math.ceil(total_mem_required / gpu_mem_capacity)

    # --- LATENCY CALCULATION ---
    compute_mode = "fp16_tflops_dense"
    single_gpu_flops = (
        gpu_spec.get(compute_mode, 100) * 1e12 * COMPUTE_EFFICIENCY
    )
    if quantization == "FP4":
        single_gpu_flops *= 2.5

    single_gpu_bw = (
        gpu_spec.get("bandwidth_gb_s", 1000) * 1e9 * MEMORY_EFFICIENCY
    )

    if num_gpus == 1:
        effective_flops = single_gpu_flops
        effective_mem_bw = single_gpu_bw
        ttft_penalty = 2.0
        itl_penalty = 1.0
    elif using_nvlink:
        effective_flops = single_gpu_flops * num_gpus
        effective_mem_bw = single_gpu_bw * num_gpus
        ttft_penalty = 2.0
        itl_penalty = 1.0
    else:
        # PCIe Bottleneck Logic
        effective_flops = single_gpu_flops * num_gpus
        effective_mem_bw = single_gpu_bw  # Capped at single card
        n = num_gpus
        ttft_penalty = 1.2 * n * n - n
        itl_penalty = n

    # TTFT (Prefill) + RAG Latency

    # 1. RAG Processing (Embedding + Reranking)
    t_rag_processing = 0
    if rag_enabled:
        # Base Embedding Latency (Encode Query)
        if "Mini" in rag_model_key:
            t_rag_processing += 0.02
        elif "Large" in rag_model_key:
            t_rag_processing += 0.05
        elif "LLM" in rag_model_key:
            t_rag_processing += 0.15
        else:
            t_rag_processing += 0.03

        # Reranking Latency (Process Documents)
        if "None" not in reranker_model_key:
            if "Small" in reranker_model_key:
                t_rag_processing += 0.15  # 150ms
            elif "Large" in reranker_model_key:
                t_rag_processing += 0.35  # 350ms
            elif "LLM" in reranker_model_key:
                t_rag_processing += 0.80  # 800ms

    # 2. LLM Compute Time
    prefill_ops = 2 * analyzer.active_params * context_in * concurrent_users
    t_compute_prefill = (prefill_ops / effective_flops) * ttft_penalty
    t_mem_prefill = mem_weights / effective_mem_bw

    ttft = max(t_compute_prefill, t_mem_prefill) + t_rag_processing

    # ITL (Decode)
    gen_ops = 2 * analyzer.active_params * concurrent_users
    t_compute_gen = (gen_ops / effective_flops) * itl_penalty
    bytes_per_step = mem_weights + (total_dynamic / concurrent_users)
    t_mem_gen = (bytes_per_step / effective_mem_bw) * itl_penalty
    itl = max(t_compute_gen, t_mem_gen)

    # --- Result Formatting ---
    server_name = gpu_spec.get("recommended_server", "Contact Lenovo Support")
    if num_gpus > 8:
        server_name += " (Requires Multi-Node Clustering)"

    warnings = []
    if not using_nvlink and num_gpus > 1:
        warnings.append(
            f"⚠️ No NVLink: Effective Bandwidth capped at {gpu_spec['bandwidth_gb_s']} GB/s. High latency penalty."
        )
    if itl > 0.150:
        warnings.append(
            f"⚠️ High Latency: ITL is {itl * 1000:.0f}ms (>150ms)."
        )
    if t_rag_processing > 0.5:
        warnings.append(
            f"⚠️ High RAG Latency: Reranking is adding {t_rag_processing * 1000:.0f}ms to TTFT."
        )
    if analyzer.is_moe:
        warnings.append(
            f"ℹ️ MoE Model: Active params {analyzer.active_params / 1e9:.1f}B used for compute."
        )
    if rag_enabled:
        warnings.append(
            f"ℹ️ RAG Enabled: Allocating {mem_rag / (1024**3):.1f}GB for Models (Embed+Rerank)."
        )

    # Chart (Per GPU)
    overhead_bytes = raw_total_mem * (gpu_overhead_pct / 100)
    fig = create_mem_chart_per_gpu(
        mem_weights,
        mem_rag,
        total_dynamic,
        overhead_bytes,
        gpu_mem_capacity,
        num_gpus,
    )

    # Textual memory breakdown for accessibility (WCAG 1.1.1 - Text Alternatives)
    w_per_gb = (mem_weights / num_gpus) / (1024**3)
    r_per_gb = (mem_rag / num_gpus) / (1024**3)
    d_per_gb = (total_dynamic / num_gpus) / (1024**3)
    o_per_gb = (overhead_bytes / num_gpus) / (1024**3)
    cap_gb = gpu_mem_capacity / (1024**3)
    used_gb = w_per_gb + r_per_gb + d_per_gb + o_per_gb
    free_gb = max(0, cap_gb - used_gb)
    total_used_pct = (used_gb / cap_gb * 100) if cap_gb > 0 else 0

    # Calculate percentages for display
    w_pct = (w_per_gb / cap_gb * 100) if cap_gb > 0 else 0
    r_pct = (r_per_gb / cap_gb * 100) if cap_gb > 0 else 0
    d_pct = (d_per_gb / cap_gb * 100) if cap_gb > 0 else 0
    o_pct = (o_per_gb / cap_gb * 100) if cap_gb > 0 else 0
    free_pct = (free_gb / cap_gb * 100) if cap_gb > 0 else 0

    mem_text_alt = (
        f"Per-GPU Memory Breakdown (Total Capacity: {cap_gb:.0f} GB):\n"
        f"• Weights: {w_per_gb:.1f} GB ({w_pct:.1f}%) - Model parameters stored in memory. Fixed size based on model architecture and quantization.\n"
        f"• RAG Models: {r_per_gb:.1f} GB ({r_pct:.1f}%) - Embedding and reranker models. Only allocated if RAG is enabled.\n"
        f"• Dynamic (KV+Act): {d_per_gb:.1f} GB ({d_pct:.1f}%) - KV cache and activation buffers. Grows with concurrent users, input context length, and output tokens.\n"
        f"• Overhead: {o_per_gb:.1f} GB ({o_pct:.1f}%) - CUDA context, memory fragmentation, and system buffers. Configurable percentage of total memory.\n"
        f"• Free: {free_gb:.1f} GB ({free_pct:.1f}%) - Available memory headroom for additional operations."
    )

    return (
        f"{analyzer.total_params / 1e9:.1f}B (Active: {analyzer.active_params / 1e9:.1f}B)",
        f"{total_mem_required / (1024**3):.1f} GB",
        num_gpus,
        f"{ttft * 1000:.0f} ms",
        f"{itl * 1000:.0f} ms",
        server_name,
        "\n".join(warnings) if warnings else "No warnings.",
        fig,
        mem_text_alt,
    )


def create_mem_chart_per_gpu(
    weights, rag, dynamic, overhead, single_gpu_cap, num_gpus
):
    # Normalize to Per-GPU view
    w_per = (weights / num_gpus) / (1024**3)
    r_per = (rag / num_gpus) / (1024**3)
    d_per = (dynamic / num_gpus) / (1024**3)
    o_per = (overhead / num_gpus) / (1024**3)
    cap_gb = single_gpu_cap / (1024**3)

    used = w_per + r_per + d_per + o_per
    free = max(0, cap_gb - used)

    # Modern, accessible color palette (WCAG AA compliant)
    labels = ["Weights", "RAG Models", "Dynamic (KV+Act)", "Overhead", "Free (Per GPU)"]
    values = [w_per, r_per, d_per, o_per, free]

    # Filter out zero values for cleaner chart
    clean_labels = []
    clean_values = []
    colors_full = ["#4A90E2", "#10b981", "#8b5cf6", "#f59e0b", "#BDC3C7"]
    clean_colors = []

    for i, val in enumerate(values):
        if val > 0.05:  # Only show if > 50MB
            clean_labels.append(labels[i])
            clean_values.append(val)
            clean_colors.append(colors_full[i])

    # Professional color palette: Blue, Green, Purple, Orange, Gray
    colors = clean_colors if clean_colors else colors_full[: len(clean_values)]

    # Calculate percentages for hover text
    total = sum(clean_values) if clean_values else sum(values)
    percentages = [
        (v / total * 100) if total > 0 else 0
        for v in (clean_values if clean_values else values)
    ]

    # Create hover text with detailed information
    display_labels = clean_labels if clean_labels else labels
    display_values = clean_values if clean_values else values
    hover_texts = [
        f"{display_labels[i]}<br>"
        f"Value: {display_values[i]:.1f} GB<br>"
        f"Percentage: {percentages[i]:.1f}%<br>"
        f"Capacity: {cap_gb:.0f} GB"
        for i in range(len(display_labels))
    ]

    # Create donut chart using plotly
    fig = go.Figure(
        data=[
            go.Pie(
                labels=display_labels,
                values=display_values,
                hole=0.5,  # Creates the donut (hole in the middle)
                marker=dict(colors=colors, line=dict(color="#FFFFFF", width=2)),
                textinfo="label+percent",
                textposition="outside",
                hovertemplate="%{hovertext}<extra></extra>",
                hovertext=hover_texts,
            )
        ]
    )

    # Update layout for better appearance
    fig.update_layout(
        title={
            "text": f"Per-GPU Memory Usage (Capacity: {cap_gb:.0f} GB)",
            "x": 0.5,
            "xanchor": "center",
            "font": {"size": 16, "family": "Arial, sans-serif"},
        },
        showlegend=False,
        font=dict(family="Arial, sans-serif", size=12),
        margin=dict(l=20, r=20, t=50, b=20),
        height=500,
    )

    return fig


def error_result(msg):
    # Create an empty plotly figure for error state
    empty_fig = go.Figure()
    empty_fig.add_annotation(
        text="Error: Unable to generate chart",
        xref="paper",
        yref="paper",
        x=0.5,
        y=0.5,
        showarrow=False,
        font=dict(size=14),
    )
    empty_fig.update_layout(
        title="Memory Breakdown",
        height=500,
        showlegend=False,
    )
    return (
        "Error",
        "Error",
        0,
        "-",
        "-",
        "Check Inputs",
        f"Error: {msg}",
        empty_fig,
        "Memory breakdown not available due to calculation error.",
    )


# --- UI Setup ---
# Custom CSS for better font rendering
custom_css = """
* {
    font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', 'Roboto', 'Oxygen', 'Ubuntu', 'Cantarell', 'Fira Sans', 'Droid Sans', 'Helvetica Neue', sans-serif !important;
    -webkit-font-smoothing: antialiased;
    -moz-osx-font-smoothing: grayscale;
}
"""

with gr.Blocks(title="GPUguesstimator") as demo:
    gr.Markdown(
        """
        # GPUguesstimator

        Physics-based sizing tool for calculating VRAM requirements, compute capacity, and interconnect bottlenecks for Large Language Model inference.
        """
    )

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Workload Configuration")
            model_keys = list(MODELS_DB.keys())
            model_dd = gr.Dropdown(
                choices=model_keys + ["Custom"],
                value=model_keys[0] if model_keys else "Custom",
                label="Model Preset",
                info="Select a preset model or choose Custom to enter a HuggingFace repository ID",
            )
            repo_input = gr.Textbox(
                label="HuggingFace Repository ID",
                value=model_keys[0] if model_keys else "",
                placeholder="e.g., meta-llama/Meta-Llama-3-70B-Instruct",
                info="Enter the HuggingFace model repository identifier",
            )
            hf_token = gr.Textbox(
                label="HuggingFace Token (Optional)",
                type="password",
                info="Required for accessing gated models. Leave empty for public models.",
            )

            users = gr.Slider(
                1,
                500,
                value=50,
                step=1,
                label="Concurrent Users",
                info="Number of simultaneous inference requests to handle",
            )
            ctx_in = gr.Slider(
                128,
                128000,
                value=1024,
                step=128,
                label="Input Context Length (Tokens)",
                info="Maximum number of input tokens per request",
            )
            ctx_out = gr.Slider(
                128,
                16384,
                value=256,
                step=128,
                label="Output Tokens (Generation Length)",
                info="Maximum number of tokens to generate per request",
            )

            with gr.Group():
                gr.Markdown("#### Retrieval Augmented Generation (RAG)")
                rag_chk = gr.Checkbox(
                    label="Enable RAG Pipeline", value=False
                )
                with gr.Row():
                    rag_model_dd = gr.Dropdown(
                        choices=list(EMBEDDING_MODELS.keys()),
                        value="Standard (MPNet-Base/BGE-Base) ~0.6GB",
                        label="Embedding Model",
                        interactive=True,
                    )
                    rerank_model_dd = gr.Dropdown(
                        choices=list(RERANKER_MODELS.keys()),
                        value="None (Skip Reranking)",
                        label="Reranker Model",
                        interactive=True,
                    )

            gr.Markdown("## Infrastructure Configuration")
            gpu_keys = list(HARDWARE_DB.keys())
            default_gpu = gpu_keys[0] if gpu_keys else "NVIDIA H100-80GB SXM5"

            gpu_select = gr.Dropdown(
                choices=gpu_keys,
                value=default_gpu,
                label="GPU Model",
                info="Select the GPU model for inference",
            )
            conn_select = gr.Dropdown(
                choices=["Auto", "NVLink", "PCIe / Standard"],
                value="Auto",
                label="Interconnect Type",
                info="Auto uses GPU default, NVLink for high-bandwidth, PCIe for standard connections",
            )
            quant_select = gr.Dropdown(
                choices=["FP16/BF16", "INT8", "FP4"],
                value="FP16/BF16",
                label="Quantization Precision",
                info="Model weight precision: FP16/BF16 (standard), INT8 (8-bit), FP4 (4-bit, requires Blackwell)",
            )
            overhead_slider = gr.Slider(
                0,
                50,
                value=20,
                step=5,
                label="GPU Memory Overhead %",
                info="Additional memory overhead percentage for CUDA context, fragmentation, and system buffers",
            )

            btn = gr.Button("Calculate Sizing", variant="primary", size="lg")

        with gr.Column():
            gr.Markdown("## Sizing Results")
            with gr.Group():
                res_gpus = gr.Number(
                    label="GPUs Required",
                    precision=0,
                    info="Minimum number of GPUs needed to fit the model and workload",
                )
                res_server = gr.Textbox(
                    label="Recommended Lenovo Server",
                    info="Suggested Lenovo server configuration",
                )
                res_vram = gr.Textbox(
                    label="Total VRAM Required",
                    info="Total video memory needed across all GPUs",
                )
                res_params = gr.Textbox(
                    label="Model Parameters",
                    info="Total number of model parameters in billions",
                )
                with gr.Row():
                    res_ttft = gr.Textbox(
                        label="TTFT - Time to First Token (Prefill latency)",
                        info="time to process input and generate first token",
                    )
                    res_itl = gr.Textbox(
                        label="ITL - Inter-Token Latency",
                        info="time between each generated token",
                    )
                res_warnings = gr.Textbox(
                    label="Analysis Notes and Warnings",
                    lines=4,
                    info="Important notes, warnings, and recommendations about the configuration",
                )
                plot_output = gr.Plot(label="Per-GPU Memory Breakdown Chart")
                mem_text_alt = gr.Textbox(
                    label="Memory Breakdown (Text Description)",
                    info="Textual description of memory allocation for screen readers and accessibility",
                    lines=6,
                )

    def update_repo(choice):
        return choice if choice != "Custom" else ""

    model_dd.change(update_repo, model_dd, repo_input)

    btn.click(
        calculate_dimensioning,
        inputs=[
            repo_input,
            hf_token,
            gpu_select,
            conn_select,
            users,
            ctx_in,
            ctx_out,
            quant_select,
            overhead_slider,
            rag_chk,
            rag_model_dd,
            rerank_model_dd,
        ],
        outputs=[
            res_params,
            res_vram,
            res_gpus,
            res_ttft,
            res_itl,
            res_server,
            res_warnings,
            plot_output,
            mem_text_alt,
        ],
    )

if __name__ == "__main__":
    demo.launch(theme=gr.themes.Soft(), css=custom_css)