Livengood Claude commited on
Commit
548f0fb
·
1 Parent(s): 26bc78c

Fix UI bugs: tabs, accordion, examples, and sliders

Browse files

Key fixes:
- Use gr.Tab instead of gr.TabItem for reliable tab switching
- Move all function definitions outside Blocks context
- Remove problematic hidden row pattern for search results
- Move pandas import to module level
- Simplify UI structure to avoid nesting issues
- Rename functions to avoid naming conflicts with components

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (1) hide show
  1. app.py +297 -529
app.py CHANGED
@@ -16,6 +16,7 @@ Fetches model metadata from HF Hub and calculates:
16
  import gradio as gr
17
  from huggingface_hub import HfApi, hf_hub_download, list_models
18
  import json
 
19
  from functools import lru_cache
20
  from datetime import datetime
21
 
@@ -142,19 +143,13 @@ def estimate_params_from_safetensors(info) -> tuple[int, str]:
142
 
143
  def get_head_dim(config: dict) -> int:
144
  """Calculate head dimension from config, with fallbacks."""
145
- # Try to get it directly
146
  if "head_dim" in config:
147
  return config["head_dim"]
148
-
149
- # Calculate from hidden_size and num_attention_heads
150
  hidden_size = config.get("hidden_size", config.get("n_embd", 0))
151
  num_heads = config.get("num_attention_heads", config.get("n_head", 0))
152
-
153
  if hidden_size and num_heads:
154
  return hidden_size // num_heads
155
-
156
- # Common defaults by model family
157
- return 128 # Most common default
158
 
159
 
160
  def estimate_kv_cache_size(
@@ -165,44 +160,25 @@ def estimate_kv_cache_size(
165
  batch_size: int = 1,
166
  dtype_bytes: int = 2
167
  ) -> int:
168
- """
169
- KV cache size = 2 * num_layers * batch_size * context_length * num_kv_heads * head_dim * dtype_bytes
170
-
171
- The 2 accounts for both K and V caches.
172
- """
173
- kv_cache_bytes = 2 * num_layers * batch_size * context_length * num_kv_heads * head_dim * dtype_bytes
174
- return kv_cache_bytes
175
 
176
 
177
- def estimate_training_memory(
178
- param_count: int,
179
- dtype_bytes: int,
180
- optimizer: str = "AdamW"
181
- ) -> dict:
182
- """
183
- Estimate training memory requirements.
184
-
185
- For training, we need:
186
- - Model weights
187
- - Gradients (same size as weights)
188
- - Optimizer states (varies by optimizer)
189
- - Activations (highly variable, estimated)
190
- """
191
  weights_bytes = param_count * dtype_bytes
192
  gradients_bytes = param_count * dtype_bytes
193
 
194
- # Optimizer states
195
  if optimizer == "AdamW":
196
- # AdamW stores: m (momentum), v (variance) in FP32
197
- optimizer_bytes = param_count * 4 * 2 # 2 states, 4 bytes each
198
  elif optimizer == "SGD":
199
- optimizer_bytes = 0 # No extra state (momentum optional)
200
  elif optimizer == "SGD + Momentum":
201
- optimizer_bytes = param_count * 4 # Momentum buffer
202
  elif optimizer == "8-bit Adam":
203
- optimizer_bytes = param_count * 1 * 2 # 2 states, 1 byte each
204
  else:
205
- optimizer_bytes = param_count * 4 * 2 # Default to AdamW
206
 
207
  return {
208
  "weights": weights_bytes,
@@ -215,25 +191,22 @@ def estimate_training_memory(
215
  def calculate_multi_gpu_split(total_vram_gb: float, num_gpus: int, parallelism: str) -> dict:
216
  """Calculate memory distribution across multiple GPUs."""
217
  if parallelism == "Tensor Parallelism":
218
- # Weights and KV cache split evenly
219
  per_gpu = total_vram_gb / num_gpus
220
- overhead = 0.05 * total_vram_gb # Communication overhead
221
  return {
222
  "per_gpu": per_gpu + (overhead / num_gpus),
223
  "total": total_vram_gb + overhead,
224
  "efficiency": "High (best for inference)",
225
  }
226
  elif parallelism == "Pipeline Parallelism":
227
- # Layers distributed, but activation memory at boundaries
228
  per_gpu = total_vram_gb / num_gpus
229
- overhead = 0.1 * total_vram_gb # Activation memory overhead
230
  return {
231
  "per_gpu": per_gpu + (overhead / num_gpus),
232
  "total": total_vram_gb + overhead,
233
  "efficiency": "Medium (good for training)",
234
  }
235
- else: # Data Parallelism
236
- # Full model on each GPU
237
  return {
238
  "per_gpu": total_vram_gb,
239
  "total": total_vram_gb * num_gpus,
@@ -249,36 +222,18 @@ def estimate_lora_memory(
249
  target_modules: int = 4,
250
  use_qlora: bool = False
251
  ) -> dict:
252
- """
253
- Estimate LoRA/QLoRA fine-tuning memory requirements.
254
-
255
- LoRA adds low-rank adaptation matrices to specific layers.
256
- QLoRA additionally quantizes the base model to 4-bit.
257
- """
258
- # Base model weights
259
  if use_qlora:
260
- # QLoRA: 4-bit quantized weights
261
- base_weights_bytes = param_count * 0.5 # 4-bit = 0.5 bytes/param
262
  else:
263
  base_weights_bytes = param_count * dtype_bytes
264
 
265
- # LoRA adapter parameters (A and B matrices for each target module)
266
- # Typical target modules: q_proj, k_proj, v_proj, o_proj (4 modules)
267
- # Each LoRA layer: hidden_size * rank (A) + rank * hidden_size (B)
268
- # Approximate as 2 * hidden_size * rank per module
269
- # For simplicity, estimate based on total params
270
- lora_params_ratio = (lora_rank * 2 * target_modules) / 1000 # Rough estimate
271
- lora_params = int(param_count * lora_params_ratio * 0.01) # Usually ~0.1-1% of base
272
  lora_weights_bytes = lora_params * dtype_bytes
273
-
274
- # Gradients only for LoRA params (not frozen base)
275
  gradients_bytes = lora_params * dtype_bytes
276
-
277
- # Optimizer states for LoRA params only
278
- optimizer_bytes = lora_params * 4 * 2 # AdamW: 2 states, 4 bytes each
279
-
280
- # Activations (still needed, but can use gradient checkpointing)
281
- activation_bytes = base_weights_bytes * 0.5 # Reduced with checkpointing
282
 
283
  return {
284
  "base_weights": base_weights_bytes,
@@ -288,7 +243,7 @@ def estimate_lora_memory(
288
  "optimizer": optimizer_bytes,
289
  "activations": activation_bytes,
290
  "total": base_weights_bytes + lora_weights_bytes + gradients_bytes + optimizer_bytes + activation_bytes,
291
- "vs_full_finetune_ratio": 0.3 if use_qlora else 0.5, # Rough memory savings
292
  }
293
 
294
 
@@ -299,31 +254,18 @@ def estimate_throughput(
299
  context_length: int = 4096,
300
  is_prefill: bool = False
301
  ) -> dict:
302
- """
303
- Estimate tokens per second throughput.
304
-
305
- Based on roofline model: throughput limited by compute or memory bandwidth.
306
- Most LLM inference is memory-bound for single-batch decode.
307
- """
308
- # Rough estimate: 2 FLOPs per parameter per token (forward pass)
309
  flops_per_token = 2 * param_count
310
-
311
- # Peak theoretical throughput (compute-bound)
312
  peak_tokens_per_sec = (gpu_tflops * 1e12) / flops_per_token
313
-
314
- # Memory-bound estimate (more realistic for decode)
315
- # Assume ~1TB/s memory bandwidth for modern GPUs
316
- memory_bandwidth_tbs = 1.0 # TB/s, rough average
317
- bytes_per_token = param_count * 2 # FP16 weights need to be read
318
  memory_bound_tokens = (memory_bandwidth_tbs * 1e12) / bytes_per_token
319
 
320
- # Prefill is more compute-bound, decode is memory-bound
321
  if is_prefill:
322
  effective_tokens = min(peak_tokens_per_sec, memory_bound_tokens * 10) * batch_size
323
  else:
324
  effective_tokens = min(peak_tokens_per_sec, memory_bound_tokens) * batch_size
325
 
326
- # Apply realistic efficiency factor (typically 30-60% of theoretical)
327
  efficiency = 0.4
328
  realistic_tokens = effective_tokens * efficiency
329
 
@@ -336,14 +278,9 @@ def estimate_throughput(
336
  }
337
 
338
 
339
- def calculate_cost_estimate(
340
- vram_required: float,
341
- hours_per_day: float = 8,
342
- days_per_month: float = 22
343
- ) -> list:
344
  """Calculate cost estimates for cloud GPUs that fit the model."""
345
  estimates = []
346
-
347
  for gpu_name, (vram, instance, category, hourly_cost, tflops) in GPU_SPECS.items():
348
  if vram >= vram_required and hourly_cost > 0:
349
  daily_cost = hourly_cost * hours_per_day
@@ -356,21 +293,19 @@ def calculate_cost_estimate(
356
  "monthly": monthly_cost,
357
  "instance": instance,
358
  })
359
-
360
  return sorted(estimates, key=lambda x: x["hourly"])
361
 
362
 
363
- def search_models(query: str, limit: int = 10) -> list:
364
  """Search HuggingFace models by name."""
365
  if not query or len(query) < 2:
366
  return []
367
-
368
  try:
369
  models = list(list_models(
370
  search=query,
371
  sort="downloads",
372
  direction=-1,
373
- limit=limit,
374
  filter="text-generation"
375
  ))
376
  return [m.id for m in models]
@@ -378,27 +313,10 @@ def search_models(query: str, limit: int = 10) -> list:
378
  return []
379
 
380
 
381
- def calculate_flash_attention_savings(
382
- kv_cache_bytes: int,
383
- context_length: int
384
- ) -> dict:
385
- """
386
- Estimate memory savings from Flash Attention.
387
-
388
- Flash Attention uses tiling to reduce memory from O(n^2) to O(n).
389
- """
390
- # Standard attention materializes full attention matrix
391
- # Flash Attention streams through, never materializing full matrix
392
- # Savings primarily in activation memory, not KV cache
393
-
394
- # KV cache itself is O(n), so Flash Attention doesn't reduce it
395
- # But it dramatically reduces peak memory during computation
396
-
397
- # Estimate: Flash Attention reduces peak memory by avoiding
398
- # the O(n^2) attention matrix materialization
399
- standard_attention_overhead = context_length * context_length * 2 # FP16
400
- flash_attention_overhead = context_length * 128 * 2 # Block size overhead
401
-
402
  savings_bytes = standard_attention_overhead - flash_attention_overhead
403
  savings_ratio = 1 - (flash_attention_overhead / max(standard_attention_overhead, 1))
404
 
@@ -423,29 +341,24 @@ def calculate_vram(
423
  lora_rank: int = 16,
424
  show_throughput: bool = True,
425
  show_cost: bool = True
426
- ) -> tuple[str, dict | None]:
427
- """Main calculation function. Returns (markdown_results, chart_data)."""
428
-
429
- # Validate inputs
430
  model_id = model_id.strip()
431
  if not model_id:
432
  raise gr.Error("Please enter a model ID")
433
-
434
  if "/" not in model_id:
435
  raise gr.Error("Model ID should be in format 'organization/model-name'")
436
 
437
- # Fetch model info
438
  info = get_model_info(model_id)
439
  config = get_config(model_id)
440
 
441
  results = []
442
  results.append(f"## Model: [{model_id}](https://huggingface.co/{model_id})\n")
443
 
444
- # Get parameter count and dtype
445
  param_count, dominant_dtype = estimate_params_from_safetensors(info)
446
 
447
  if param_count == 0:
448
- results.append("⚠️ Could not determine parameter count from safetensors metadata.\n")
449
  results.append("Model may use pytorch_model.bin or other format.\n")
450
  return "\n".join(results), None
451
 
@@ -456,13 +369,11 @@ def calculate_vram(
456
  results.append(f"**Dominant dtype:** {dominant_dtype} ({dtype_bytes} bytes/param)")
457
  results.append(f"**Mode:** {mode}")
458
 
459
- # Model weights VRAM
460
  weights_bytes = param_count * dtype_bytes
461
  weights_gb = bytes_to_gb(weights_bytes)
462
- results.append(f"\n### 📦 Weight Memory")
463
  results.append(f"Model weights: **{weights_gb:.2f} GB**")
464
 
465
- # Architecture details
466
  num_layers = config.get("num_hidden_layers", config.get("n_layer", 0))
467
  hidden_size = config.get("hidden_size", config.get("n_embd", 0))
468
  num_attention_heads = config.get("num_attention_heads", config.get("n_head", 0))
@@ -470,72 +381,61 @@ def calculate_vram(
470
  head_dim = get_head_dim(config)
471
  max_position = config.get("max_position_embeddings", config.get("n_positions", "N/A"))
472
 
473
- results.append(f"\n### 🏗️ Architecture (from config.json)")
474
  if "_error" in config:
475
- results.append(f"⚠️ Could not fetch config.json (model may be gated)")
476
  kv_gb = 0
477
  elif num_layers and hidden_size:
478
- results.append(f"- **Layers:** {num_layers}")
479
- results.append(f"- **Hidden size:** {hidden_size}")
480
- results.append(f"- **Attention heads:** {num_attention_heads}")
481
- results.append(f"- **KV heads:** {num_kv_heads} {'(GQA)' if num_kv_heads != num_attention_heads else '(MHA)'}")
482
- results.append(f"- **Head dimension:** {head_dim}")
483
- results.append(f"- **Max context:** {max_position:,}" if isinstance(max_position, int) else f"- **Max context:** {max_position}")
484
-
485
- # KV Cache calculation
486
- results.append(f"\n### 💾 KV Cache (batch_size={batch_size})")
 
 
487
  results.append("| Context | KV Cache | + Weights | Status |")
488
  results.append("|---------|----------|-----------|--------|")
489
 
490
- # Show relevant context lengths
491
  context_points = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
492
  for ctx_len in context_points:
493
  if ctx_len > context_length * 2 and ctx_len > 8192:
494
  break
495
- kv_bytes = estimate_kv_cache_size(
496
- num_layers, num_kv_heads, head_dim, ctx_len, batch_size, dtype_bytes
497
- )
498
  kv_gb_temp = bytes_to_gb(kv_bytes)
499
  total_temp = weights_gb + kv_gb_temp
500
- marker = " ** selected**" if ctx_len == context_length else ""
501
  results.append(f"| {ctx_len:,} | {kv_gb_temp:.2f} GB | {total_temp:.2f} GB |{marker} |")
502
 
503
- # Calculate for selected context
504
- kv_bytes = estimate_kv_cache_size(
505
- num_layers, num_kv_heads, head_dim, context_length, batch_size, dtype_bytes
506
- )
507
  kv_gb = bytes_to_gb(kv_bytes)
508
  else:
509
  results.append("Could not find architecture details")
510
  kv_gb = 0
511
 
512
- # Flash Attention savings
513
  flash_savings = None
514
  if use_flash_attention and kv_gb > 0:
515
- kv_bytes = estimate_kv_cache_size(
516
- num_layers, num_kv_heads, head_dim, context_length, batch_size, dtype_bytes
517
- )
518
  flash_savings = calculate_flash_attention_savings(kv_bytes, context_length)
519
 
520
- # Calculate total based on mode
521
  if mode == "Training (Full)":
522
  training_mem = estimate_training_memory(param_count, dtype_bytes, optimizer)
523
  base_gb = bytes_to_gb(training_mem["total_base"])
524
-
525
- # Activations estimation (rough: ~2x weights for typical batch)
526
  activation_gb = weights_gb * 2 * batch_size
527
  if use_flash_attention and flash_savings:
528
  activation_gb -= flash_savings["savings_gb"]
529
  activation_gb = max(0.1, activation_gb)
530
-
531
  total_gb = base_gb + kv_gb + activation_gb
532
 
533
- results.append(f"\n### 🎓 Training Memory Breakdown")
534
- results.append(f"- **Weights:** {weights_gb:.2f} GB")
535
- results.append(f"- **Gradients:** {bytes_to_gb(training_mem['gradients']):.2f} GB")
536
- results.append(f"- **Optimizer ({optimizer}):** {bytes_to_gb(training_mem['optimizer']):.2f} GB")
537
- results.append(f"- **KV Cache:** {kv_gb:.2f} GB")
538
- results.append(f"- **Activations (est.):** {activation_gb:.2f} GB")
539
 
540
  chart_data = {
541
  "Weights": weights_gb,
@@ -549,12 +449,12 @@ def calculate_vram(
549
  lora_mem = estimate_lora_memory(param_count, dtype_bytes, lora_rank, use_qlora=False)
550
  total_gb = bytes_to_gb(lora_mem["total"])
551
 
552
- results.append(f"\n### 🔧 LoRA Fine-tuning (rank={lora_rank})")
553
- results.append(f"- **Base weights (frozen):** {bytes_to_gb(lora_mem['base_weights']):.2f} GB")
554
- results.append(f"- **LoRA adapters:** {bytes_to_gb(lora_mem['lora_weights']):.3f} GB ({lora_mem['lora_params']:,} params)")
555
- results.append(f"- **Gradients (LoRA only):** {bytes_to_gb(lora_mem['gradients']):.3f} GB")
556
- results.append(f"- **Optimizer states:** {bytes_to_gb(lora_mem['optimizer']):.3f} GB")
557
- results.append(f"- **Activations:** {bytes_to_gb(lora_mem['activations']):.2f} GB")
558
  results.append(f"\n*Saves ~{(1-lora_mem['vs_full_finetune_ratio'])*100:.0f}% vs full fine-tuning*")
559
 
560
  chart_data = {
@@ -569,12 +469,12 @@ def calculate_vram(
569
  lora_mem = estimate_lora_memory(param_count, dtype_bytes, lora_rank, use_qlora=True)
570
  total_gb = bytes_to_gb(lora_mem["total"])
571
 
572
- results.append(f"\n### 🔧 QLoRA Fine-tuning (4-bit base, rank={lora_rank})")
573
- results.append(f"- **Base weights (4-bit):** {bytes_to_gb(lora_mem['base_weights']):.2f} GB")
574
- results.append(f"- **LoRA adapters:** {bytes_to_gb(lora_mem['lora_weights']):.3f} GB ({lora_mem['lora_params']:,} params)")
575
- results.append(f"- **Gradients (LoRA only):** {bytes_to_gb(lora_mem['gradients']):.3f} GB")
576
- results.append(f"- **Optimizer states:** {bytes_to_gb(lora_mem['optimizer']):.3f} GB")
577
- results.append(f"- **Activations:** {bytes_to_gb(lora_mem['activations']):.2f} GB")
578
  results.append(f"\n*Saves ~{(1-lora_mem['vs_full_finetune_ratio'])*100:.0f}% vs full fine-tuning*")
579
 
580
  chart_data = {
@@ -586,22 +486,18 @@ def calculate_vram(
586
  }
587
 
588
  else:
589
- # Inference mode
590
  framework_overhead = SERVING_FRAMEWORKS.get(serving_framework, 1.15)
591
  base_total = weights_gb + kv_gb
592
  overhead_gb = base_total * (framework_overhead - 1)
593
-
594
- # Flash Attention reduces activation memory overhead during inference
595
  if use_flash_attention and flash_savings:
596
  overhead_gb -= min(flash_savings["savings_gb"] * 0.1, overhead_gb * 0.5)
597
  overhead_gb = max(0, overhead_gb)
598
-
599
  total_gb = base_total + overhead_gb
600
 
601
- results.append(f"\n### Inference Memory ({serving_framework})")
602
- results.append(f"- **Weights:** {weights_gb:.2f} GB")
603
- results.append(f"- **KV Cache:** {kv_gb:.2f} GB")
604
- results.append(f"- **Framework overhead:** {overhead_gb:.2f} GB ({(framework_overhead-1)*100:.0f}%)")
605
 
606
  chart_data = {
607
  "Weights": weights_gb,
@@ -609,100 +505,76 @@ def calculate_vram(
609
  "Overhead": overhead_gb,
610
  }
611
 
612
- # Flash Attention info
613
  if use_flash_attention and flash_savings and flash_savings["savings_gb"] > 0.01:
614
- results.append(f"\n### Flash Attention")
615
- results.append(f"- **Enabled:** Yes")
616
- results.append(f"- **Peak memory savings:** ~{flash_savings['savings_gb']:.2f} GB ({flash_savings['savings_percent']:.1f}%)")
617
 
618
- results.append(f"\n### 📊 Total VRAM Required: **{total_gb:.2f} GB**")
619
 
620
- # Multi-GPU calculations
621
  if num_gpus > 1:
622
  multi_gpu = calculate_multi_gpu_split(total_gb, num_gpus, parallelism)
623
- results.append(f"\n### 🔗 Multi-GPU ({num_gpus}x GPUs, {parallelism})")
624
- results.append(f"- **Per GPU:** {multi_gpu['per_gpu']:.2f} GB")
625
- results.append(f"- **Total across GPUs:** {multi_gpu['total']:.2f} GB")
626
- results.append(f"- **Efficiency:** {multi_gpu['efficiency']}")
627
-
628
- # Update total for GPU recommendations
629
  effective_vram_needed = multi_gpu['per_gpu']
630
  else:
631
  effective_vram_needed = total_gb
632
 
633
- # GPU Recommendations
634
- results.append(f"\n### 🎮 GPU Recommendations")
635
  results.append("| GPU | VRAM | Fits? | Headroom | Est. tok/s | Instance |")
636
  results.append("|-----|------|-------|----------|------------|----------|")
637
 
638
  for gpu_name, (vram, instance, category, hourly_cost, tflops) in GPU_SPECS.items():
639
- fits = "" if vram >= effective_vram_needed else ""
640
  headroom = vram - effective_vram_needed
641
  headroom_str = f"+{headroom:.1f} GB" if headroom > 0 else f"{headroom:.1f} GB"
642
-
643
- # Estimate throughput for this GPU
644
  if show_throughput and vram >= effective_vram_needed:
645
  throughput = estimate_throughput(param_count, tflops, batch_size, context_length)
646
  tok_str = f"~{throughput['estimated_tokens_per_sec']:.0f}"
647
  else:
648
  tok_str = "-"
649
-
650
  results.append(f"| {gpu_name} | {vram} GB | {fits} | {headroom_str} | {tok_str} | {instance} |")
651
 
652
- # Quantization options (if model doesn't fit on consumer GPUs)
653
  if effective_vram_needed > 24:
654
- results.append(f"\n### 🗜️ Quantization Options")
655
- results.append("To fit on consumer GPUs (24 GB), consider these options:\n")
656
  results.append("| Method | Est. Size | Quality | Notes |")
657
  results.append("|--------|-----------|---------|-------|")
658
-
659
  for method, specs in QUANTIZATION_METHODS.items():
660
  quant_size = bytes_to_gb(param_count * specs["bytes_per_param"])
661
- quant_with_overhead = quant_size * 1.1 # Small overhead
662
- fits = "" if quant_with_overhead <= 24 else ""
663
- results.append(f"| {method} | {quant_with_overhead:.1f} GB | {specs['quality']} | {fits} {specs['desc']} |")
664
-
665
- results.append(f"\n**Tip:** Search for `{model_id.split('/')[-1]} GGUF` or `{model_id.split('/')[-1]} AWQ` on HuggingFace.")
666
 
667
- # Cost estimates for cloud GPUs
668
  if show_cost:
669
  cost_estimates = calculate_cost_estimate(effective_vram_needed)
670
  if cost_estimates:
671
- results.append(f"\n### 💰 Cloud Cost Estimates")
672
  results.append("*Based on 8 hrs/day, 22 days/month*\n")
673
  results.append("| GPU | Hourly | Daily | Monthly |")
674
  results.append("|-----|--------|-------|---------|")
675
- for est in cost_estimates[:5]: # Top 5 cheapest
676
  results.append(f"| {est['gpu']} | ${est['hourly']:.2f} | ${est['daily']:.2f} | ${est['monthly']:.0f} |")
677
 
678
- return "\n".join(results), chart_data
679
-
 
 
 
680
 
681
- def create_memory_chart(chart_data: dict | None):
682
- """Create a bar chart for memory breakdown."""
683
- if not chart_data:
684
- return None
685
 
686
- labels = list(chart_data.keys())
687
- values = list(chart_data.values())
688
 
689
- return gr.BarPlot(
690
- value={"Component": labels, "GB": values},
691
- x="Component",
692
- y="GB",
693
- title="Memory Breakdown",
694
- height=300,
695
- width=400,
696
- )
697
-
698
-
699
- def compare_models(model_ids_text: str, context_length: int = 4096) -> str:
700
  """Compare multiple models side by side."""
701
  model_ids = [m.strip() for m in model_ids_text.split("\n") if m.strip()]
702
 
703
  if len(model_ids) < 2:
704
  return "Please enter at least 2 model IDs (one per line)"
705
-
706
  if len(model_ids) > 5:
707
  return "Maximum 5 models for comparison"
708
 
@@ -716,32 +588,23 @@ def compare_models(model_ids_text: str, context_length: int = 4096) -> str:
716
  param_count, dominant_dtype = estimate_params_from_safetensors(info)
717
 
718
  if param_count == 0:
719
- comparison_data.append({
720
- "model": model_id,
721
- "params": "N/A",
722
- "error": "Could not determine parameters"
723
- })
724
  continue
725
 
726
  dtype_bytes = DTYPE_BYTES.get(dominant_dtype, 2)
727
  weights_gb = bytes_to_gb(param_count * dtype_bytes)
728
 
729
  num_layers = config.get("num_hidden_layers", config.get("n_layer", 0))
730
- num_kv_heads = config.get("num_key_value_heads",
731
- config.get("num_attention_heads", 0))
732
  head_dim = get_head_dim(config)
733
 
734
- kv_bytes = estimate_kv_cache_size(
735
- num_layers, num_kv_heads, head_dim, context_length, 1, dtype_bytes
736
- )
737
  kv_gb = bytes_to_gb(kv_bytes)
738
  total_inference = weights_gb + kv_gb
739
 
740
- # Training estimate
741
  training_mem = estimate_training_memory(param_count, dtype_bytes)
742
  training_gb = bytes_to_gb(training_mem["total_base"]) + weights_gb * 2
743
 
744
- # QLoRA estimate
745
  qlora_mem = estimate_lora_memory(param_count, dtype_bytes, 16, use_qlora=True)
746
  qlora_gb = bytes_to_gb(qlora_mem["total"])
747
 
@@ -749,20 +612,13 @@ def compare_models(model_ids_text: str, context_length: int = 4096) -> str:
749
  "model": model_id.split("/")[-1],
750
  "full_id": model_id,
751
  "params": f"{param_count/1e9:.1f}B",
752
- "dtype": dominant_dtype,
753
- "weights_gb": weights_gb,
754
- "kv_gb": kv_gb,
755
  "inference_gb": total_inference,
756
  "training_gb": training_gb,
757
  "qlora_gb": qlora_gb,
758
  })
759
  except Exception as e:
760
- comparison_data.append({
761
- "model": model_id,
762
- "error": str(e)
763
- })
764
 
765
- # Build comparison table
766
  results.append(f"*Context length: {context_length:,}*\n")
767
  results.append("| Model | Params | Inference | Training | QLoRA |")
768
  results.append("|-------|--------|-----------|----------|-------|")
@@ -773,21 +629,16 @@ def compare_models(model_ids_text: str, context_length: int = 4096) -> str:
773
  else:
774
  results.append(
775
  f"| [{data['model']}](https://huggingface.co/{data['full_id']}) | "
776
- f"{data['params']} | "
777
- f"{data['inference_gb']:.1f} GB | "
778
- f"{data['training_gb']:.1f} GB | "
779
- f"{data['qlora_gb']:.1f} GB |"
780
  )
781
 
782
- # Find minimum for each category
783
  valid_data = [d for d in comparison_data if "error" not in d]
784
  if len(valid_data) >= 2:
785
  results.append("\n### Recommendations")
786
-
787
  min_inference = min(valid_data, key=lambda x: x["inference_gb"])
788
  min_training = min(valid_data, key=lambda x: x["training_gb"])
789
  min_qlora = min(valid_data, key=lambda x: x["qlora_gb"])
790
-
791
  results.append(f"- **Best for inference:** {min_inference['model']} ({min_inference['inference_gb']:.1f} GB)")
792
  results.append(f"- **Best for training:** {min_training['model']} ({min_training['training_gb']:.1f} GB)")
793
  results.append(f"- **Best for QLoRA:** {min_qlora['model']} ({min_qlora['qlora_gb']:.1f} GB)")
@@ -795,7 +646,7 @@ def compare_models(model_ids_text: str, context_length: int = 4096) -> str:
795
  return "\n".join(results)
796
 
797
 
798
- def export_results(result_text: str, format_type: str) -> str:
799
  """Export results to different formats."""
800
  if not result_text:
801
  return "No results to export. Run a calculation first."
@@ -803,15 +654,8 @@ def export_results(result_text: str, format_type: str) -> str:
803
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
804
 
805
  if format_type == "JSON":
806
- # Parse markdown to create structured JSON
807
- import re
808
  lines = result_text.split("\n")
809
- data = {
810
- "timestamp": timestamp,
811
- "raw_markdown": result_text,
812
- "sections": {}
813
- }
814
-
815
  current_section = "header"
816
  for line in lines:
817
  if line.startswith("### "):
@@ -821,298 +665,222 @@ def export_results(result_text: str, format_type: str) -> str:
821
  if current_section not in data["sections"]:
822
  data["sections"][current_section] = []
823
  data["sections"][current_section].append(line.strip())
824
-
825
  return json.dumps(data, indent=2)
 
 
 
826
 
827
- else: # Plain text
828
- # Convert markdown to plain text
829
- plain = result_text
830
- plain = plain.replace("**", "")
831
- plain = plain.replace("###", "\n===")
832
- plain = plain.replace("##", "\n===")
833
- plain = f"VRAM Calculator Export - {timestamp}\n{'='*50}\n\n{plain}"
834
- return plain
835
 
 
 
 
 
 
 
 
 
836
 
837
- # Build Gradio interface
838
- with gr.Blocks(title="VRAM Calculator", theme=gr.themes.Soft()) as demo:
839
- gr.Markdown("""
840
- # VRAM & Instance Type Calculator
841
 
842
- Estimate GPU memory requirements for HuggingFace models. Supports inference, training, LoRA/QLoRA fine-tuning,
843
- multi-GPU setups, model comparison, and detailed quantization recommendations.
844
- """)
 
 
845
 
846
- with gr.Tabs():
847
- # === CALCULATOR TAB ===
848
- with gr.TabItem("Calculator"):
849
- with gr.Row():
850
- with gr.Column(scale=2):
851
- model_input = gr.Textbox(
852
- label="Model ID",
853
- placeholder="meta-llama/Llama-3.1-8B",
854
- info="Full HuggingFace model ID (org/model-name)"
855
- )
856
- with gr.Column(scale=1):
857
- search_input = gr.Textbox(
858
- label="Search Models",
859
- placeholder="llama 8b",
860
- info="Search HuggingFace for models"
861
- )
862
- search_btn = gr.Button("Search", size="sm")
863
-
864
- with gr.Row(visible=False) as search_results_row:
865
- search_results = gr.Dropdown(
866
- label="Search Results (click to select)",
867
- choices=[],
868
- interactive=True,
869
- )
870
 
871
- def do_search(query):
872
- if not query:
873
- return gr.update(visible=False), gr.update(choices=[])
874
- results = search_models(query, limit=10)
875
- if results:
876
- return gr.update(visible=True), gr.update(choices=results, value=results[0])
877
- return gr.update(visible=True), gr.update(choices=["No models found"], value=None)
878
-
879
- def select_model(selected):
880
- if selected and selected != "No models found":
881
- return selected
882
- return ""
883
-
884
- search_btn.click(
885
- fn=do_search,
886
- inputs=[search_input],
887
- outputs=[search_results_row, search_results]
888
  )
889
- search_results.change(
890
- fn=select_model,
891
- inputs=[search_results],
892
- outputs=[model_input]
 
893
  )
894
 
895
- with gr.Row():
896
- with gr.Column(scale=1):
897
- mode_input = gr.Radio(
898
- choices=["Inference", "Training (Full)", "LoRA Fine-tuning", "QLoRA Fine-tuning"],
899
- value="Inference",
900
- label="Mode",
901
- info="LoRA/QLoRA use significantly less memory"
902
- )
903
- with gr.Column(scale=1):
904
- context_input = gr.Slider(
905
- label="Context Length",
906
- minimum=512,
907
- maximum=131072,
908
- value=4096,
909
- step=512,
910
- info="Sequence length for KV cache"
911
- )
912
- with gr.Column(scale=1):
913
- batch_input = gr.Slider(
914
- label="Batch Size",
915
- minimum=1,
916
- maximum=64,
917
- value=1,
918
- step=1,
919
- info="Concurrent sequences"
920
- )
921
-
922
- with gr.Accordion("Advanced Options", open=False):
923
- with gr.Row():
924
- with gr.Column():
925
- serving_input = gr.Dropdown(
926
- choices=list(SERVING_FRAMEWORKS.keys()),
927
- value="None (raw PyTorch)",
928
- label="Serving Framework",
929
- info="Different frameworks have different overhead"
930
- )
931
- optimizer_input = gr.Dropdown(
932
- choices=["AdamW", "SGD", "SGD + Momentum", "8-bit Adam"],
933
- value="AdamW",
934
- label="Optimizer (Training mode)",
935
- info="Optimizer state memory varies"
936
- )
937
- lora_rank_input = gr.Slider(
938
- label="LoRA Rank",
939
- minimum=4,
940
- maximum=128,
941
- value=16,
942
- step=4,
943
- info="Higher rank = more capacity but more memory"
944
- )
945
- with gr.Column():
946
- num_gpus_input = gr.Slider(
947
- label="Number of GPUs",
948
- minimum=1,
949
- maximum=8,
950
- value=1,
951
- step=1,
952
- info="For multi-GPU setups"
953
- )
954
- parallelism_input = gr.Dropdown(
955
- choices=["Tensor Parallelism", "Pipeline Parallelism", "Data Parallelism"],
956
- value="Tensor Parallelism",
957
- label="Parallelism Strategy",
958
- info="How to distribute across GPUs"
959
- )
960
- flash_attention_input = gr.Checkbox(
961
- label="Use Flash Attention",
962
- value=True,
963
- info="Reduces peak memory usage"
964
- )
965
- with gr.Row():
966
- show_throughput_input = gr.Checkbox(
967
- label="Show Throughput Estimates",
968
- value=True,
969
- info="Estimated tokens/sec per GPU"
970
- )
971
- show_cost_input = gr.Checkbox(
972
- label="Show Cost Estimates",
973
- value=True,
974
- info="Cloud GPU hourly/monthly costs"
975
- )
976
-
977
- calculate_btn = gr.Button("Calculate VRAM", variant="primary", size="lg")
978
-
979
- with gr.Row():
980
- with gr.Column(scale=3):
981
- output = gr.Markdown(label="Results")
982
- with gr.Column(scale=1):
983
- chart_output = gr.BarPlot(
984
- x="Component",
985
- y="GB",
986
- title="Memory Breakdown",
987
- height=350,
988
- )
989
-
990
- def run_calculation(
991
- model_id, context_length, batch_size, mode, optimizer, serving,
992
- num_gpus, parallelism, flash_attention, lora_rank, show_throughput, show_cost
993
- ):
994
- result_text, chart_data = calculate_vram(
995
- model_id, context_length, batch_size, mode, optimizer, serving,
996
- num_gpus, parallelism, flash_attention, lora_rank, show_throughput, show_cost
997
- )
998
- if chart_data:
999
- import pandas as pd
1000
- df = pd.DataFrame({
1001
- "Component": list(chart_data.keys()),
1002
- "GB": list(chart_data.values())
1003
- })
1004
- return result_text, df
1005
- return result_text, None
1006
-
1007
- calculate_btn.click(
1008
- fn=run_calculation,
1009
- inputs=[
1010
- model_input, context_input, batch_input, mode_input,
1011
- optimizer_input, serving_input, num_gpus_input, parallelism_input,
1012
- flash_attention_input, lora_rank_input, show_throughput_input, show_cost_input
1013
- ],
1014
- outputs=[output, chart_output]
1015
  )
1016
 
1017
- # Examples
1018
- gr.Examples(
1019
- examples=[
1020
- ["meta-llama/Llama-3.1-8B", 4096, 1],
1021
- ["meta-llama/Llama-3.1-70B", 8192, 1],
1022
- ["mistralai/Mistral-7B-v0.1", 8192, 1],
1023
- ["Qwen/Qwen2.5-72B", 32768, 1],
1024
- ["google/gemma-2-27b", 8192, 1],
1025
- ["microsoft/phi-4", 16384, 1],
1026
- ["deepseek-ai/DeepSeek-V3", 4096, 1],
1027
- ["meta-llama/Llama-3.3-70B-Instruct", 8192, 1],
1028
- ],
1029
- inputs=[model_input, context_input, batch_input],
1030
- label="Popular Models"
1031
- )
1032
 
1033
- # === COMPARE TAB ===
1034
- with gr.TabItem("Compare Models"):
1035
- gr.Markdown("""
1036
- Compare VRAM requirements across multiple models side-by-side.
1037
- Enter model IDs one per line (2-5 models).
1038
- """)
1039
-
1040
- compare_models_input = gr.Textbox(
1041
- label="Model IDs (one per line)",
1042
- placeholder="meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B",
1043
- lines=5,
1044
  )
1045
- compare_context_input = gr.Slider(
1046
  label="Context Length",
1047
  minimum=512,
1048
  maximum=131072,
1049
  value=4096,
1050
- step=512,
1051
  )
1052
- compare_btn = gr.Button("Compare Models", variant="primary")
1053
- compare_output = gr.Markdown(label="Comparison Results")
1054
-
1055
- compare_btn.click(
1056
- fn=compare_models,
1057
- inputs=[compare_models_input, compare_context_input],
1058
- outputs=compare_output
1059
  )
1060
 
1061
- gr.Examples(
1062
- examples=[
1063
- ["meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B", 4096],
1064
- ["meta-llama/Llama-3.1-70B\nQwen/Qwen2.5-72B\nmeta-llama/Llama-3.3-70B-Instruct", 8192],
1065
- ],
1066
- inputs=[compare_models_input, compare_context_input],
1067
- label="Example Comparisons"
1068
- )
 
 
 
 
 
 
 
 
 
 
 
1069
 
1070
- # === EXPORT TAB ===
1071
- with gr.TabItem("Export"):
1072
- gr.Markdown("""
1073
- Export your calculation results to JSON or plain text format.
1074
- First run a calculation in the Calculator tab, then copy the results here.
1075
- """)
1076
-
1077
- export_input = gr.Textbox(
1078
- label="Paste Results Here",
1079
- placeholder="Paste the calculation results from the Calculator tab...",
1080
- lines=10,
1081
- )
1082
- export_format = gr.Radio(
1083
- choices=["JSON", "Plain Text"],
1084
- value="JSON",
1085
- label="Export Format"
1086
- )
1087
- export_btn = gr.Button("Export", variant="primary")
1088
- export_output = gr.Textbox(
1089
- label="Exported Data",
1090
- lines=15,
1091
- show_copy_button=True,
1092
- )
1093
 
1094
- export_btn.click(
1095
- fn=export_results,
1096
- inputs=[export_input, export_format],
1097
- outputs=export_output
 
 
 
 
 
 
 
 
 
 
1098
  )
1099
 
1100
- # Notes outside tabs
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1101
  gr.Markdown("""
1102
  ---
1103
- ### Notes
1104
- - **Inference mode:** Weights + KV cache + framework overhead
1105
- - **Training modes:** Full training, LoRA, and QLoRA with different memory profiles
1106
- - **KV cache:** Scales linearly with context length and batch size
1107
- - **Multi-GPU:** Tensor parallelism splits memory; data parallelism replicates it
1108
- - **Quantization:** GGUF/AWQ/GPTQ can reduce memory 2-8x with minimal quality loss
1109
-
1110
- ### Disclaimers
1111
- - Estimates are approximate; actual usage varies by implementation
1112
- - Flash Attention and other optimizations can reduce peak memory
1113
- - Throughput estimates assume ideal conditions
1114
-
1115
- Built with Gradio & HuggingFace Hub API
1116
  """)
1117
 
1118
 
 
16
  import gradio as gr
17
  from huggingface_hub import HfApi, hf_hub_download, list_models
18
  import json
19
+ import pandas as pd
20
  from functools import lru_cache
21
  from datetime import datetime
22
 
 
143
 
144
def get_head_dim(config: dict) -> int:
    """Return the per-attention-head dimension for a transformer config.

    Prefers an explicit ``head_dim`` entry when the config provides one,
    otherwise derives hidden_size // num_attention_heads (also checking
    the GPT-2-style ``n_embd`` / ``n_head`` key names), and finally falls
    back to 128, the most common head dimension across model families.
    """
    # Explicit value wins when the config ships one.
    try:
        return config["head_dim"]
    except KeyError:
        pass

    # Derive from model width and head count, covering legacy key names.
    width = config.get("hidden_size", config.get("n_embd", 0))
    heads = config.get("num_attention_heads", config.get("n_head", 0))
    return width // heads if width and heads else 128
 
 
153
 
154
 
155
  def estimate_kv_cache_size(
 
160
  batch_size: int = 1,
161
  dtype_bytes: int = 2
162
  ) -> int:
163
+ """KV cache size = 2 * num_layers * batch_size * context_length * num_kv_heads * head_dim * dtype_bytes"""
164
+ return 2 * num_layers * batch_size * context_length * num_kv_heads * head_dim * dtype_bytes
 
 
 
 
 
165
 
166
 
167
+ def estimate_training_memory(param_count: int, dtype_bytes: int, optimizer: str = "AdamW") -> dict:
168
+ """Estimate training memory requirements."""
 
 
 
 
 
 
 
 
 
 
 
 
169
  weights_bytes = param_count * dtype_bytes
170
  gradients_bytes = param_count * dtype_bytes
171
 
 
172
  if optimizer == "AdamW":
173
+ optimizer_bytes = param_count * 4 * 2
 
174
  elif optimizer == "SGD":
175
+ optimizer_bytes = 0
176
  elif optimizer == "SGD + Momentum":
177
+ optimizer_bytes = param_count * 4
178
  elif optimizer == "8-bit Adam":
179
+ optimizer_bytes = param_count * 1 * 2
180
  else:
181
+ optimizer_bytes = param_count * 4 * 2
182
 
183
  return {
184
  "weights": weights_bytes,
 
191
  def calculate_multi_gpu_split(total_vram_gb: float, num_gpus: int, parallelism: str) -> dict:
192
  """Calculate memory distribution across multiple GPUs."""
193
  if parallelism == "Tensor Parallelism":
 
194
  per_gpu = total_vram_gb / num_gpus
195
+ overhead = 0.05 * total_vram_gb
196
  return {
197
  "per_gpu": per_gpu + (overhead / num_gpus),
198
  "total": total_vram_gb + overhead,
199
  "efficiency": "High (best for inference)",
200
  }
201
  elif parallelism == "Pipeline Parallelism":
 
202
  per_gpu = total_vram_gb / num_gpus
203
+ overhead = 0.1 * total_vram_gb
204
  return {
205
  "per_gpu": per_gpu + (overhead / num_gpus),
206
  "total": total_vram_gb + overhead,
207
  "efficiency": "Medium (good for training)",
208
  }
209
+ else:
 
210
  return {
211
  "per_gpu": total_vram_gb,
212
  "total": total_vram_gb * num_gpus,
 
222
  target_modules: int = 4,
223
  use_qlora: bool = False
224
  ) -> dict:
225
+ """Estimate LoRA/QLoRA fine-tuning memory requirements."""
 
 
 
 
 
 
226
  if use_qlora:
227
+ base_weights_bytes = param_count * 0.5
 
228
  else:
229
  base_weights_bytes = param_count * dtype_bytes
230
 
231
+ lora_params_ratio = (lora_rank * 2 * target_modules) / 1000
232
+ lora_params = int(param_count * lora_params_ratio * 0.01)
 
 
 
 
 
233
  lora_weights_bytes = lora_params * dtype_bytes
 
 
234
  gradients_bytes = lora_params * dtype_bytes
235
+ optimizer_bytes = lora_params * 4 * 2
236
+ activation_bytes = base_weights_bytes * 0.5
 
 
 
 
237
 
238
  return {
239
  "base_weights": base_weights_bytes,
 
243
  "optimizer": optimizer_bytes,
244
  "activations": activation_bytes,
245
  "total": base_weights_bytes + lora_weights_bytes + gradients_bytes + optimizer_bytes + activation_bytes,
246
+ "vs_full_finetune_ratio": 0.3 if use_qlora else 0.5,
247
  }
248
 
249
 
 
254
  context_length: int = 4096,
255
  is_prefill: bool = False
256
  ) -> dict:
257
+ """Estimate tokens per second throughput."""
 
 
 
 
 
 
258
  flops_per_token = 2 * param_count
 
 
259
  peak_tokens_per_sec = (gpu_tflops * 1e12) / flops_per_token
260
+ memory_bandwidth_tbs = 1.0
261
+ bytes_per_token = param_count * 2
 
 
 
262
  memory_bound_tokens = (memory_bandwidth_tbs * 1e12) / bytes_per_token
263
 
 
264
  if is_prefill:
265
  effective_tokens = min(peak_tokens_per_sec, memory_bound_tokens * 10) * batch_size
266
  else:
267
  effective_tokens = min(peak_tokens_per_sec, memory_bound_tokens) * batch_size
268
 
 
269
  efficiency = 0.4
270
  realistic_tokens = effective_tokens * efficiency
271
 
 
278
  }
279
 
280
 
281
+ def calculate_cost_estimate(vram_required: float, hours_per_day: float = 8, days_per_month: float = 22) -> list:
 
 
 
 
282
  """Calculate cost estimates for cloud GPUs that fit the model."""
283
  estimates = []
 
284
  for gpu_name, (vram, instance, category, hourly_cost, tflops) in GPU_SPECS.items():
285
  if vram >= vram_required and hourly_cost > 0:
286
  daily_cost = hourly_cost * hours_per_day
 
293
  "monthly": monthly_cost,
294
  "instance": instance,
295
  })
 
296
  return sorted(estimates, key=lambda x: x["hourly"])
297
 
298
 
299
+ def search_models_fn(query: str) -> list:
300
  """Search HuggingFace models by name."""
301
  if not query or len(query) < 2:
302
  return []
 
303
  try:
304
  models = list(list_models(
305
  search=query,
306
  sort="downloads",
307
  direction=-1,
308
+ limit=10,
309
  filter="text-generation"
310
  ))
311
  return [m.id for m in models]
 
313
  return []
314
 
315
 
316
+ def calculate_flash_attention_savings(kv_cache_bytes: int, context_length: int) -> dict:
317
+ """Estimate memory savings from Flash Attention."""
318
+ standard_attention_overhead = context_length * context_length * 2
319
+ flash_attention_overhead = context_length * 128 * 2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
320
  savings_bytes = standard_attention_overhead - flash_attention_overhead
321
  savings_ratio = 1 - (flash_attention_overhead / max(standard_attention_overhead, 1))
322
 
 
341
  lora_rank: int = 16,
342
  show_throughput: bool = True,
343
  show_cost: bool = True
344
+ ):
345
+ """Main calculation function. Returns (markdown_results, chart_dataframe)."""
 
 
346
  model_id = model_id.strip()
347
  if not model_id:
348
  raise gr.Error("Please enter a model ID")
 
349
  if "/" not in model_id:
350
  raise gr.Error("Model ID should be in format 'organization/model-name'")
351
 
 
352
  info = get_model_info(model_id)
353
  config = get_config(model_id)
354
 
355
  results = []
356
  results.append(f"## Model: [{model_id}](https://huggingface.co/{model_id})\n")
357
 
 
358
  param_count, dominant_dtype = estimate_params_from_safetensors(info)
359
 
360
  if param_count == 0:
361
+ results.append("Could not determine parameter count from safetensors metadata.\n")
362
  results.append("Model may use pytorch_model.bin or other format.\n")
363
  return "\n".join(results), None
364
 
 
369
  results.append(f"**Dominant dtype:** {dominant_dtype} ({dtype_bytes} bytes/param)")
370
  results.append(f"**Mode:** {mode}")
371
 
 
372
  weights_bytes = param_count * dtype_bytes
373
  weights_gb = bytes_to_gb(weights_bytes)
374
+ results.append(f"\n### Weight Memory")
375
  results.append(f"Model weights: **{weights_gb:.2f} GB**")
376
 
 
377
  num_layers = config.get("num_hidden_layers", config.get("n_layer", 0))
378
  hidden_size = config.get("hidden_size", config.get("n_embd", 0))
379
  num_attention_heads = config.get("num_attention_heads", config.get("n_head", 0))
 
381
  head_dim = get_head_dim(config)
382
  max_position = config.get("max_position_embeddings", config.get("n_positions", "N/A"))
383
 
384
+ results.append(f"\n### Architecture")
385
  if "_error" in config:
386
+ results.append(f"Could not fetch config.json (model may be gated)")
387
  kv_gb = 0
388
  elif num_layers and hidden_size:
389
+ results.append(f"- Layers: {num_layers}")
390
+ results.append(f"- Hidden size: {hidden_size}")
391
+ results.append(f"- Attention heads: {num_attention_heads}")
392
+ results.append(f"- KV heads: {num_kv_heads} {'(GQA)' if num_kv_heads != num_attention_heads else '(MHA)'}")
393
+ results.append(f"- Head dimension: {head_dim}")
394
+ if isinstance(max_position, int):
395
+ results.append(f"- Max context: {max_position:,}")
396
+ else:
397
+ results.append(f"- Max context: {max_position}")
398
+
399
+ results.append(f"\n### KV Cache (batch_size={batch_size})")
400
  results.append("| Context | KV Cache | + Weights | Status |")
401
  results.append("|---------|----------|-----------|--------|")
402
 
 
403
  context_points = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
404
  for ctx_len in context_points:
405
  if ctx_len > context_length * 2 and ctx_len > 8192:
406
  break
407
+ kv_bytes = estimate_kv_cache_size(num_layers, num_kv_heads, head_dim, ctx_len, batch_size, dtype_bytes)
 
 
408
  kv_gb_temp = bytes_to_gb(kv_bytes)
409
  total_temp = weights_gb + kv_gb_temp
410
+ marker = " **<- selected**" if ctx_len == context_length else ""
411
  results.append(f"| {ctx_len:,} | {kv_gb_temp:.2f} GB | {total_temp:.2f} GB |{marker} |")
412
 
413
+ kv_bytes = estimate_kv_cache_size(num_layers, num_kv_heads, head_dim, context_length, batch_size, dtype_bytes)
 
 
 
414
  kv_gb = bytes_to_gb(kv_bytes)
415
  else:
416
  results.append("Could not find architecture details")
417
  kv_gb = 0
418
 
 
419
  flash_savings = None
420
  if use_flash_attention and kv_gb > 0:
421
+ kv_bytes = estimate_kv_cache_size(num_layers, num_kv_heads, head_dim, context_length, batch_size, dtype_bytes)
 
 
422
  flash_savings = calculate_flash_attention_savings(kv_bytes, context_length)
423
 
 
424
  if mode == "Training (Full)":
425
  training_mem = estimate_training_memory(param_count, dtype_bytes, optimizer)
426
  base_gb = bytes_to_gb(training_mem["total_base"])
 
 
427
  activation_gb = weights_gb * 2 * batch_size
428
  if use_flash_attention and flash_savings:
429
  activation_gb -= flash_savings["savings_gb"]
430
  activation_gb = max(0.1, activation_gb)
 
431
  total_gb = base_gb + kv_gb + activation_gb
432
 
433
+ results.append(f"\n### Training Memory Breakdown")
434
+ results.append(f"- Weights: {weights_gb:.2f} GB")
435
+ results.append(f"- Gradients: {bytes_to_gb(training_mem['gradients']):.2f} GB")
436
+ results.append(f"- Optimizer ({optimizer}): {bytes_to_gb(training_mem['optimizer']):.2f} GB")
437
+ results.append(f"- KV Cache: {kv_gb:.2f} GB")
438
+ results.append(f"- Activations (est.): {activation_gb:.2f} GB")
439
 
440
  chart_data = {
441
  "Weights": weights_gb,
 
449
  lora_mem = estimate_lora_memory(param_count, dtype_bytes, lora_rank, use_qlora=False)
450
  total_gb = bytes_to_gb(lora_mem["total"])
451
 
452
+ results.append(f"\n### LoRA Fine-tuning (rank={lora_rank})")
453
+ results.append(f"- Base weights (frozen): {bytes_to_gb(lora_mem['base_weights']):.2f} GB")
454
+ results.append(f"- LoRA adapters: {bytes_to_gb(lora_mem['lora_weights']):.3f} GB ({lora_mem['lora_params']:,} params)")
455
+ results.append(f"- Gradients (LoRA only): {bytes_to_gb(lora_mem['gradients']):.3f} GB")
456
+ results.append(f"- Optimizer states: {bytes_to_gb(lora_mem['optimizer']):.3f} GB")
457
+ results.append(f"- Activations: {bytes_to_gb(lora_mem['activations']):.2f} GB")
458
  results.append(f"\n*Saves ~{(1-lora_mem['vs_full_finetune_ratio'])*100:.0f}% vs full fine-tuning*")
459
 
460
  chart_data = {
 
469
  lora_mem = estimate_lora_memory(param_count, dtype_bytes, lora_rank, use_qlora=True)
470
  total_gb = bytes_to_gb(lora_mem["total"])
471
 
472
+ results.append(f"\n### QLoRA Fine-tuning (4-bit base, rank={lora_rank})")
473
+ results.append(f"- Base weights (4-bit): {bytes_to_gb(lora_mem['base_weights']):.2f} GB")
474
+ results.append(f"- LoRA adapters: {bytes_to_gb(lora_mem['lora_weights']):.3f} GB ({lora_mem['lora_params']:,} params)")
475
+ results.append(f"- Gradients (LoRA only): {bytes_to_gb(lora_mem['gradients']):.3f} GB")
476
+ results.append(f"- Optimizer states: {bytes_to_gb(lora_mem['optimizer']):.3f} GB")
477
+ results.append(f"- Activations: {bytes_to_gb(lora_mem['activations']):.2f} GB")
478
  results.append(f"\n*Saves ~{(1-lora_mem['vs_full_finetune_ratio'])*100:.0f}% vs full fine-tuning*")
479
 
480
  chart_data = {
 
486
  }
487
 
488
  else:
 
489
  framework_overhead = SERVING_FRAMEWORKS.get(serving_framework, 1.15)
490
  base_total = weights_gb + kv_gb
491
  overhead_gb = base_total * (framework_overhead - 1)
 
 
492
  if use_flash_attention and flash_savings:
493
  overhead_gb -= min(flash_savings["savings_gb"] * 0.1, overhead_gb * 0.5)
494
  overhead_gb = max(0, overhead_gb)
 
495
  total_gb = base_total + overhead_gb
496
 
497
+ results.append(f"\n### Inference Memory ({serving_framework})")
498
+ results.append(f"- Weights: {weights_gb:.2f} GB")
499
+ results.append(f"- KV Cache: {kv_gb:.2f} GB")
500
+ results.append(f"- Framework overhead: {overhead_gb:.2f} GB ({(framework_overhead-1)*100:.0f}%)")
501
 
502
  chart_data = {
503
  "Weights": weights_gb,
 
505
  "Overhead": overhead_gb,
506
  }
507
 
 
508
  if use_flash_attention and flash_savings and flash_savings["savings_gb"] > 0.01:
509
+ results.append(f"\n### Flash Attention")
510
+ results.append(f"- Enabled: Yes")
511
+ results.append(f"- Peak memory savings: ~{flash_savings['savings_gb']:.2f} GB ({flash_savings['savings_percent']:.1f}%)")
512
 
513
+ results.append(f"\n### Total VRAM Required: **{total_gb:.2f} GB**")
514
 
 
515
  if num_gpus > 1:
516
  multi_gpu = calculate_multi_gpu_split(total_gb, num_gpus, parallelism)
517
+ results.append(f"\n### Multi-GPU ({num_gpus}x GPUs, {parallelism})")
518
+ results.append(f"- Per GPU: {multi_gpu['per_gpu']:.2f} GB")
519
+ results.append(f"- Total across GPUs: {multi_gpu['total']:.2f} GB")
520
+ results.append(f"- Efficiency: {multi_gpu['efficiency']}")
 
 
521
  effective_vram_needed = multi_gpu['per_gpu']
522
  else:
523
  effective_vram_needed = total_gb
524
 
525
+ results.append(f"\n### GPU Recommendations")
 
526
  results.append("| GPU | VRAM | Fits? | Headroom | Est. tok/s | Instance |")
527
  results.append("|-----|------|-------|----------|------------|----------|")
528
 
529
  for gpu_name, (vram, instance, category, hourly_cost, tflops) in GPU_SPECS.items():
530
+ fits = "Yes" if vram >= effective_vram_needed else "No"
531
  headroom = vram - effective_vram_needed
532
  headroom_str = f"+{headroom:.1f} GB" if headroom > 0 else f"{headroom:.1f} GB"
 
 
533
  if show_throughput and vram >= effective_vram_needed:
534
  throughput = estimate_throughput(param_count, tflops, batch_size, context_length)
535
  tok_str = f"~{throughput['estimated_tokens_per_sec']:.0f}"
536
  else:
537
  tok_str = "-"
 
538
  results.append(f"| {gpu_name} | {vram} GB | {fits} | {headroom_str} | {tok_str} | {instance} |")
539
 
 
540
  if effective_vram_needed > 24:
541
+ results.append(f"\n### Quantization Options")
542
+ results.append("To fit on consumer GPUs (24 GB or less), consider:\n")
543
  results.append("| Method | Est. Size | Quality | Notes |")
544
  results.append("|--------|-----------|---------|-------|")
 
545
  for method, specs in QUANTIZATION_METHODS.items():
546
  quant_size = bytes_to_gb(param_count * specs["bytes_per_param"])
547
+ quant_with_overhead = quant_size * 1.1
548
+ fits = "Yes" if quant_with_overhead <= 24 else "No"
549
+ results.append(f"| {method} | {quant_with_overhead:.1f} GB | {specs['quality']} | {fits} - {specs['desc']} |")
550
+ model_name = model_id.split('/')[-1]
551
+ results.append(f"\n**Tip:** Search for `{model_name} GGUF` or `{model_name} AWQ` on HuggingFace.")
552
 
 
553
  if show_cost:
554
  cost_estimates = calculate_cost_estimate(effective_vram_needed)
555
  if cost_estimates:
556
+ results.append(f"\n### Cloud Cost Estimates")
557
  results.append("*Based on 8 hrs/day, 22 days/month*\n")
558
  results.append("| GPU | Hourly | Daily | Monthly |")
559
  results.append("|-----|--------|-------|---------|")
560
+ for est in cost_estimates[:5]:
561
  results.append(f"| {est['gpu']} | ${est['hourly']:.2f} | ${est['daily']:.2f} | ${est['monthly']:.0f} |")
562
 
563
+ # Create DataFrame for chart
564
+ df = pd.DataFrame({
565
+ "Component": list(chart_data.keys()),
566
+ "GB": list(chart_data.values())
567
+ })
568
 
569
+ return "\n".join(results), df
 
 
 
570
 
 
 
571
 
572
+ def compare_models_fn(model_ids_text: str, context_length: int = 4096) -> str:
 
 
 
 
 
 
 
 
 
 
573
  """Compare multiple models side by side."""
574
  model_ids = [m.strip() for m in model_ids_text.split("\n") if m.strip()]
575
 
576
  if len(model_ids) < 2:
577
  return "Please enter at least 2 model IDs (one per line)"
 
578
  if len(model_ids) > 5:
579
  return "Maximum 5 models for comparison"
580
 
 
588
  param_count, dominant_dtype = estimate_params_from_safetensors(info)
589
 
590
  if param_count == 0:
591
+ comparison_data.append({"model": model_id, "error": "Could not determine parameters"})
 
 
 
 
592
  continue
593
 
594
  dtype_bytes = DTYPE_BYTES.get(dominant_dtype, 2)
595
  weights_gb = bytes_to_gb(param_count * dtype_bytes)
596
 
597
  num_layers = config.get("num_hidden_layers", config.get("n_layer", 0))
598
+ num_kv_heads = config.get("num_key_value_heads", config.get("num_attention_heads", 0))
 
599
  head_dim = get_head_dim(config)
600
 
601
+ kv_bytes = estimate_kv_cache_size(num_layers, num_kv_heads, head_dim, context_length, 1, dtype_bytes)
 
 
602
  kv_gb = bytes_to_gb(kv_bytes)
603
  total_inference = weights_gb + kv_gb
604
 
 
605
  training_mem = estimate_training_memory(param_count, dtype_bytes)
606
  training_gb = bytes_to_gb(training_mem["total_base"]) + weights_gb * 2
607
 
 
608
  qlora_mem = estimate_lora_memory(param_count, dtype_bytes, 16, use_qlora=True)
609
  qlora_gb = bytes_to_gb(qlora_mem["total"])
610
 
 
612
  "model": model_id.split("/")[-1],
613
  "full_id": model_id,
614
  "params": f"{param_count/1e9:.1f}B",
 
 
 
615
  "inference_gb": total_inference,
616
  "training_gb": training_gb,
617
  "qlora_gb": qlora_gb,
618
  })
619
  except Exception as e:
620
+ comparison_data.append({"model": model_id, "error": str(e)})
 
 
 
621
 
 
622
  results.append(f"*Context length: {context_length:,}*\n")
623
  results.append("| Model | Params | Inference | Training | QLoRA |")
624
  results.append("|-------|--------|-----------|----------|-------|")
 
629
  else:
630
  results.append(
631
  f"| [{data['model']}](https://huggingface.co/{data['full_id']}) | "
632
+ f"{data['params']} | {data['inference_gb']:.1f} GB | "
633
+ f"{data['training_gb']:.1f} GB | {data['qlora_gb']:.1f} GB |"
 
 
634
  )
635
 
 
636
  valid_data = [d for d in comparison_data if "error" not in d]
637
  if len(valid_data) >= 2:
638
  results.append("\n### Recommendations")
 
639
  min_inference = min(valid_data, key=lambda x: x["inference_gb"])
640
  min_training = min(valid_data, key=lambda x: x["training_gb"])
641
  min_qlora = min(valid_data, key=lambda x: x["qlora_gb"])
 
642
  results.append(f"- **Best for inference:** {min_inference['model']} ({min_inference['inference_gb']:.1f} GB)")
643
  results.append(f"- **Best for training:** {min_training['model']} ({min_training['training_gb']:.1f} GB)")
644
  results.append(f"- **Best for QLoRA:** {min_qlora['model']} ({min_qlora['qlora_gb']:.1f} GB)")
 
646
  return "\n".join(results)
647
 
648
 
649
+ def export_results_fn(result_text: str, format_type: str) -> str:
650
  """Export results to different formats."""
651
  if not result_text:
652
  return "No results to export. Run a calculation first."
 
654
  timestamp = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
655
 
656
  if format_type == "JSON":
 
 
657
  lines = result_text.split("\n")
658
+ data = {"timestamp": timestamp, "raw_markdown": result_text, "sections": {}}
 
 
 
 
 
659
  current_section = "header"
660
  for line in lines:
661
  if line.startswith("### "):
 
665
  if current_section not in data["sections"]:
666
  data["sections"][current_section] = []
667
  data["sections"][current_section].append(line.strip())
 
668
  return json.dumps(data, indent=2)
669
+ else:
670
+ plain = result_text.replace("**", "").replace("###", "\n===").replace("##", "\n===")
671
+ return f"VRAM Calculator Export - {timestamp}\n{'='*50}\n\n{plain}"
672
 
 
 
 
 
 
 
 
 
673
 
674
def do_search(query: str):
    """Search HuggingFace for models and populate the results dropdown.

    Returns a ``gr.update`` for the search-results Dropdown: the matching
    model IDs with the top hit preselected, a "No models found"
    placeholder when the search comes back empty, or a cleared dropdown
    for a blank query.
    """
    if not query:
        # Nothing to search for — clear any stale results.
        return gr.update(choices=[], value=None)
    matches = search_models_fn(query)
    if not matches:
        # Placeholder entry; select_from_search() filters it back out.
        return gr.update(choices=["No models found"], value=None)
    return gr.update(choices=matches, value=matches[0])
682
 
 
 
 
 
683
 
684
def select_from_search(selected: str) -> str:
    """Copy a model chosen in the search dropdown into the Model ID box.

    The "No models found" placeholder (and an empty/None selection) map
    to an empty string so they can never end up as a model ID.
    """
    is_real_choice = bool(selected) and selected != "No models found"
    return selected if is_real_choice else ""
689
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
690
 
691
+ # Build Gradio interface
692
+ with gr.Blocks(title="VRAM Calculator", theme=gr.themes.Soft()) as demo:
693
+ gr.Markdown("# VRAM & Instance Type Calculator")
694
+ gr.Markdown("Estimate GPU memory requirements for HuggingFace models.")
695
+
696
+ with gr.Tab("Calculator"):
697
+ with gr.Row():
698
+ model_input = gr.Textbox(
699
+ label="Model ID",
700
+ placeholder="meta-llama/Llama-3.1-8B",
701
+ info="Full HuggingFace model ID (org/model-name)",
702
+ scale=2
 
 
 
 
 
703
  )
704
+ search_input = gr.Textbox(
705
+ label="Search Models",
706
+ placeholder="llama 8b",
707
+ info="Search HuggingFace",
708
+ scale=1
709
  )
710
 
711
+ with gr.Row():
712
+ search_btn = gr.Button("Search HuggingFace", scale=1)
713
+ search_results = gr.Dropdown(
714
+ label="Search Results",
715
+ choices=[],
716
+ interactive=True,
717
+ scale=2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
718
  )
719
 
720
+ search_btn.click(fn=do_search, inputs=[search_input], outputs=[search_results])
721
+ search_results.change(fn=select_from_search, inputs=[search_results], outputs=[model_input])
 
 
 
 
 
 
 
 
 
 
 
 
 
722
 
723
+ with gr.Row():
724
+ mode_input = gr.Radio(
725
+ choices=["Inference", "Training (Full)", "LoRA Fine-tuning", "QLoRA Fine-tuning"],
726
+ value="Inference",
727
+ label="Mode"
 
 
 
 
 
 
728
  )
729
+ context_input = gr.Slider(
730
  label="Context Length",
731
  minimum=512,
732
  maximum=131072,
733
  value=4096,
734
+ step=512
735
  )
736
+ batch_input = gr.Slider(
737
+ label="Batch Size",
738
+ minimum=1,
739
+ maximum=64,
740
+ value=1,
741
+ step=1
 
742
  )
743
 
744
+ with gr.Accordion("Advanced Options", open=False):
745
+ with gr.Row():
746
+ serving_input = gr.Dropdown(
747
+ choices=list(SERVING_FRAMEWORKS.keys()),
748
+ value="None (raw PyTorch)",
749
+ label="Serving Framework"
750
+ )
751
+ optimizer_input = gr.Dropdown(
752
+ choices=["AdamW", "SGD", "SGD + Momentum", "8-bit Adam"],
753
+ value="AdamW",
754
+ label="Optimizer (Training mode)"
755
+ )
756
+ lora_rank_input = gr.Slider(
757
+ label="LoRA Rank",
758
+ minimum=4,
759
+ maximum=128,
760
+ value=16,
761
+ step=4
762
+ )
763
 
764
+ with gr.Row():
765
+ num_gpus_input = gr.Slider(
766
+ label="Number of GPUs",
767
+ minimum=1,
768
+ maximum=8,
769
+ value=1,
770
+ step=1
771
+ )
772
+ parallelism_input = gr.Dropdown(
773
+ choices=["Tensor Parallelism", "Pipeline Parallelism", "Data Parallelism"],
774
+ value="Tensor Parallelism",
775
+ label="Parallelism Strategy"
776
+ )
777
+ flash_attention_input = gr.Checkbox(
778
+ label="Use Flash Attention",
779
+ value=True
780
+ )
 
 
 
 
 
 
781
 
782
+ with gr.Row():
783
+ show_throughput_input = gr.Checkbox(label="Show Throughput Estimates", value=True)
784
+ show_cost_input = gr.Checkbox(label="Show Cost Estimates", value=True)
785
+
786
+ calculate_btn = gr.Button("Calculate VRAM", variant="primary", size="lg")
787
+
788
+ with gr.Row():
789
+ output = gr.Markdown(label="Results")
790
+ chart_output = gr.BarPlot(
791
+ x="Component",
792
+ y="GB",
793
+ title="Memory Breakdown",
794
+ height=350,
795
+ width=400
796
  )
797
 
798
+ calculate_btn.click(
799
+ fn=calculate_vram,
800
+ inputs=[
801
+ model_input, context_input, batch_input, mode_input,
802
+ optimizer_input, serving_input, num_gpus_input, parallelism_input,
803
+ flash_attention_input, lora_rank_input, show_throughput_input, show_cost_input
804
+ ],
805
+ outputs=[output, chart_output]
806
+ )
807
+
808
+ gr.Markdown("### Popular Models")
809
+ gr.Examples(
810
+ examples=[
811
+ ["meta-llama/Llama-3.1-8B", 4096, 1],
812
+ ["meta-llama/Llama-3.1-70B", 8192, 1],
813
+ ["mistralai/Mistral-7B-v0.1", 8192, 1],
814
+ ["Qwen/Qwen2.5-72B", 32768, 1],
815
+ ["google/gemma-2-27b", 8192, 1],
816
+ ["microsoft/phi-4", 16384, 1],
817
+ ],
818
+ inputs=[model_input, context_input, batch_input],
819
+ )
820
+
821
+ with gr.Tab("Compare Models"):
822
+ gr.Markdown("Compare VRAM requirements across multiple models. Enter model IDs one per line (2-5 models).")
823
+
824
+ compare_models_input = gr.Textbox(
825
+ label="Model IDs (one per line)",
826
+ placeholder="meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B",
827
+ lines=5,
828
+ )
829
+ compare_context_input = gr.Slider(
830
+ label="Context Length",
831
+ minimum=512,
832
+ maximum=131072,
833
+ value=4096,
834
+ step=512,
835
+ )
836
+ compare_btn = gr.Button("Compare Models", variant="primary")
837
+ compare_output = gr.Markdown(label="Comparison Results")
838
+
839
+ compare_btn.click(
840
+ fn=compare_models_fn,
841
+ inputs=[compare_models_input, compare_context_input],
842
+ outputs=[compare_output]
843
+ )
844
+
845
+ gr.Markdown("### Example Comparisons")
846
+ gr.Examples(
847
+ examples=[
848
+ ["meta-llama/Llama-3.1-8B\nmistralai/Mistral-7B-v0.1\nQwen/Qwen2.5-7B", 4096],
849
+ ["meta-llama/Llama-3.1-70B\nQwen/Qwen2.5-72B", 8192],
850
+ ],
851
+ inputs=[compare_models_input, compare_context_input],
852
+ )
853
+
854
+ with gr.Tab("Export"):
855
+ gr.Markdown("Export calculation results to JSON or plain text. Copy results from Calculator tab.")
856
+
857
+ export_input = gr.Textbox(
858
+ label="Paste Results Here",
859
+ placeholder="Paste the calculation results...",
860
+ lines=10,
861
+ )
862
+ export_format = gr.Radio(
863
+ choices=["JSON", "Plain Text"],
864
+ value="JSON",
865
+ label="Export Format"
866
+ )
867
+ export_btn = gr.Button("Export", variant="primary")
868
+ export_output = gr.Textbox(
869
+ label="Exported Data",
870
+ lines=15,
871
+ show_copy_button=True,
872
+ )
873
+
874
+ export_btn.click(
875
+ fn=export_results_fn,
876
+ inputs=[export_input, export_format],
877
+ outputs=[export_output]
878
+ )
879
+
880
  gr.Markdown("""
881
  ---
882
+ **Notes:** Estimates are approximate. Flash Attention and other optimizations can reduce peak memory.
883
+ Throughput estimates assume ideal conditions. Built with Gradio & HuggingFace Hub API.
 
 
 
 
 
 
 
 
 
 
 
884
  """)
885
 
886