Livengood Claude committed

Commit 50bc6be · 1 Parent(s): 6f1d86c

Add training mode, multi-GPU support, expanded GPU database, quantization breakdown, and visual memory chart

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>

Files changed (2):
  1. app.py +436 -134
  2. requirements.txt +1 -0
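Before the diff, a quick check of the formula it is built around. The KV-cache rule in `estimate_kv_cache_size` below is easy to verify by hand; this is a minimal sketch, with input values assumed from Llama-3.1-8B's published config (32 layers, 8 KV heads under GQA, head_dim 128) at FP16:

```python
# Hedged sanity check of the KV-cache formula used in app.py below.
# Assumed inputs from Llama-3.1-8B's config.json: 32 layers,
# 8 key/value heads (GQA), head_dim 128, FP16 cache (2 bytes/element).
num_layers, num_kv_heads, head_dim = 32, 8, 128
batch_size, context_length, dtype_bytes = 1, 4096, 2

# The leading 2 accounts for storing both the K and the V cache.
kv_bytes = 2 * num_layers * batch_size * context_length * num_kv_heads * head_dim * dtype_bytes
print(kv_bytes / 1024**3)  # 0.5 GiB at 4K context
```

The figure doubles with every doubling of context length or batch size, which is why the app tabulates it across context lengths.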
app.py CHANGED

@@ -2,28 +2,45 @@
 VRAM & Instance Type Calculator for HuggingFace Models
 
 Fetches model metadata from HF Hub and calculates:
-- Minimum VRAM required for inference
+- Minimum VRAM required for inference and training
 - KV cache requirements at various context lengths
 - Recommended GPUs and cloud instances
+- Multi-GPU tensor parallelism estimates
+- Quantization options with detailed breakdown
 """
 
 import gradio as gr
 from huggingface_hub import HfApi, hf_hub_download
 import json
-import math
+from functools import lru_cache
 
 # Initialize HF API client
 api = HfApi()
 
-# GPU specs: name -> (VRAM in GB, typical cloud instance)
+# GPU specs: name -> (VRAM in GB, typical cloud instance, category)
 GPU_SPECS = {
-    "RTX 3090": (24, "Consumer"),
-    "RTX 4090": (24, "Consumer"),
-    "A10G": (24, "AWS g5.xlarge (~$1/hr)"),
-    "L4": (24, "GCP g2-standard-4 (~$0.70/hr)"),
-    "A100 40GB": (40, "AWS p4d.24xlarge, GCP a2-highgpu-1g"),
-    "A100 80GB": (80, "AWS p4de.24xlarge, GCP a2-ultragpu-1g"),
-    "H100 80GB": (80, "AWS p5.48xlarge, GCP a3-highgpu-8g"),
+    # Consumer GPUs
+    "RTX 3080": (10, "Consumer", "consumer"),
+    "RTX 3090": (24, "Consumer", "consumer"),
+    "RTX 4080": (16, "Consumer", "consumer"),
+    "RTX 4090": (24, "Consumer", "consumer"),
+    "RTX 5090": (32, "Consumer (est.)", "consumer"),
+    # Apple Silicon
+    "M2 Ultra": (192, "Mac Studio (Unified)", "apple"),
+    "M3 Max": (128, "MacBook Pro (Unified)", "apple"),
+    "M4 Max": (128, "MacBook Pro (Unified)", "apple"),
+    # Workstation GPUs
+    "RTX A6000": (48, "Workstation", "workstation"),
+    "L40S": (48, "AWS g6.xlarge (~$1.00/hr)", "cloud"),
+    # Cloud GPUs
+    "A10G": (24, "AWS g5.xlarge (~$1.00/hr)", "cloud"),
+    "L4": (24, "GCP g2-standard-4 (~$0.70/hr)", "cloud"),
+    "A100 40GB": (40, "AWS p4d, GCP a2-highgpu-1g (~$3/hr)", "cloud"),
+    "A100 80GB": (80, "AWS p4de, GCP a2-ultragpu-1g (~$5/hr)", "cloud"),
+    "H100 80GB": (80, "AWS p5, GCP a3-highgpu (~$8/hr)", "cloud"),
+    "H200 141GB": (141, "Coming soon (~$12/hr est.)", "cloud"),
+    # AMD GPUs
+    "MI300X": (192, "AMD Cloud Instances", "amd"),
 }
 
 # Bytes per element for different dtypes

@@ -38,36 +55,79 @@ DTYPE_BYTES = {
     "I64": 8, "int64": 8,
 }
 
+# Serving framework overhead multipliers
+SERVING_FRAMEWORKS = {
+    "None (raw PyTorch)": 1.20,
+    "vLLM": 1.10,
+    "TGI (Text Generation Inference)": 1.15,
+    "llama.cpp": 1.05,
+    "Transformers (HuggingFace)": 1.25,
+    "Ollama": 1.08,
+}
+
+# Quantization methods with their characteristics
+QUANTIZATION_METHODS = {
+    "FP16/BF16": {"bytes_per_param": 2.0, "quality": "100%", "desc": "Full precision"},
+    "INT8 (LLM.int8)": {"bytes_per_param": 1.0, "quality": "~99%", "desc": "Good balance"},
+    "GPTQ 8-bit": {"bytes_per_param": 1.0, "quality": "~99%", "desc": "GPU optimized"},
+    "AWQ 4-bit": {"bytes_per_param": 0.5, "quality": "~97%", "desc": "Activation-aware"},
+    "GPTQ 4-bit": {"bytes_per_param": 0.5, "quality": "~95%", "desc": "GPU optimized"},
+    "GGUF Q8_0": {"bytes_per_param": 1.0, "quality": "~99%", "desc": "llama.cpp format"},
+    "GGUF Q6_K": {"bytes_per_param": 0.75, "quality": "~98%", "desc": "llama.cpp format"},
+    "GGUF Q5_K_M": {"bytes_per_param": 0.625, "quality": "~97%", "desc": "llama.cpp format"},
+    "GGUF Q4_K_M": {"bytes_per_param": 0.5, "quality": "~95%", "desc": "llama.cpp format"},
+    "GGUF Q3_K_M": {"bytes_per_param": 0.375, "quality": "~90%", "desc": "llama.cpp format"},
+    "GGUF Q2_K": {"bytes_per_param": 0.3125, "quality": "~85%", "desc": "Aggressive compression"},
+}
 
 
-def bytes_to_gb(b: int) -> float:
+def bytes_to_gb(b: int | float) -> float:
     return b / (1024 ** 3)
 
 
-def get_model_info(model_id: str) -> dict:
-    """Fetch model info from HF Hub."""
+def gb_to_bytes(gb: float) -> float:
+    return gb * (1024 ** 3)
+
+
+@lru_cache(maxsize=50)
+def get_model_info_cached(model_id: str):
+    """Fetch model info from HF Hub with caching."""
     try:
         info = api.model_info(model_id, files_metadata=True)
         return info
     except Exception as e:
-        raise gr.Error(f"Could not fetch model info: {e}")
+        return {"_error": str(e)}
 
 
-def get_config(model_id: str) -> dict:
-    """Try to fetch config.json for architecture details."""
+@lru_cache(maxsize=50)
+def get_config_cached(model_id: str) -> str:
+    """Fetch config.json with caching. Returns JSON string for cache compatibility."""
     try:
         config_path = hf_hub_download(model_id, "config.json")
         with open(config_path) as f:
-            return json.load(f)
+            return f.read()
     except Exception as e:
-        # Gated models or missing config
-        return {"_error": str(e)}
+        return json.dumps({"_error": str(e)})
+
+
+def get_model_info(model_id: str):
+    """Fetch model info from HF Hub."""
+    result = get_model_info_cached(model_id)
+    if isinstance(result, dict) and "_error" in result:
+        raise gr.Error(f"Could not fetch model info: {result['_error']}")
+    return result
+
+
+def get_config(model_id: str) -> dict:
+    """Get config.json for architecture details."""
+    config_str = get_config_cached(model_id)
+    return json.loads(config_str)
 
 
 def estimate_params_from_safetensors(info) -> tuple[int, str]:
     """Extract parameter count and dtype from safetensors metadata."""
-    if info.safetensors:
+    if hasattr(info, 'safetensors') and info.safetensors:
         param_count = info.safetensors.total
-        # Get the dominant dtype
         params_by_dtype = info.safetensors.parameters
         if params_by_dtype:
             dominant_dtype = max(params_by_dtype, key=params_by_dtype.get)

@@ -75,152 +135,329 @@ def estimate_params_from_safetensors(info) -> tuple[int, str]:
     return 0, "F16"
 
 
+def get_head_dim(config: dict) -> int:
+    """Calculate head dimension from config, with fallbacks."""
+    # Try to get it directly
+    if "head_dim" in config:
+        return config["head_dim"]
+
+    # Calculate from hidden_size and num_attention_heads
+    hidden_size = config.get("hidden_size", config.get("n_embd", 0))
+    num_heads = config.get("num_attention_heads", config.get("n_head", 0))
+
+    if hidden_size and num_heads:
+        return hidden_size // num_heads
+
+    # Common defaults by model family
+    return 128  # Most common default
+
+
 def estimate_kv_cache_size(
     num_layers: int,
-    hidden_size: int,
     num_kv_heads: int,
+    head_dim: int,
     context_length: int,
     batch_size: int = 1,
     dtype_bytes: int = 2
 ) -> int:
     """
     KV cache size = 2 * num_layers * batch_size * context_length * num_kv_heads * head_dim * dtype_bytes
-    head_dim = hidden_size / num_attention_heads (but we use hidden_size / num_kv_heads for GQA)
+
+    The 2 accounts for both K and V caches.
     """
-    # For GQA models, KV cache uses num_kv_heads, not num_attention_heads
-    # head_dim is typically hidden_size / num_attention_heads
-    # But KV cache stores: num_kv_heads * head_dim per layer
-    # Simplified: 2 * layers * batch * seq * hidden_size * (num_kv_heads / num_attn_heads) * dtype
-    # For non-GQA: num_kv_heads == num_attn_heads, so it's just 2 * layers * batch * seq * hidden
-
-    # More accurate: 2 (K+V) * layers * batch * seq * num_kv_heads * head_dim
-    # We'll estimate head_dim as hidden_size / num_kv_heads if we don't know num_attn_heads
-    # This is a rough estimate
-
-    head_dim = 128  # Common default (Llama, Mistral, etc.)
     kv_cache_bytes = 2 * num_layers * batch_size * context_length * num_kv_heads * head_dim * dtype_bytes
     return kv_cache_bytes
 
 
-def calculate_vram(model_id: str, context_length: int = 4096, batch_size: int = 1) -> str:
-    """Main calculation function."""
-
+def estimate_training_memory(
+    param_count: int,
+    dtype_bytes: int,
+    optimizer: str = "AdamW"
+) -> dict:
+    """
+    Estimate training memory requirements.
+
+    For training, we need:
+    - Model weights
+    - Gradients (same size as weights)
+    - Optimizer states (varies by optimizer)
+    - Activations (highly variable, estimated)
+    """
+    weights_bytes = param_count * dtype_bytes
+    gradients_bytes = param_count * dtype_bytes
+
+    # Optimizer states
+    if optimizer == "AdamW":
+        # AdamW stores: m (momentum), v (variance) in FP32
+        optimizer_bytes = param_count * 4 * 2  # 2 states, 4 bytes each
+    elif optimizer == "SGD":
+        optimizer_bytes = 0  # No extra state (momentum optional)
+    elif optimizer == "SGD + Momentum":
+        optimizer_bytes = param_count * 4  # Momentum buffer
+    elif optimizer == "8-bit Adam":
+        optimizer_bytes = param_count * 1 * 2  # 2 states, 1 byte each
+    else:
+        optimizer_bytes = param_count * 4 * 2  # Default to AdamW
+
+    return {
+        "weights": weights_bytes,
+        "gradients": gradients_bytes,
+        "optimizer": optimizer_bytes,
+        "total_base": weights_bytes + gradients_bytes + optimizer_bytes
+    }
+
+
+def calculate_multi_gpu_split(total_vram_gb: float, num_gpus: int, parallelism: str) -> dict:
+    """Calculate memory distribution across multiple GPUs."""
+    if parallelism == "Tensor Parallelism":
+        # Weights and KV cache split evenly
+        per_gpu = total_vram_gb / num_gpus
+        overhead = 0.05 * total_vram_gb  # Communication overhead
+        return {
+            "per_gpu": per_gpu + (overhead / num_gpus),
+            "total": total_vram_gb + overhead,
+            "efficiency": "High (best for inference)",
+        }
+    elif parallelism == "Pipeline Parallelism":
+        # Layers distributed, but activation memory at boundaries
+        per_gpu = total_vram_gb / num_gpus
+        overhead = 0.1 * total_vram_gb  # Activation memory overhead
+        return {
+            "per_gpu": per_gpu + (overhead / num_gpus),
+            "total": total_vram_gb + overhead,
+            "efficiency": "Medium (good for training)",
+        }
+    else:  # Data Parallelism
+        # Full model on each GPU
+        return {
+            "per_gpu": total_vram_gb,
+            "total": total_vram_gb * num_gpus,
+            "efficiency": "Low memory efficiency (training only)",
+        }
+
+
+def calculate_vram(
+    model_id: str,
+    context_length: int = 4096,
+    batch_size: int = 1,
+    mode: str = "Inference",
+    optimizer: str = "AdamW",
+    serving_framework: str = "None (raw PyTorch)",
+    num_gpus: int = 1,
+    parallelism: str = "Tensor Parallelism"
+) -> tuple[str, dict | None]:
+    """Main calculation function. Returns (markdown_results, chart_data)."""
+
+    # Validate inputs
+    model_id = model_id.strip()
+    if not model_id:
+        raise gr.Error("Please enter a model ID")
+
+    if "/" not in model_id:
+        raise gr.Error("Model ID should be in format 'organization/model-name'")
+
     # Fetch model info
     info = get_model_info(model_id)
     config = get_config(model_id)
 
     results = []
     results.append(f"## Model: [{model_id}](https://huggingface.co/{model_id})\n")
 
     # Get parameter count and dtype
     param_count, dominant_dtype = estimate_params_from_safetensors(info)
 
     if param_count == 0:
-        # Fallback: try to infer from model name or config
         results.append("⚠️ Could not determine parameter count from safetensors metadata.\n")
         results.append("Model may use pytorch_model.bin or other format.\n")
-        return "\n".join(results)
+        return "\n".join(results), None
 
     dtype_bytes = DTYPE_BYTES.get(dominant_dtype, 2)
     params_b = param_count / 1e9
 
-    results.append(f"**Parameters:** {params_b:.2f}B")
-    results.append(f"**Dominant dtype:** {dominant_dtype} ({dtype_bytes} bytes)")
+    results.append(f"**Parameters:** {params_b:.2f}B ({param_count:,})")
+    results.append(f"**Dominant dtype:** {dominant_dtype} ({dtype_bytes} bytes/param)")
+    results.append(f"**Mode:** {mode}")
 
     # Model weights VRAM
     weights_bytes = param_count * dtype_bytes
     weights_gb = bytes_to_gb(weights_bytes)
-    results.append(f"\n### Weight Memory")
+    results.append(f"\n### 📦 Weight Memory")
     results.append(f"Model weights: **{weights_gb:.2f} GB**")
 
-    # KV Cache estimation (if we have config)
+    # Architecture details
     num_layers = config.get("num_hidden_layers", config.get("n_layer", 0))
     hidden_size = config.get("hidden_size", config.get("n_embd", 0))
-    num_kv_heads = config.get("num_key_value_heads", config.get("num_attention_heads", config.get("n_head", 0)))
+    num_attention_heads = config.get("num_attention_heads", config.get("n_head", 0))
+    num_kv_heads = config.get("num_key_value_heads", num_attention_heads)
+    head_dim = get_head_dim(config)
+    max_position = config.get("max_position_embeddings", config.get("n_positions", "N/A"))
 
-    results.append(f"\n### Architecture (from config.json)")
+    results.append(f"\n### 🏗️ Architecture (from config.json)")
     if "_error" in config:
-        results.append(f"⚠️ Could not fetch config.json (model may be gated or config missing)")
-        results.append("KV cache calculation skipped - using weight-only estimate with 20% overhead")
+        results.append(f"⚠️ Could not fetch config.json (model may be gated)")
+        kv_gb = 0
     elif num_layers and hidden_size:
-        results.append(f"Layers: {num_layers}, Hidden size: {hidden_size}, KV Heads: {num_kv_heads}")
-
-        # Calculate KV cache for different context lengths
-        results.append(f"\n### KV Cache (batch_size={batch_size})")
-        results.append("| Context Length | KV Cache | Total VRAM |")
-        results.append("|----------------|----------|------------|")
-
-        for ctx_len in [2048, 4096, 8192, 16384, 32768, 65536, 131072]:
-            if ctx_len > context_length * 2:
+        results.append(f"- **Layers:** {num_layers}")
+        results.append(f"- **Hidden size:** {hidden_size}")
+        results.append(f"- **Attention heads:** {num_attention_heads}")
+        results.append(f"- **KV heads:** {num_kv_heads} {'(GQA)' if num_kv_heads != num_attention_heads else '(MHA)'}")
+        results.append(f"- **Head dimension:** {head_dim}")
+        results.append(f"- **Max context:** {max_position:,}" if isinstance(max_position, int) else f"- **Max context:** {max_position}")
+
+        # KV Cache calculation
+        results.append(f"\n### 💾 KV Cache (batch_size={batch_size})")
+        results.append("| Context | KV Cache | + Weights | Status |")
+        results.append("|---------|----------|-----------|--------|")
+
+        # Show relevant context lengths
+        context_points = [1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072]
+        for ctx_len in context_points:
+            if ctx_len > context_length * 2 and ctx_len > 8192:
                 break
             kv_bytes = estimate_kv_cache_size(
-                num_layers, hidden_size, num_kv_heads, ctx_len, batch_size, dtype_bytes
+                num_layers, num_kv_heads, head_dim, ctx_len, batch_size, dtype_bytes
             )
-            kv_gb = bytes_to_gb(kv_bytes)
-            total_gb = weights_gb + kv_gb
-            marker = " selected" if ctx_len == context_length else ""
-            results.append(f"| {ctx_len:,} | {kv_gb:.2f} GB | **{total_gb:.2f} GB**{marker} |")
-    else:
-        results.append("Could not find architecture details in config.json")
-
-    # Calculate for user's selected context length
-    if num_layers and hidden_size and num_kv_heads:
+            kv_gb_temp = bytes_to_gb(kv_bytes)
+            total_temp = weights_gb + kv_gb_temp
+            marker = " **← selected**" if ctx_len == context_length else ""
+            results.append(f"| {ctx_len:,} | {kv_gb_temp:.2f} GB | {total_temp:.2f} GB |{marker} |")
+
+        # Calculate for selected context
         kv_bytes = estimate_kv_cache_size(
-            num_layers, hidden_size, num_kv_heads, context_length, batch_size, dtype_bytes
+            num_layers, num_kv_heads, head_dim, context_length, batch_size, dtype_bytes
         )
         kv_gb = bytes_to_gb(kv_bytes)
-        total_inference_gb = weights_gb + kv_gb
     else:
-        total_inference_gb = weights_gb * 1.2  # 20% overhead estimate
-
-    # Add activation memory overhead (~10-20%)
-    total_with_overhead = total_inference_gb * 1.15
-
-    results.append(f"\n### Total VRAM Estimate")
-    results.append(f"Weights + KV Cache + Overhead (~15%): **{total_with_overhead:.2f} GB**")
+        results.append("Could not find architecture details")
+        kv_gb = 0
+
+    # Calculate total based on mode
+    if mode == "Training":
+        training_mem = estimate_training_memory(param_count, dtype_bytes, optimizer)
+        base_gb = bytes_to_gb(training_mem["total_base"])
+
+        # Activations estimation (rough: ~2x weights for typical batch)
+        activation_gb = weights_gb * 2 * batch_size
+        total_gb = base_gb + kv_gb + activation_gb
+
+        results.append(f"\n### 🎓 Training Memory Breakdown")
+        results.append(f"- **Weights:** {weights_gb:.2f} GB")
+        results.append(f"- **Gradients:** {bytes_to_gb(training_mem['gradients']):.2f} GB")
+        results.append(f"- **Optimizer ({optimizer}):** {bytes_to_gb(training_mem['optimizer']):.2f} GB")
+        results.append(f"- **KV Cache:** {kv_gb:.2f} GB")
+        results.append(f"- **Activations (est.):** {activation_gb:.2f} GB")
+
+        chart_data = {
+            "Weights": weights_gb,
+            "Gradients": bytes_to_gb(training_mem['gradients']),
+            "Optimizer": bytes_to_gb(training_mem['optimizer']),
+            "KV Cache": kv_gb,
+            "Activations": activation_gb,
+        }
+    else:
+        # Inference mode
+        framework_overhead = SERVING_FRAMEWORKS.get(serving_framework, 1.15)
+        base_total = weights_gb + kv_gb
+        overhead_gb = base_total * (framework_overhead - 1)
+        total_gb = base_total + overhead_gb
+
+        results.append(f"\n### ⚡ Inference Memory ({serving_framework})")
+        results.append(f"- **Weights:** {weights_gb:.2f} GB")
+        results.append(f"- **KV Cache:** {kv_gb:.2f} GB")
+        results.append(f"- **Framework overhead:** {overhead_gb:.2f} GB ({(framework_overhead-1)*100:.0f}%)")
+
+        chart_data = {
+            "Weights": weights_gb,
+            "KV Cache": kv_gb,
+            "Overhead": overhead_gb,
+        }
+
+    results.append(f"\n### 📊 Total VRAM Required: **{total_gb:.2f} GB**")
+
+    # Multi-GPU calculations
+    if num_gpus > 1:
+        multi_gpu = calculate_multi_gpu_split(total_gb, num_gpus, parallelism)
+        results.append(f"\n### 🔗 Multi-GPU ({num_gpus}x GPUs, {parallelism})")
+        results.append(f"- **Per GPU:** {multi_gpu['per_gpu']:.2f} GB")
+        results.append(f"- **Total across GPUs:** {multi_gpu['total']:.2f} GB")
+        results.append(f"- **Efficiency:** {multi_gpu['efficiency']}")
+
+        # Update total for GPU recommendations
+        effective_vram_needed = multi_gpu['per_gpu']
+    else:
+        effective_vram_needed = total_gb
 
     # GPU Recommendations
-    results.append(f"\n### Recommended GPUs")
-    results.append("| GPU | VRAM | Fits? | Cloud Instance |")
-    results.append("|-----|------|-------|----------------|")
-
-    for gpu_name, (vram, instance) in GPU_SPECS.items():
-        fits = "✅" if vram >= total_with_overhead else "❌"
-        headroom = vram - total_with_overhead
-        headroom_str = f"+{headroom:.1f}GB" if headroom > 0 else f"{headroom:.1f}GB"
-        results.append(f"| {gpu_name} | {vram}GB | {fits} ({headroom_str}) | {instance} |")
-
-    # Quantization suggestions
-    if total_with_overhead > 24:
-        results.append(f"\n### 💡 Quantization Options")
-        results.append("To fit on consumer GPUs (24GB), consider:")
-
-        q8_estimate = (param_count * 1) / (1024**3) * 1.15
-        q4_estimate = (param_count * 0.5) / (1024**3) * 1.15
-
-        results.append(f"- **INT8 quantization:** ~{q8_estimate:.1f} GB")
-        results.append(f"- **INT4 quantization:** ~{q4_estimate:.1f} GB")
-        results.append(f"\nLook for GGUF or AWQ versions of this model on HF Hub.")
-
-    return "\n".join(results)
+    results.append(f"\n### 🎮 GPU Recommendations")
+    results.append("| GPU | VRAM | Fits? | Headroom | Instance |")
+    results.append("|-----|------|-------|----------|----------|")
+
+    for gpu_name, (vram, instance, category) in GPU_SPECS.items():
+        fits = "✅" if vram >= effective_vram_needed else "❌"
+        headroom = vram - effective_vram_needed
+        headroom_str = f"+{headroom:.1f} GB" if headroom > 0 else f"{headroom:.1f} GB"
+        results.append(f"| {gpu_name} | {vram} GB | {fits} | {headroom_str} | {instance} |")
+
+    # Quantization options (if model doesn't fit on consumer GPUs)
+    if effective_vram_needed > 24:
+        results.append(f"\n### 🗜️ Quantization Options")
+        results.append("To fit on consumer GPUs (≤24 GB), consider these options:\n")
+        results.append("| Method | Est. Size | Quality | Notes |")
+        results.append("|--------|-----------|---------|-------|")
+
+        for method, specs in QUANTIZATION_METHODS.items():
+            quant_size = bytes_to_gb(param_count * specs["bytes_per_param"])
+            quant_with_overhead = quant_size * 1.1  # Small overhead
+            fits = "✅" if quant_with_overhead <= 24 else "❌"
+            results.append(f"| {method} | {quant_with_overhead:.1f} GB | {specs['quality']} | {fits} {specs['desc']} |")
+
+        results.append(f"\n**Tip:** Search for `{model_id.split('/')[-1]} GGUF` or `{model_id.split('/')[-1]} AWQ` on HuggingFace.")
+
+    return "\n".join(results), chart_data
+
+
+def create_memory_chart(chart_data: dict | None):
+    """Create a bar chart for memory breakdown."""
+    if not chart_data:
+        return None
+
+    labels = list(chart_data.keys())
+    values = list(chart_data.values())
+
+    return gr.BarPlot(
+        value={"Component": labels, "GB": values},
+        x="Component",
+        y="GB",
+        title="Memory Breakdown",
+        height=300,
+        width=400,
+    )
 
 
 # Build Gradio interface
-with gr.Blocks(title="VRAM Calculator") as demo:
+with gr.Blocks(title="VRAM Calculator", theme=gr.themes.Soft()) as demo:
     gr.Markdown("""
     # 🧮 VRAM & Instance Type Calculator
-
-    Enter a HuggingFace model ID to estimate VRAM requirements and get GPU/cloud instance recommendations.
-
-    **How it works:** Fetches model metadata (safetensors info, config.json) to calculate memory for weights + KV cache.
+
+    Estimate GPU memory requirements for HuggingFace models. Supports inference and training modes,
+    multi-GPU setups, and provides detailed quantization recommendations.
     """)
 
     with gr.Row():
         with gr.Column(scale=2):
             model_input = gr.Textbox(
                 label="Model ID",
                 placeholder="meta-llama/Llama-3.1-8B",
-                info="Enter the full HuggingFace model ID (e.g., 'mistralai/Mistral-7B-v0.1')"
+                info="Full HuggingFace model ID (org/model-name)"
+            )
+
+    with gr.Row():
+        with gr.Column(scale=1):
+            mode_input = gr.Radio(
+                choices=["Inference", "Training"],
+                value="Inference",
+                label="Mode",
+                info="Training requires ~4x more memory"
             )
         with gr.Column(scale=1):
             context_input = gr.Slider(

@@ -229,50 +466,115 @@ with gr.Blocks(title="VRAM Calculator") as demo:
                 maximum=131072,
                 value=4096,
                 step=512,
-                info="Max sequence length for KV cache calculation"
+                info="Sequence length for KV cache"
             )
         with gr.Column(scale=1):
             batch_input = gr.Slider(
                 label="Batch Size",
                 minimum=1,
-                maximum=32,
+                maximum=64,
                 value=1,
                 step=1,
                 info="Concurrent sequences"
             )
 
-    calculate_btn = gr.Button("Calculate VRAM", variant="primary", size="lg")
-
-    output = gr.Markdown(label="Results")
-
+    with gr.Accordion("⚙️ Advanced Options", open=False):
+        with gr.Row():
+            with gr.Column():
+                serving_input = gr.Dropdown(
+                    choices=list(SERVING_FRAMEWORKS.keys()),
+                    value="None (raw PyTorch)",
+                    label="Serving Framework",
+                    info="Different frameworks have different overhead"
+                )
+                optimizer_input = gr.Dropdown(
+                    choices=["AdamW", "SGD", "SGD + Momentum", "8-bit Adam"],
+                    value="AdamW",
+                    label="Optimizer (Training mode)",
+                    info="Optimizer state memory varies"
+                )
+            with gr.Column():
+                num_gpus_input = gr.Slider(
+                    label="Number of GPUs",
+                    minimum=1,
+                    maximum=8,
+                    value=1,
+                    step=1,
+                    info="For multi-GPU setups"
+                )
+                parallelism_input = gr.Dropdown(
+                    choices=["Tensor Parallelism", "Pipeline Parallelism", "Data Parallelism"],
+                    value="Tensor Parallelism",
+                    label="Parallelism Strategy",
+                    info="How to distribute across GPUs"
+                )
+
+    calculate_btn = gr.Button("🚀 Calculate VRAM", variant="primary", size="lg")
+
+    with gr.Row():
+        with gr.Column(scale=3):
+            output = gr.Markdown(label="Results")
+        with gr.Column(scale=1):
+            chart_output = gr.BarPlot(
+                x="Component",
+                y="GB",
+                title="Memory Breakdown",
+                height=350,
+            )
+
+    def run_calculation(model_id, context_length, batch_size, mode, optimizer, serving, num_gpus, parallelism):
+        result_text, chart_data = calculate_vram(
+            model_id, context_length, batch_size, mode, optimizer, serving, num_gpus, parallelism
+        )
+        if chart_data:
+            import pandas as pd
+            df = pd.DataFrame({
+                "Component": list(chart_data.keys()),
+                "GB": list(chart_data.values())
+            })
+            return result_text, df
+        return result_text, None
+
     calculate_btn.click(
-        fn=calculate_vram,
-        inputs=[model_input, context_input, batch_input],
-        outputs=output
+        fn=run_calculation,
+        inputs=[
+            model_input, context_input, batch_input, mode_input,
+            optimizer_input, serving_input, num_gpus_input, parallelism_input
+        ],
+        outputs=[output, chart_output]
     )
 
     # Examples
     gr.Examples(
        examples=[
            ["meta-llama/Llama-3.1-8B", 4096, 1],
+            ["meta-llama/Llama-3.1-70B", 8192, 1],
            ["mistralai/Mistral-7B-v0.1", 8192, 1],
            ["Qwen/Qwen2.5-72B", 32768, 1],
            ["google/gemma-2-27b", 8192, 1],
            ["microsoft/phi-4", 16384, 1],
+            ["deepseek-ai/DeepSeek-V3", 4096, 1],
+            ["meta-llama/Llama-3.3-70B-Instruct", 8192, 1],
        ],
        inputs=[model_input, context_input, batch_input],
-        label="Try these models"
+        label="🔥 Popular Models"
    )
 
    gr.Markdown("""
    ---
-    **Notes:**
-    - VRAM estimates include ~15% overhead for activations and framework overhead
-    - KV cache assumes inference (not training)
-    - Actual requirements may vary based on serving framework (vLLM, TGI, etc.)
-    - For GGUF models, memory requirements differ significantly
-
-    Built with ❤️ using Gradio & HuggingFace Hub API
+    ### 📝 Notes
+    - **Inference mode:** Weights + KV cache + framework overhead
+    - **Training mode:** Adds gradients, optimizer states, and activation memory
+    - **KV cache:** Scales linearly with context length and batch size
+    - **Multi-GPU:** Tensor parallelism splits memory; data parallelism replicates it
+    - **Quantization:** GGUF/AWQ/GPTQ can reduce memory 2-8x with minimal quality loss
+
+    ### ⚠️ Disclaimers
+    - Estimates are approximate; actual usage varies by implementation
+    - Flash Attention and other optimizations can significantly reduce memory
    - GGUF models have different memory profiles than safetensors
+
+    Built with 💜 using Gradio & HuggingFace Hub API
    """)
 
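The training-mode rule added in `estimate_training_memory` above is the standard mixed-precision AdamW accounting: FP16 weights and gradients plus two FP32 optimizer states per parameter. A minimal replay of that arithmetic, with a hypothetical 8B-parameter model as input:

```python
# Hedged replay of estimate_training_memory's AdamW arithmetic.
# 2 B (FP16 weights) + 2 B (FP16 gradients) + 4 B x 2 (FP32 Adam m and v)
# = 12 bytes per parameter, before activations.
param_count = 8_000_000_000  # assumed model size, not from the commit
weights = param_count * 2
gradients = param_count * 2
optimizer = param_count * 4 * 2
total_base = weights + gradients + optimizer
print(total_base / 1024**3)  # ~89.4 GiB, versus ~14.9 GiB for FP16 weights alone
```

That is roughly six times the weight footprint before activation memory is counted, so the UI hint "Training requires ~4x more memory" is, if anything, on the low side.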
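The tensor-parallel split in `calculate_multi_gpu_split` is equally small arithmetic: memory divides evenly across devices, with a 5% communication surcharge spread over them. A sketch with assumed numbers (a 180 GB total footprint on 4 GPUs):

```python
# Hedged replay of calculate_multi_gpu_split's "Tensor Parallelism" branch.
total_gb, num_gpus = 180.0, 4   # assumed workload, not from the commit
overhead = 0.05 * total_gb      # 5% communication overhead, as in app.py
per_gpu = total_gb / num_gpus + overhead / num_gpus
print(per_gpu)  # 47.25 GB per card: fits 4x A100 80GB, not 4x A100 40GB
```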
requirements.txt CHANGED

@@ -1,2 +1,3 @@
 gradio>=4.44.0
 huggingface_hub>=0.20.0,<1.0.0
+pandas>=2.0.0
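The quantization table in the diff follows one sizing rule: bytes per parameter times parameter count, plus the script's 10% overhead factor. Replaying it for an assumed 70B model shows why only the most aggressive GGUF level squeezes under a 24 GB consumer card:

```python
# Hedged replay of the sizing rule behind app.py's QUANTIZATION_METHODS table.
param_count = 70_000_000_000  # assumed model size, not from the commit
for method, bytes_per_param in [
    ("FP16/BF16", 2.0),
    ("GGUF Q8_0", 1.0),
    ("GGUF Q4_K_M", 0.5),
    ("GGUF Q2_K", 0.3125),
]:
    size_gb = param_count * bytes_per_param / 1024**3 * 1.1  # +10% overhead, as in app.py
    verdict = "fits" if size_gb <= 24 else "does not fit"
    print(f"{method}: {size_gb:.1f} GB, {verdict} on a 24 GB GPU")
# FP16 ~143.4 GB, Q8_0 ~71.7 GB, Q4_K_M ~35.9 GB, Q2_K ~22.4 GB
```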