a0y0346 committed on
Commit
509d7b6
·
1 Parent(s): 6492c04

Add separate KV Cache dtype selector (FP16/BF16/FP8/INT8)

Browse files

- KV cache precision can now be configured independently from weight precision
- Realistic representation: model weights and KV cache have different quantization options
- FP8/INT8 KV cache halves memory vs FP16, enabling 2× longer contexts
- Updated insight panel to show KV cache memory savings
- Added more batch sizes (64, 128) to tradeoff chart

Files changed (2) hide show
  1. app.py +9 -3
  2. src/memory_budget.py +45 -20
app.py CHANGED
@@ -597,6 +597,12 @@ def create_app() -> gr.Blocks:
597
  label="Weight Precision",
598
  info="Data type for model weights",
599
  )
 
 
 
 
 
 
600
 
601
  with gr.Row():
602
  budget_batch = gr.Slider(
@@ -629,11 +635,11 @@ def create_app() -> gr.Blocks:
629
 
630
  # Memory analysis callback
631
  @spaces.GPU(duration=60)
632
- def analyze_memory(model_name: str, seq_len: int, batch_size: int, dtype: str):
633
  """Run memory analysis on selected model."""
634
  try:
635
  budget, breakdown_fig, scaling_fig, tradeoff_fig, insight = run_memory_analysis(
636
- model_name, int(seq_len), int(batch_size), dtype
637
  )
638
  status = f"**{model_name}**: {budget['breakdown']['total_gb']:.2f} GB total ({budget['utilization_pct']:.1f}% GPU)"
639
  return status, breakdown_fig, scaling_fig, tradeoff_fig, insight
@@ -643,7 +649,7 @@ def create_app() -> gr.Blocks:
643
 
644
  calculate_btn.click(
645
  fn=analyze_memory,
646
- inputs=[budget_model, budget_seq, budget_batch, budget_dtype],
647
  outputs=[budget_status, breakdown_chart, scaling_chart, tradeoff_chart, budget_insight],
648
  )
649
 
 
597
  label="Weight Precision",
598
  info="Data type for model weights",
599
  )
600
+ budget_kv_dtype = gr.Dropdown(
601
+ choices=["FP16", "BF16", "FP8", "INT8"],
602
+ value="FP16",
603
+ label="KV Cache Precision",
604
+ info="Data type for KV cache (FP8/INT8 need compatible hardware)",
605
+ )
606
 
607
  with gr.Row():
608
  budget_batch = gr.Slider(
 
635
 
636
  # Memory analysis callback
637
  @spaces.GPU(duration=60)
638
+ def analyze_memory(model_name: str, seq_len: int, batch_size: int, dtype: str, kv_dtype: str):
639
  """Run memory analysis on selected model."""
640
  try:
641
  budget, breakdown_fig, scaling_fig, tradeoff_fig, insight = run_memory_analysis(
642
+ model_name, int(seq_len), int(batch_size), dtype, kv_dtype
643
  )
644
  status = f"**{model_name}**: {budget['breakdown']['total_gb']:.2f} GB total ({budget['utilization_pct']:.1f}% GPU)"
645
  return status, breakdown_fig, scaling_fig, tradeoff_fig, insight
 
649
 
650
  calculate_btn.click(
651
  fn=analyze_memory,
652
+ inputs=[budget_model, budget_seq, budget_batch, budget_dtype, budget_kv_dtype],
653
  outputs=[budget_status, breakdown_chart, scaling_chart, tradeoff_chart, budget_insight],
654
  )
655
 
src/memory_budget.py CHANGED
@@ -129,11 +129,21 @@ def calculate_kv_cache_memory(
129
  }
130
 
131
 
 
 
 
 
 
 
 
 
 
132
  def calculate_memory_budget(
133
  model_name: str,
134
  seq_len: int,
135
  batch_size: int = 1,
136
  dtype: str = "FP16",
 
137
  ) -> dict:
138
  """
139
  Calculate complete memory budget using REAL model and GPU info.
@@ -143,6 +153,7 @@ def calculate_memory_budget(
143
  seq_len: Target sequence length
144
  batch_size: Batch size
145
  dtype: Data type for model weights
 
146
 
147
  Returns:
148
  Complete memory budget breakdown
@@ -158,14 +169,15 @@ def calculate_memory_budget(
158
  model_weights_bytes = model_info["num_parameters"] * dtype_bytes
159
  model_weights_gb = model_weights_bytes / (1024 ** 3)
160
 
161
- # KV cache memory
 
162
  kv_cache = calculate_kv_cache_memory(
163
  num_kv_heads=model_info["num_kv_heads"],
164
  head_dim=model_info["head_dim"],
165
  num_layers=model_info["num_layers"],
166
  seq_len=seq_len,
167
  batch_size=batch_size,
168
- dtype_bytes=int(dtype_bytes) if dtype_bytes >= 1 else 2, # KV cache usually FP16
169
  )
170
  kv_cache_gb = kv_cache["gb"]
171
 
@@ -187,6 +199,7 @@ def calculate_memory_budget(
187
  "model_info": model_info,
188
  "gpu_info": gpu_info,
189
  "dtype": dtype,
 
190
  "seq_len": seq_len,
191
  "batch_size": batch_size,
192
  "breakdown": {
@@ -206,6 +219,7 @@ def calculate_max_context_length(
206
  model_name: str,
207
  batch_size: int = 1,
208
  dtype: str = "FP16",
 
209
  memory_reserve_pct: float = 10.0,
210
  ) -> dict:
211
  """
@@ -217,6 +231,7 @@ def calculate_max_context_length(
217
  model_name: Model to analyze
218
  batch_size: Batch size
219
  dtype: Data type for weights
 
220
  memory_reserve_pct: Percentage to reserve for activations/overhead
221
 
222
  Returns:
@@ -251,7 +266,7 @@ def calculate_max_context_length(
251
 
252
  # Calculate max seq_len from available KV cache memory
253
  # KV cache bytes = 2 × num_kv_heads × head_dim × seq_len × num_layers × batch × dtype_bytes
254
- kv_dtype_bytes = 2 # KV cache typically FP16
255
  bytes_per_token = (
256
  2 * model_info["num_kv_heads"] * model_info["head_dim"] *
257
  model_info["num_layers"] * batch_size * kv_dtype_bytes
@@ -369,7 +384,7 @@ def create_memory_breakdown_chart(budget: dict) -> go.Figure:
369
  return fig
370
 
371
 
372
- def create_context_scaling_chart(model_name: str, batch_size: int = 1, dtype: str = "FP16") -> go.Figure:
373
  """
374
  Create chart showing memory usage vs context length.
375
 
@@ -382,6 +397,9 @@ def create_context_scaling_chart(model_name: str, batch_size: int = 1, dtype: st
382
  dtype_bytes = {"FP32": 4, "FP16": 2, "BF16": 2, "INT8": 1, "INT4": 0.5}.get(dtype, 2)
383
  model_weights_gb = (model_info["num_parameters"] * dtype_bytes) / (1024 ** 3)
384
 
 
 
 
385
  # Sequence lengths to plot
386
  seq_lengths = [512, 1024, 2048, 4096, 8192, 16384, 32768]
387
 
@@ -395,6 +413,7 @@ def create_context_scaling_chart(model_name: str, batch_size: int = 1, dtype: st
395
  num_layers=model_info["num_layers"],
396
  seq_len=seq_len,
397
  batch_size=batch_size,
 
398
  )
399
  kv_cache_values.append(kv["gb"])
400
  total_memory_values.append(model_weights_gb + kv["gb"])
@@ -439,7 +458,7 @@ def create_context_scaling_chart(model_name: str, batch_size: int = 1, dtype: st
439
  fig.update_layout(
440
  title=dict(
441
  text=f"Memory Scaling: {model_name}<br>"
442
- f"<sub>batch={batch_size}, {dtype}, {model_info['num_kv_heads']} KV heads</sub>",
443
  x=0.5,
444
  ),
445
  xaxis_title="Context Length (tokens)",
@@ -466,7 +485,7 @@ def create_context_scaling_chart(model_name: str, batch_size: int = 1, dtype: st
466
  return fig
467
 
468
 
469
- def create_batch_context_tradeoff_chart(model_name: str, dtype: str = "FP16") -> go.Figure:
470
  """
471
  Create chart showing batch size vs max context tradeoff.
472
 
@@ -487,11 +506,11 @@ def create_batch_context_tradeoff_chart(model_name: str, dtype: str = "FP16") ->
487
  )
488
  return fig
489
 
490
- batch_sizes = [1, 2, 4, 8, 16, 32]
491
  max_contexts = []
492
 
493
  for batch in batch_sizes:
494
- result = calculate_max_context_length(model_name, batch_size=batch, dtype=dtype)
495
  max_contexts.append(result.get("max_context", 0))
496
 
497
  fig = go.Figure()
@@ -517,7 +536,7 @@ def create_batch_context_tradeoff_chart(model_name: str, dtype: str = "FP16") ->
517
  fig.update_layout(
518
  title=dict(
519
  text=f"Batch Size vs Max Context: {model_name}<br>"
520
- f"<sub>GPU: {gpu_info['name']} ({gpu_info['total_memory_gb']:.1f} GB)</sub>",
521
  x=0.5,
522
  ),
523
  xaxis_title="Batch Size",
@@ -538,6 +557,7 @@ def run_memory_analysis(
538
  seq_len: int,
539
  batch_size: int = 1,
540
  dtype: str = "FP16",
 
541
  ) -> tuple:
542
  """
543
  Run complete memory analysis for a model.
@@ -545,15 +565,15 @@ def run_memory_analysis(
545
  Returns budget info, charts, and insight text.
546
  """
547
  # Calculate budget
548
- budget = calculate_memory_budget(model_name, seq_len, batch_size, dtype)
549
 
550
  # Calculate max context
551
- max_context = calculate_max_context_length(model_name, batch_size, dtype)
552
 
553
  # Create charts
554
  breakdown_chart = create_memory_breakdown_chart(budget)
555
- scaling_chart = create_context_scaling_chart(model_name, batch_size, dtype)
556
- tradeoff_chart = create_batch_context_tradeoff_chart(model_name, dtype)
557
 
558
  # Generate insight text
559
  model_info = budget["model_info"]
@@ -562,6 +582,10 @@ def run_memory_analysis(
562
 
563
  status_emoji = "✅" if budget["fits_in_gpu"] else "❌"
564
 
 
 
 
 
565
  insight = f"""### {model_name} Memory Analysis
566
 
567
  **Model Configuration (from model.config):**
@@ -578,12 +602,12 @@ def run_memory_analysis(
578
 
579
  ### Memory Breakdown at {seq_len:,} tokens (batch={batch_size})
580
 
581
- | Component | Memory |
582
- |-----------|--------|
583
- | Model Weights ({dtype}) | {breakdown['model_weights_gb']:.2f} GB |
584
- | KV Cache | {breakdown['kv_cache_gb']:.2f} GB |
585
- | Activations (est.) | {breakdown['activation_gb']:.2f} GB |
586
- | **Total** | **{breakdown['total_gb']:.2f} GB** |
587
 
588
  **GPU Utilization:** {budget['utilization_pct']:.1f}%
589
  **Headroom:** {budget['headroom_gb']:.2f} GB {status_emoji}
@@ -592,7 +616,7 @@ def run_memory_analysis(
592
 
593
  ### Maximum Context Length
594
 
595
- At batch size {batch_size} with {dtype}:
596
  - **Max context:** {max_context.get('max_context', 0):,} tokens
597
  - Available for KV cache: {max_context.get('available_for_kv_gb', 0):.2f} GB
598
  - KV cache per token: {max_context.get('bytes_per_token', 0):,} bytes
@@ -605,6 +629,7 @@ At batch size {batch_size} with {dtype}:
605
  - **GQA reduces** KV cache by {model_info['gqa_ratio']}× vs MHA
606
  - **Batch size trades off** with maximum context length
607
  - **{dtype} weights** use {breakdown['model_weights_gb']:.2f} GB
 
608
  """
609
 
610
  return budget, breakdown_chart, scaling_chart, tradeoff_chart, insight
 
129
  }
130
 
131
 
132
+ # KV Cache dtype bytes mapping
133
+ KV_DTYPE_BYTES = {
134
+ "FP16": 2,
135
+ "BF16": 2,
136
+ "FP8": 1,
137
+ "INT8": 1,
138
+ }
139
+
140
+
141
  def calculate_memory_budget(
142
  model_name: str,
143
  seq_len: int,
144
  batch_size: int = 1,
145
  dtype: str = "FP16",
146
+ kv_dtype: str = "FP16",
147
  ) -> dict:
148
  """
149
  Calculate complete memory budget using REAL model and GPU info.
 
153
  seq_len: Target sequence length
154
  batch_size: Batch size
155
  dtype: Data type for model weights
156
+ kv_dtype: Data type for KV cache (FP16, BF16, FP8, INT8)
157
 
158
  Returns:
159
  Complete memory budget breakdown
 
169
  model_weights_bytes = model_info["num_parameters"] * dtype_bytes
170
  model_weights_gb = model_weights_bytes / (1024 ** 3)
171
 
172
+ # KV cache memory - uses separate kv_dtype
173
+ kv_dtype_bytes = KV_DTYPE_BYTES.get(kv_dtype, 2)
174
  kv_cache = calculate_kv_cache_memory(
175
  num_kv_heads=model_info["num_kv_heads"],
176
  head_dim=model_info["head_dim"],
177
  num_layers=model_info["num_layers"],
178
  seq_len=seq_len,
179
  batch_size=batch_size,
180
+ dtype_bytes=kv_dtype_bytes,
181
  )
182
  kv_cache_gb = kv_cache["gb"]
183
 
 
199
  "model_info": model_info,
200
  "gpu_info": gpu_info,
201
  "dtype": dtype,
202
+ "kv_dtype": kv_dtype,
203
  "seq_len": seq_len,
204
  "batch_size": batch_size,
205
  "breakdown": {
 
219
  model_name: str,
220
  batch_size: int = 1,
221
  dtype: str = "FP16",
222
+ kv_dtype: str = "FP16",
223
  memory_reserve_pct: float = 10.0,
224
  ) -> dict:
225
  """
 
231
  model_name: Model to analyze
232
  batch_size: Batch size
233
  dtype: Data type for weights
234
+ kv_dtype: Data type for KV cache (FP16, BF16, FP8, INT8)
235
  memory_reserve_pct: Percentage to reserve for activations/overhead
236
 
237
  Returns:
 
266
 
267
  # Calculate max seq_len from available KV cache memory
268
  # KV cache bytes = 2 × num_kv_heads × head_dim × seq_len × num_layers × batch × dtype_bytes
269
+ kv_dtype_bytes = KV_DTYPE_BYTES.get(kv_dtype, 2)
270
  bytes_per_token = (
271
  2 * model_info["num_kv_heads"] * model_info["head_dim"] *
272
  model_info["num_layers"] * batch_size * kv_dtype_bytes
 
384
  return fig
385
 
386
 
387
+ def create_context_scaling_chart(model_name: str, batch_size: int = 1, dtype: str = "FP16", kv_dtype: str = "FP16") -> go.Figure:
388
  """
389
  Create chart showing memory usage vs context length.
390
 
 
397
  dtype_bytes = {"FP32": 4, "FP16": 2, "BF16": 2, "INT8": 1, "INT4": 0.5}.get(dtype, 2)
398
  model_weights_gb = (model_info["num_parameters"] * dtype_bytes) / (1024 ** 3)
399
 
400
+ # KV cache dtype bytes
401
+ kv_dtype_bytes = KV_DTYPE_BYTES.get(kv_dtype, 2)
402
+
403
  # Sequence lengths to plot
404
  seq_lengths = [512, 1024, 2048, 4096, 8192, 16384, 32768]
405
 
 
413
  num_layers=model_info["num_layers"],
414
  seq_len=seq_len,
415
  batch_size=batch_size,
416
+ dtype_bytes=kv_dtype_bytes,
417
  )
418
  kv_cache_values.append(kv["gb"])
419
  total_memory_values.append(model_weights_gb + kv["gb"])
 
458
  fig.update_layout(
459
  title=dict(
460
  text=f"Memory Scaling: {model_name}<br>"
461
+ f"<sub>batch={batch_size}, Weights: {dtype}, KV Cache: {kv_dtype}</sub>",
462
  x=0.5,
463
  ),
464
  xaxis_title="Context Length (tokens)",
 
485
  return fig
486
 
487
 
488
+ def create_batch_context_tradeoff_chart(model_name: str, dtype: str = "FP16", kv_dtype: str = "FP16") -> go.Figure:
489
  """
490
  Create chart showing batch size vs max context tradeoff.
491
 
 
506
  )
507
  return fig
508
 
509
+ batch_sizes = [1, 2, 4, 8, 16, 32, 64, 128]
510
  max_contexts = []
511
 
512
  for batch in batch_sizes:
513
+ result = calculate_max_context_length(model_name, batch_size=batch, dtype=dtype, kv_dtype=kv_dtype)
514
  max_contexts.append(result.get("max_context", 0))
515
 
516
  fig = go.Figure()
 
536
  fig.update_layout(
537
  title=dict(
538
  text=f"Batch Size vs Max Context: {model_name}<br>"
539
+ f"<sub>GPU: {gpu_info['name']}, Weights: {dtype}, KV: {kv_dtype}</sub>",
540
  x=0.5,
541
  ),
542
  xaxis_title="Batch Size",
 
557
  seq_len: int,
558
  batch_size: int = 1,
559
  dtype: str = "FP16",
560
+ kv_dtype: str = "FP16",
561
  ) -> tuple:
562
  """
563
  Run complete memory analysis for a model.
 
565
  Returns budget info, charts, and insight text.
566
  """
567
  # Calculate budget
568
+ budget = calculate_memory_budget(model_name, seq_len, batch_size, dtype, kv_dtype)
569
 
570
  # Calculate max context
571
+ max_context = calculate_max_context_length(model_name, batch_size, dtype, kv_dtype)
572
 
573
  # Create charts
574
  breakdown_chart = create_memory_breakdown_chart(budget)
575
+ scaling_chart = create_context_scaling_chart(model_name, batch_size, dtype, kv_dtype)
576
+ tradeoff_chart = create_batch_context_tradeoff_chart(model_name, dtype, kv_dtype)
577
 
578
  # Generate insight text
579
  model_info = budget["model_info"]
 
582
 
583
  status_emoji = "✅" if budget["fits_in_gpu"] else "❌"
584
 
585
+ # Calculate KV cache memory savings from quantization
586
+ kv_bytes = KV_DTYPE_BYTES.get(kv_dtype, 2)
587
+ kv_savings = (2 - kv_bytes) / 2 * 100 if kv_bytes < 2 else 0
588
+
589
  insight = f"""### {model_name} Memory Analysis
590
 
591
  **Model Configuration (from model.config):**
 
602
 
603
  ### Memory Breakdown at {seq_len:,} tokens (batch={batch_size})
604
 
605
+ | Component | Memory | Precision |
606
+ |-----------|--------|-----------|
607
+ | Model Weights | {breakdown['model_weights_gb']:.2f} GB | {dtype} |
608
+ | KV Cache | {breakdown['kv_cache_gb']:.2f} GB | {kv_dtype} |
609
+ | Activations (est.) | {breakdown['activation_gb']:.2f} GB | - |
610
+ | **Total** | **{breakdown['total_gb']:.2f} GB** | |
611
 
612
  **GPU Utilization:** {budget['utilization_pct']:.1f}%
613
  **Headroom:** {budget['headroom_gb']:.2f} GB {status_emoji}
 
616
 
617
  ### Maximum Context Length
618
 
619
+ At batch size {batch_size} with {dtype} weights, {kv_dtype} KV cache:
620
  - **Max context:** {max_context.get('max_context', 0):,} tokens
621
  - Available for KV cache: {max_context.get('available_for_kv_gb', 0):.2f} GB
622
  - KV cache per token: {max_context.get('bytes_per_token', 0):,} bytes
 
629
  - **GQA reduces** KV cache by {model_info['gqa_ratio']}× vs MHA
630
  - **Batch size trades off** with maximum context length
631
  - **{dtype} weights** use {breakdown['model_weights_gb']:.2f} GB
632
+ - **{kv_dtype} KV cache**{f" saves {kv_savings:.0f}% vs FP16" if kv_savings > 0 else " (baseline precision)"}
633
  """
634
 
635
  return budget, breakdown_chart, scaling_chart, tradeoff_chart, insight