a0y0346 commited on
Commit
af9b854
·
1 Parent(s): c30936f

Refactor benchmarks to use real model.config values

Browse files

- Add get_real_model_config() to extract config from model.config
- Refactor run_prefill_benchmark to use F.scaled_dot_product_attention
- Refactor run_decode_benchmark with proper KV cache and GQA support
- Update create_kv_cache_chart to use model.config (no constants)
- All config values now come from actual loaded models

Files changed (1) hide show
  1. src/prefill_decode.py +334 -88
src/prefill_decode.py CHANGED
@@ -24,6 +24,38 @@ from .attention_utils import (
24
  )
25
 
26
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
27
  def run_prefill_with_real_model(
28
  model,
29
  attention_layer,
@@ -86,6 +118,110 @@ def run_prefill_with_real_model(
86
  return result
87
 
88
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  def run_decode_with_real_model(
90
  model,
91
  attention_layer,
@@ -193,6 +329,128 @@ def run_decode_with_real_model(
193
  }
194
 
195
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
196
  # Legacy function kept for backwards compatibility
197
  def simulate_prefill_attention(
198
  batch_size: int,
@@ -347,95 +605,59 @@ def run_prefill_decode_comparison(
347
  """
348
  Run full comparison between prefill and decode phases using REAL HuggingFace model.
349
 
350
- Loads the actual model, extracts the attention layer, and benchmarks
351
- real attention operations for both prefill and decode phases.
352
 
353
  Returns results dict, comparison chart, KV cache chart, and insight text.
354
  """
355
  if model_name not in MODEL_CONFIGS:
356
  return {"error": f"Unknown model: {model_name}"}, None, None, "Error: Unknown model"
357
 
358
- config = MODEL_CONFIGS[model_name]
 
 
 
 
359
 
360
  results = {
361
  "model": model_name,
362
  "context_length": context_length,
363
  "decode_tokens": decode_tokens,
364
- "config": config,
365
- "using_real_model": True,
366
  }
367
 
368
- try:
369
- # Load the REAL HuggingFace model
370
- model = load_model(model_name)
371
-
372
- # Extract attention layer from layer 0
373
- attention_layer = extract_attention_layer(model, layer_idx=0)
374
-
375
- # Get model attention info
376
- attn_info = get_model_attention_info(model)
377
- results["model_info"] = attn_info
378
-
379
- # Run prefill benchmarks with REAL model attention
380
- prefill_flash = run_prefill_with_real_model(
381
- model=model,
382
- attention_layer=attention_layer,
383
- seq_len=context_length,
384
- batch_size=1,
385
- use_flash=True,
386
- )
387
-
388
- prefill_math = run_prefill_with_real_model(
389
- model=model,
390
- attention_layer=attention_layer,
391
- seq_len=context_length,
392
- batch_size=1,
393
- use_flash=False,
394
- )
395
-
396
- # Run decode benchmarks with REAL model attention
397
- decode_flash = run_decode_with_real_model(
398
- model=model,
399
- attention_layer=attention_layer,
400
- kv_cache_len=context_length,
401
- num_tokens=decode_tokens,
402
- batch_size=1,
403
- use_flash=True,
404
- )
405
-
406
- decode_math = run_decode_with_real_model(
407
- model=model,
408
- attention_layer=attention_layer,
409
- kv_cache_len=context_length,
410
- num_tokens=decode_tokens,
411
- batch_size=1,
412
- use_flash=False,
413
- )
414
-
415
- except Exception as e:
416
- # Fallback to legacy mode if model loading fails
417
- results["using_real_model"] = False
418
- results["fallback_reason"] = str(e)[:100]
419
-
420
- num_heads = config["q_heads"]
421
- head_dim = config["head_dim"]
422
-
423
- prefill_flash = simulate_prefill_attention(
424
- batch_size=1, num_heads=num_heads, seq_len=context_length,
425
- head_dim=head_dim, use_flash=True,
426
- )
427
- prefill_math = simulate_prefill_attention(
428
- batch_size=1, num_heads=num_heads, seq_len=context_length,
429
- head_dim=head_dim, use_flash=False,
430
- )
431
- decode_flash = simulate_decode_attention(
432
- batch_size=1, num_heads=num_heads, kv_cache_len=context_length,
433
- head_dim=head_dim, num_tokens=decode_tokens, use_flash=True,
434
- )
435
- decode_math = simulate_decode_attention(
436
- batch_size=1, num_heads=num_heads, kv_cache_len=context_length,
437
- head_dim=head_dim, num_tokens=decode_tokens, use_flash=False,
438
- )
439
 
440
  results["prefill"] = {
441
  "flash": prefill_flash,
@@ -446,18 +668,27 @@ def run_prefill_decode_comparison(
446
  "math": decode_math,
447
  }
448
 
 
 
 
 
 
 
 
 
 
449
  # Create comparison chart
450
  comparison_chart = create_comparison_chart(results)
451
 
452
- # Create KV cache growth chart
453
- kv_cache_chart = create_kv_cache_chart(config, context_length, decode_tokens)
454
 
455
  # Generate insight
456
  insight = generate_phase_insight(results)
457
 
458
  # Add real model indicator to insight
459
- if results.get("using_real_model"):
460
- model_indicator = f"\n\n---\n\n*Benchmarked using real **{model_name}** attention layer ({attn_info['num_attention_heads']} heads, {attn_info['head_dim']}d)*"
461
  insight = insight + model_indicator
462
 
463
  return results, comparison_chart, kv_cache_chart, insight
@@ -558,16 +789,30 @@ def create_comparison_chart(results: dict) -> go.Figure:
558
  return fig
559
 
560
 
561
- def create_kv_cache_chart(config: dict, context_length: int, decode_tokens: int) -> go.Figure:
562
- """Create chart showing KV cache growth during generation."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
563
 
564
- kv_heads = config["kv_heads"]
565
- head_dim = config["head_dim"]
566
- num_layers = config["layers"]
567
 
568
  # Calculate KV cache size at each step
569
- # KV cache per layer: 2 (K+V) × kv_heads × seq_len × head_dim × 2 (FP16 bytes)
570
- bytes_per_token_per_layer = 2 * kv_heads * head_dim * 2
571
  total_bytes_per_token = bytes_per_token_per_layer * num_layers
572
 
573
  # Generate sequence of token counts
@@ -620,7 +865,7 @@ def create_kv_cache_chart(config: dict, context_length: int, decode_tokens: int)
620
 
621
  fig.update_layout(
622
  title=dict(
623
- text=f"KV Cache Growth ({config.get('kv_heads', 'N/A')} KV heads × {num_layers} layers)",
624
  x=0.5,
625
  ),
626
  xaxis_title="Tokens Processed",
@@ -634,6 +879,7 @@ def create_kv_cache_chart(config: dict, context_length: int, decode_tokens: int)
634
  xanchor="center",
635
  x=0.5,
636
  ),
 
637
  )
638
 
639
  return fig
 
24
  )
25
 
26
 
27
def get_real_model_config(model_name: str) -> dict:
    """
    Load model and extract ACTUAL config values from model.config.

    This function ensures we use real model architecture values,
    NOT hardcoded constants from MODEL_CONFIGS.

    Results are memoized per model name: the benchmark helpers and the
    KV-cache chart each call this, and reloading the full model every time
    just to read its config is very expensive.

    Args:
        model_name: Key from MODEL_CONFIGS (e.g., "SmolLM2-360M")

    Returns:
        Dict with real model configuration values. A fresh copy is returned
        on every call so callers may mutate it without corrupting the cache.
    """
    # Lazily-created memo table stored on the function object itself,
    # so this block needs no module-level state.
    cache = getattr(get_real_model_config, "_cache", None)
    if cache is None:
        cache = {}
        get_real_model_config._cache = cache

    if model_name not in cache:
        model = load_model(model_name)
        config = model.config

        # Extract values directly from model.config
        num_heads = config.num_attention_heads
        # Models without GQA don't define num_key_value_heads; fall back to MHA.
        num_kv_heads = getattr(config, 'num_key_value_heads', num_heads)
        head_dim = config.hidden_size // num_heads

        cache[model_name] = {
            "num_layers": config.num_hidden_layers,
            "num_heads": num_heads,
            "num_kv_heads": num_kv_heads,
            "head_dim": head_dim,
            "hidden_size": config.hidden_size,
            "model_type": getattr(config, 'model_type', 'unknown'),
            "gqa_ratio": num_heads // num_kv_heads if num_kv_heads > 0 else 1,
        }

    return dict(cache[model_name])
57
+
58
+
59
  def run_prefill_with_real_model(
60
  model,
61
  attention_layer,
 
118
  return result
119
 
120
 
121
def run_prefill_benchmark(
    model_name: str,
    seq_len: int,
    batch_size: int = 1,
    num_iterations: int = 10,
    use_flash: bool = True,
) -> dict:
    """
    Benchmark the prefill phase with F.scaled_dot_product_attention, sized
    by the model's REAL configuration.

    The model named by ``model_name`` is loaded only to read its true head
    count and head dimension (via get_real_model_config); random Q/K/V
    tensors of that shape are then pushed through SDPA directly, which is
    more reliable than invoking an attention layer's forward().

    Args:
        model_name: Key from MODEL_CONFIGS (model will be loaded to get real config)
        seq_len: Sequence length (prompt tokens)
        batch_size: Batch size
        num_iterations: Number of timed iterations
        use_flash: Whether to use FlashAttention backend

    Returns:
        Dict with time_ms, memory_mb, and status; an error dict when CUDA
        is missing or any step fails.
    """
    # Guard clause: this benchmark is meaningless without a GPU.
    if not torch.cuda.is_available():
        return {"time_ms": 0, "memory_mb": 0, "status": "error: CUDA not available"}

    device = torch.device("cuda")
    dtype = torch.float16

    try:
        # Pull the real architecture numbers from the loaded model's config.
        real_config = get_real_model_config(model_name)
        num_heads = real_config["num_heads"]
        head_dim = real_config["head_dim"]

        # Random activations shaped [batch, num_heads, seq_len, head_dim].
        shape = (batch_size, num_heads, seq_len, head_dim)
        query, key, value = (
            torch.randn(shape, dtype=dtype, device=device) for _ in range(3)
        )

        # Exactly one backend is enabled per run: flash XOR math.
        kernel_flags = dict(
            enable_flash=use_flash,
            enable_math=not use_flash,
            enable_mem_efficient=False,
        )

        def _sdpa_once():
            # NOTE(review): torch.backends.cuda.sdp_kernel is deprecated in
            # newer PyTorch in favor of torch.nn.attention.sdpa_kernel.
            with torch.backends.cuda.sdp_kernel(**kernel_flags):
                return F.scaled_dot_product_attention(query, key, value, is_causal=True)

        # Warmup runs so kernel selection/compilation isn't timed.
        for _ in range(3):
            _sdpa_once()

        torch.cuda.synchronize()
        torch.cuda.reset_peak_memory_stats()

        # Time the hot loop with CUDA events (GPU-side timestamps).
        start_evt = torch.cuda.Event(enable_timing=True)
        end_evt = torch.cuda.Event(enable_timing=True)

        start_evt.record()
        for _ in range(num_iterations):
            output = _sdpa_once()
        end_evt.record()

        torch.cuda.synchronize()

        per_iter_ms = start_evt.elapsed_time(end_evt) / num_iterations
        peak_mb = torch.cuda.max_memory_allocated() / (1024 * 1024)

        # Free the large tensors before returning.
        del query, key, value, output
        torch.cuda.empty_cache()

        return {
            "time_ms": round(per_iter_ms, 3),
            "memory_mb": round(peak_mb, 1),
            "seq_len": seq_len,
            "phase": "prefill",
            "backend": "flash" if use_flash else "math",
            "num_heads": num_heads,
            "head_dim": head_dim,
            "status": "success",
            "using_real_config": True,
        }

    except Exception as e:
        return {
            "time_ms": 0,
            "memory_mb": 0,
            "status": f"error: {str(e)[:100]}",
            "phase": "prefill",
        }
223
+
224
+
225
  def run_decode_with_real_model(
226
  model,
227
  attention_layer,
 
329
  }
330
 
331
 
332
def run_decode_benchmark(
    model_name: str,
    kv_cache_len: int,
    num_tokens: int = 10,
    batch_size: int = 1,
    num_iterations: int = 5,
    use_flash: bool = True,
) -> dict:
    """
    Benchmark the decode phase with F.scaled_dot_product_attention, sized
    by the model's REAL configuration.

    Decode is simulated faithfully:
      * a single query token (Q with seq_len=1) attends over
      * a KV cache of ``kv_cache_len`` tokens, and
      * GQA models have their KV heads expanded to match the Q head count.

    The cache size stays fixed across steps — this measures attention cost
    at a given context length, not cache growth.

    Args:
        model_name: Key from MODEL_CONFIGS (model will be loaded to get real config)
        kv_cache_len: Length of KV cache (context length)
        num_tokens: Number of decode tokens to simulate
        batch_size: Batch size
        num_iterations: Iterations for timing
        use_flash: Whether to use FlashAttention backend

    Returns:
        Dict with time_ms_per_token, memory_mb, and status; an error dict
        when CUDA is missing or any step fails.
    """
    # Guard clause: nothing to measure without a GPU.
    if not torch.cuda.is_available():
        return {"time_ms_per_token": 0, "memory_mb": 0, "status": "error: CUDA not available"}

    device = torch.device("cuda")
    dtype = torch.float16

    try:
        # Real architecture numbers from the loaded model's config.
        cfg = get_real_model_config(model_name)
        n_q_heads = cfg["num_heads"]
        n_kv_heads = cfg["num_kv_heads"]
        dim = cfg["head_dim"]

        def _rand(heads, length):
            return torch.randn(batch_size, heads, length, dim, dtype=dtype, device=device)

        # One new query token: [batch, num_heads, 1, head_dim].
        Q = _rand(n_q_heads, 1)
        # Cached keys/values: [batch, num_kv_heads, kv_cache_len, head_dim].
        K_cache = _rand(n_kv_heads, kv_cache_len)
        V_cache = _rand(n_kv_heads, kv_cache_len)

        # GQA: replicate each KV head so K/V match Q's head count.
        if n_kv_heads < n_q_heads:
            group = n_q_heads // n_kv_heads
            K_cache = K_cache.repeat_interleave(group, dim=1)
            V_cache = V_cache.repeat_interleave(group, dim=1)

        # Exactly one SDPA backend enabled per run: flash XOR math.
        kernel_flags = dict(
            enable_flash=use_flash,
            enable_math=not use_flash,
            enable_mem_efficient=False,
        )

        def _attend():
            # No causal mask needed: a lone query may see the whole cache.
            with torch.backends.cuda.sdp_kernel(**kernel_flags):
                return F.scaled_dot_product_attention(Q, K_cache, V_cache)

        # Warmup so kernel selection isn't timed.
        for _ in range(3):
            _attend()

        torch.cuda.synchronize()
        torch.cuda.reset_peak_memory_stats()

        start_evt = torch.cuda.Event(enable_timing=True)
        end_evt = torch.cuda.Event(enable_timing=True)

        # Each step mimics generating one token against the fixed cache.
        total_steps = num_tokens * num_iterations
        start_evt.record()
        for _ in range(total_steps):
            output = _attend()
        end_evt.record()

        torch.cuda.synchronize()

        total_time_ms = start_evt.elapsed_time(end_evt)
        time_per_token_ms = total_time_ms / total_steps
        memory_mb = torch.cuda.max_memory_allocated() / (1024 * 1024)

        # Free the large tensors before returning.
        del Q, K_cache, V_cache, output
        torch.cuda.empty_cache()

        return {
            "time_ms_per_token": round(time_per_token_ms, 4),
            "total_time_ms": round(total_time_ms / num_iterations, 3),
            "memory_mb": round(memory_mb, 1),
            "kv_cache_len": kv_cache_len,
            "num_tokens": num_tokens,
            "phase": "decode",
            "backend": "flash" if use_flash else "math",
            "num_heads": n_q_heads,
            "num_kv_heads": n_kv_heads,
            "head_dim": dim,
            "status": "success",
            "using_real_config": True,
        }

    except Exception as e:
        return {
            "time_ms_per_token": 0,
            "total_time_ms": 0,
            "memory_mb": 0,
            "kv_cache_len": kv_cache_len,
            "num_tokens": num_tokens,
            "phase": "decode",
            "status": f"error: {str(e)[:100]}",
        }
+ }
452
+
453
+
454
  # Legacy function kept for backwards compatibility
455
  def simulate_prefill_attention(
456
  batch_size: int,
 
605
  """
606
  Run full comparison between prefill and decode phases using REAL HuggingFace model.
607
 
608
+ Uses F.scaled_dot_product_attention with real model dimensions for reliable benchmarking.
609
+ All config values come from model.config, not constants.
610
 
611
  Returns results dict, comparison chart, KV cache chart, and insight text.
612
  """
613
  if model_name not in MODEL_CONFIGS:
614
  return {"error": f"Unknown model: {model_name}"}, None, None, "Error: Unknown model"
615
 
616
+ # Get REAL config from model.config (not constants)
617
+ try:
618
+ real_config = get_real_model_config(model_name)
619
+ except Exception as e:
620
+ return {"error": f"Failed to load model: {str(e)[:50]}"}, None, None, f"Error: {str(e)[:50]}"
621
 
622
  results = {
623
  "model": model_name,
624
  "context_length": context_length,
625
  "decode_tokens": decode_tokens,
626
+ "real_config": real_config,
627
+ "using_real_config": True,
628
  }
629
 
630
+ # Run prefill benchmarks using SDPA with REAL model dimensions
631
+ prefill_flash = run_prefill_benchmark(
632
+ model_name=model_name,
633
+ seq_len=context_length,
634
+ batch_size=1,
635
+ use_flash=True,
636
+ )
637
+
638
+ prefill_math = run_prefill_benchmark(
639
+ model_name=model_name,
640
+ seq_len=context_length,
641
+ batch_size=1,
642
+ use_flash=False,
643
+ )
644
+
645
+ # Run decode benchmarks using SDPA with proper KV cache simulation
646
+ decode_flash = run_decode_benchmark(
647
+ model_name=model_name,
648
+ kv_cache_len=context_length,
649
+ num_tokens=decode_tokens,
650
+ batch_size=1,
651
+ use_flash=True,
652
+ )
653
+
654
+ decode_math = run_decode_benchmark(
655
+ model_name=model_name,
656
+ kv_cache_len=context_length,
657
+ num_tokens=decode_tokens,
658
+ batch_size=1,
659
+ use_flash=False,
660
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
661
 
662
  results["prefill"] = {
663
  "flash": prefill_flash,
 
668
  "math": decode_math,
669
  }
670
 
671
+ # Add model info for display
672
+ results["model_info"] = {
673
+ "num_heads": real_config["num_heads"],
674
+ "num_kv_heads": real_config["num_kv_heads"],
675
+ "head_dim": real_config["head_dim"],
676
+ "num_layers": real_config["num_layers"],
677
+ "gqa_ratio": real_config["gqa_ratio"],
678
+ }
679
+
680
  # Create comparison chart
681
  comparison_chart = create_comparison_chart(results)
682
 
683
+ # Create KV cache growth chart using REAL model config
684
+ kv_cache_chart = create_kv_cache_chart(model_name, context_length, decode_tokens)
685
 
686
  # Generate insight
687
  insight = generate_phase_insight(results)
688
 
689
  # Add real model indicator to insight
690
+ if results.get("using_real_config"):
691
+ model_indicator = f"\n\n---\n\n*Benchmarked using real **{model_name}** config ({real_config['num_heads']} heads, {real_config['head_dim']}d, GQA {real_config['gqa_ratio']}:1)*"
692
  insight = insight + model_indicator
693
 
694
  return results, comparison_chart, kv_cache_chart, insight
 
789
  return fig
790
 
791
 
792
+ def create_kv_cache_chart(model_name: str, context_length: int, decode_tokens: int) -> go.Figure:
793
+ """
794
+ Create chart showing KV cache growth during generation.
795
+
796
+ Uses REAL model config values from model.config, not constants.
797
+
798
+ Args:
799
+ model_name: Model name to load config from
800
+ context_length: Number of context tokens (prefill)
801
+ decode_tokens: Number of decode tokens to generate
802
+
803
+ Returns:
804
+ Plotly figure showing KV cache growth
805
+ """
806
+ # Get REAL config from loaded model (no constants!)
807
+ real_config = get_real_model_config(model_name)
808
 
809
+ num_kv_heads = real_config["num_kv_heads"]
810
+ head_dim = real_config["head_dim"]
811
+ num_layers = real_config["num_layers"]
812
 
813
  # Calculate KV cache size at each step
814
+ # KV cache per layer: 2 (K+V) × kv_heads × head_dim × 2 (FP16 bytes)
815
+ bytes_per_token_per_layer = 2 * num_kv_heads * head_dim * 2
816
  total_bytes_per_token = bytes_per_token_per_layer * num_layers
817
 
818
  # Generate sequence of token counts
 
865
 
866
  fig.update_layout(
867
  title=dict(
868
+ text=f"KV Cache Growth ({num_kv_heads} KV heads × {num_layers} layers)",
869
  x=0.5,
870
  ),
871
  xaxis_title="Tokens Processed",
 
879
  xanchor="center",
880
  x=0.5,
881
  ),
882
+ yaxis=dict(rangemode='tozero'),
883
  )
884
 
885
  return fig