a0y0346 committed on
Commit
685194e
·
1 Parent(s): 374d38b

fix: Add fallback SDPA benchmark when attention layer fails

Browse files

- Improved benchmark_attention_layer error handling with logging
- Added fallback to F.scaled_dot_product_attention using real model
dimensions when attention layer forward pass fails
- This ensures benchmarks still work with model's actual head
configuration even if layer-level benchmarking has issues

Files changed (2) hide show
  1. src/attention_utils.py +22 -10
  2. src/benchmark.py +106 -19
src/attention_utils.py CHANGED
@@ -143,6 +143,18 @@ def benchmark_attention_layer(
143
 
144
  enable_math, enable_flash, enable_mem_efficient = backend_flags[backend]
145
 
 
 
 
 
 
 
 
 
 
 
 
 
146
  try:
147
  # Warmup
148
  with torch.backends.cuda.sdp_kernel(
@@ -152,11 +164,7 @@ def benchmark_attention_layer(
152
  ):
153
  with torch.no_grad():
154
  for _ in range(warmup_iterations):
155
- _ = attention_layer(
156
- hidden_states,
157
- position_ids=position_ids,
158
- attention_mask=attention_mask,
159
- )
160
 
161
  torch.cuda.synchronize()
162
  torch.cuda.reset_peak_memory_stats()
@@ -173,11 +181,7 @@ def benchmark_attention_layer(
173
  with torch.no_grad():
174
  start.record()
175
  for _ in range(num_iterations):
176
- output = attention_layer(
177
- hidden_states,
178
- position_ids=position_ids,
179
- attention_mask=attention_mask,
180
- )
181
  end.record()
182
 
183
  torch.cuda.synchronize()
@@ -193,7 +197,10 @@ def benchmark_attention_layer(
193
  }
194
 
195
  except Exception as e:
 
196
  error_msg = str(e)
 
 
197
  # Common error: Flash attention not available on certain GPUs
198
  if "flash" in error_msg.lower() or "sm75" in error_msg.lower():
199
  return {
@@ -202,6 +209,11 @@ def benchmark_attention_layer(
202
  "status": f"unsupported: {error_msg[:80]}",
203
  "backend": backend,
204
  }
 
 
 
 
 
205
  return {
206
  "time_ms": None,
207
  "memory_mb": None,
 
143
 
144
  enable_math, enable_flash, enable_mem_efficient = backend_flags[backend]
145
 
146
+ def run_attention():
147
+ """Run attention with fallback for different call signatures."""
148
+ try:
149
+ # Try standard call with position_ids
150
+ return attention_layer(
151
+ hidden_states,
152
+ position_ids=position_ids,
153
+ )
154
+ except TypeError:
155
+ # Fallback: just hidden_states
156
+ return attention_layer(hidden_states)
157
+
158
  try:
159
  # Warmup
160
  with torch.backends.cuda.sdp_kernel(
 
164
  ):
165
  with torch.no_grad():
166
  for _ in range(warmup_iterations):
167
+ _ = run_attention()
 
 
 
 
168
 
169
  torch.cuda.synchronize()
170
  torch.cuda.reset_peak_memory_stats()
 
181
  with torch.no_grad():
182
  start.record()
183
  for _ in range(num_iterations):
184
+ output = run_attention()
 
 
 
 
185
  end.record()
186
 
187
  torch.cuda.synchronize()
 
197
  }
198
 
199
  except Exception as e:
200
+ import traceback
201
  error_msg = str(e)
202
+ tb = traceback.format_exc()
203
+
204
  # Common error: Flash attention not available on certain GPUs
205
  if "flash" in error_msg.lower() or "sm75" in error_msg.lower():
206
  return {
 
209
  "status": f"unsupported: {error_msg[:80]}",
210
  "backend": backend,
211
  }
212
+
213
+ # Log detailed error for debugging
214
+ print(f"[benchmark_attention_layer] Error for {backend}: {error_msg}")
215
+ print(f"[benchmark_attention_layer] Traceback: {tb[:500]}")
216
+
217
  return {
218
  "time_ms": None,
219
  "memory_mb": None,
src/benchmark.py CHANGED
@@ -190,41 +190,128 @@ def run_attention_benchmark(
190
  device = torch.device("cuda")
191
  dtype = torch.float16
192
 
193
- # If model_name is provided, use real model attention layer
194
  if model_name is not None and model_name in MODEL_CONFIGS:
195
  try:
196
  # Load the real HuggingFace model
197
  model = load_model(model_name)
198
 
199
- # Extract attention layer from layer 0
200
- attention_layer = extract_attention_layer(model, layer_idx=0)
201
-
202
- # Get model attention info
203
  attn_info = get_model_attention_info(model)
204
 
205
- # Create proper inputs for the attention layer
206
- hidden_states, position_ids = create_attention_inputs(
207
- model, batch_size, seq_len, device, dtype
208
- )
209
 
210
  results = {"model_name": model_name, "using_real_model": True}
211
  results["model_info"] = attn_info
212
 
213
- # Benchmark each backend using the real attention layer
214
- for backend in ["math", "flash", "mem_efficient"]:
215
- result = benchmark_attention_layer(
 
 
 
 
 
 
 
216
  attention_layer=attention_layer,
217
  hidden_states=hidden_states,
218
  position_ids=position_ids,
219
- backend=backend,
220
- num_iterations=num_iterations,
221
- warmup_iterations=warmup_iterations,
222
  )
223
- results[backend] = result
 
 
 
 
 
 
 
 
 
224
 
225
- # Clean up inputs
226
- del hidden_states, position_ids
227
- torch.cuda.empty_cache()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
228
 
229
  # Calculate speedups
230
  if results.get("math", {}).get("time_ms"):
 
190
  device = torch.device("cuda")
191
  dtype = torch.float16
192
 
193
+ # If model_name is provided, use real model dimensions for benchmarking
194
  if model_name is not None and model_name in MODEL_CONFIGS:
195
  try:
196
  # Load the real HuggingFace model
197
  model = load_model(model_name)
198
 
199
+ # Get model attention info for real dimensions
 
 
 
200
  attn_info = get_model_attention_info(model)
201
 
202
+ # Extract dimensions from real model
203
+ model_num_heads = attn_info["num_attention_heads"]
204
+ model_head_dim = attn_info["head_dim"]
 
205
 
206
  results = {"model_name": model_name, "using_real_model": True}
207
  results["model_info"] = attn_info
208
 
209
+ # First try: Use actual attention layer forward pass
210
+ attention_layer_works = False
211
+ try:
212
+ attention_layer = extract_attention_layer(model, layer_idx=0)
213
+ hidden_states, position_ids = create_attention_inputs(
214
+ model, batch_size, seq_len, device, dtype
215
+ )
216
+
217
+ # Test if attention layer works with first backend
218
+ test_result = benchmark_attention_layer(
219
  attention_layer=attention_layer,
220
  hidden_states=hidden_states,
221
  position_ids=position_ids,
222
+ backend="flash",
223
+ num_iterations=2,
224
+ warmup_iterations=1,
225
  )
226
+
227
+ if test_result.get("time_ms") is not None:
228
+ attention_layer_works = True
229
+
230
+ del hidden_states, position_ids
231
+ torch.cuda.empty_cache()
232
+
233
+ except Exception as layer_error:
234
+ print(f"[run_attention_benchmark] Attention layer extraction failed: {layer_error}")
235
+ attention_layer_works = False
236
 
237
+ if attention_layer_works:
238
+ # Use actual attention layer
239
+ hidden_states, position_ids = create_attention_inputs(
240
+ model, batch_size, seq_len, device, dtype
241
+ )
242
+
243
+ for backend in ["math", "flash", "mem_efficient"]:
244
+ result = benchmark_attention_layer(
245
+ attention_layer=attention_layer,
246
+ hidden_states=hidden_states,
247
+ position_ids=position_ids,
248
+ backend=backend,
249
+ num_iterations=num_iterations,
250
+ warmup_iterations=warmup_iterations,
251
+ )
252
+ results[backend] = result
253
+
254
+ del hidden_states, position_ids
255
+ torch.cuda.empty_cache()
256
+ else:
257
+ # Fallback: Use F.scaled_dot_product_attention with real model dimensions
258
+ print(f"[run_attention_benchmark] Falling back to SDPA with model dimensions")
259
+ results["fallback_mode"] = True
260
+
261
+ # Create Q, K, V tensors with real model dimensions
262
+ Q = torch.randn(batch_size, model_num_heads, seq_len, model_head_dim, device=device, dtype=dtype)
263
+ K = torch.randn(batch_size, model_num_heads, seq_len, model_head_dim, device=device, dtype=dtype)
264
+ V = torch.randn(batch_size, model_num_heads, seq_len, model_head_dim, device=device, dtype=dtype)
265
+
266
+ backends = [
267
+ ("math", True, False, False),
268
+ ("flash", False, True, False),
269
+ ("mem_efficient", False, False, True),
270
+ ]
271
+
272
+ for backend_name, enable_math, enable_flash, enable_mem_efficient in backends:
273
+ try:
274
+ torch.cuda.reset_peak_memory_stats()
275
+ torch.cuda.synchronize()
276
+
277
+ with torch.backends.cuda.sdp_kernel(
278
+ enable_flash=enable_flash,
279
+ enable_math=enable_math,
280
+ enable_mem_efficient=enable_mem_efficient
281
+ ):
282
+ # Warmup
283
+ for _ in range(warmup_iterations):
284
+ _ = F.scaled_dot_product_attention(Q, K, V)
285
+ torch.cuda.synchronize()
286
+
287
+ # Timed runs
288
+ start = torch.cuda.Event(enable_timing=True)
289
+ end = torch.cuda.Event(enable_timing=True)
290
+
291
+ start.record()
292
+ for _ in range(num_iterations):
293
+ _ = F.scaled_dot_product_attention(Q, K, V)
294
+ end.record()
295
+ torch.cuda.synchronize()
296
+
297
+ time_ms = start.elapsed_time(end) / num_iterations
298
+ memory_mb = torch.cuda.max_memory_allocated() / 1e6
299
+
300
+ results[backend_name] = {
301
+ "time_ms": round(time_ms, 3),
302
+ "memory_mb": round(memory_mb, 1),
303
+ "status": "success"
304
+ }
305
+
306
+ except Exception as e:
307
+ results[backend_name] = {
308
+ "time_ms": None,
309
+ "memory_mb": None,
310
+ "status": f"error: {str(e)[:50]}"
311
+ }
312
+
313
+ del Q, K, V
314
+ torch.cuda.empty_cache()
315
 
316
  # Calculate speedups
317
  if results.get("math", {}).get("time_ms"):