Spaces:
Running on Zero
Running on Zero
Add Flash Attention 3 support (optional)
- Modified src/flux/math.py to support FA3 when USE_FA3=1
- Uses kernels library to load vllm-flash-attn3 from HuggingFace
- Registered as custom op for torch.export compatibility
- Falls back to PyTorch SDPA (FA2) when FA3 not available
- Added kernels to requirements.txt
To enable FA3: export USE_FA3=1
- requirements.txt +3 -0
- src/flux/math.py +45 -2
requirements.txt
CHANGED
|
@@ -30,3 +30,6 @@ pypinyin
|
|
| 30 |
# Web UI (spaces handles torch 2.8+ AOT compilation)
|
| 31 |
gradio>=5.0
|
| 32 |
spaces>=0.47.0
|
|
|
|
|
|
|
|
|
|
|
|
| 30 |
# Web UI (spaces handles torch 2.8+ AOT compilation)
|
| 31 |
gradio>=5.0
|
| 32 |
spaces>=0.47.0
|
| 33 |
+
|
| 34 |
+
# Flash Attention 3 support (optional, for H100 GPUs)
|
| 35 |
+
kernels
|
src/flux/math.py
CHANGED
|
@@ -1,13 +1,56 @@
|
|
| 1 |
import torch
|
| 2 |
from einops import rearrange
|
| 3 |
from torch import Tensor
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 4 |
|
| 5 |
|
| 6 |
def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
|
| 7 |
q, k = apply_rope(q, k, pe)
|
| 8 |
|
| 9 |
-
|
| 10 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 11 |
|
| 12 |
return x
|
| 13 |
|
|
|
|
| 1 |
import torch
|
| 2 |
from einops import rearrange
|
| 3 |
from torch import Tensor
|
| 4 |
+
from typing import Optional, List
|
| 5 |
+
import os
|
| 6 |
+
|
| 7 |
+
# ============================================================
|
| 8 |
+
# Flash Attention 3 Support (for H100 GPUs)
|
| 9 |
+
# ============================================================
|
| 10 |
+
_USE_FA3 = os.environ.get("USE_FA3", "0") == "1"
|
| 11 |
+
_flash_attn_func = None
|
| 12 |
+
|
| 13 |
+
if _USE_FA3:
|
| 14 |
+
try:
|
| 15 |
+
from kernels import get_kernel
|
| 16 |
+
_fa3_kernel = get_kernel("kernels-community/vllm-flash-attn3")
|
| 17 |
+
_flash_attn_func_raw = _fa3_kernel.flash_attn_func
|
| 18 |
+
|
| 19 |
+
@torch.library.custom_op("flash::flash_attn_func", mutates_args=())
|
| 20 |
+
def _flash_attn_func(
|
| 21 |
+
q: torch.Tensor,
|
| 22 |
+
k: torch.Tensor,
|
| 23 |
+
v: torch.Tensor,
|
| 24 |
+
softmax_scale: Optional[float] = None,
|
| 25 |
+
causal: bool = False,
|
| 26 |
+
) -> torch.Tensor:
|
| 27 |
+
outputs = _flash_attn_func_raw(q, k, v, softmax_scale=softmax_scale, causal=causal)
|
| 28 |
+
return outputs[0]
|
| 29 |
+
|
| 30 |
+
@_flash_attn_func.register_fake
|
| 31 |
+
def _(q, k, v, **kwargs):
|
| 32 |
+
return torch.empty_like(q).contiguous()
|
| 33 |
+
|
| 34 |
+
print("✓ Flash Attention 3 loaded successfully!")
|
| 35 |
+
except Exception as e:
|
| 36 |
+
print(f"Flash Attention 3 not available: {e}")
|
| 37 |
+
_USE_FA3 = False
|
| 38 |
|
| 39 |
|
| 40 |
def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
|
| 41 |
q, k = apply_rope(q, k, pe)
|
| 42 |
|
| 43 |
+
if _USE_FA3 and _flash_attn_func is not None:
|
| 44 |
+
# FA3 expects (B, L, H, D) format
|
| 45 |
+
q_fa3 = rearrange(q, "B H L D -> B L H D")
|
| 46 |
+
k_fa3 = rearrange(k, "B H L D -> B L H D")
|
| 47 |
+
v_fa3 = rearrange(v, "B H L D -> B L H D")
|
| 48 |
+
x = _flash_attn_func(q_fa3, k_fa3, v_fa3)
|
| 49 |
+
x = rearrange(x, "B L H D -> B L (H D)")
|
| 50 |
+
else:
|
| 51 |
+
# Standard PyTorch SDPA (uses FA2 if available)
|
| 52 |
+
x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
|
| 53 |
+
x = rearrange(x, "B H L D -> B L (H D)")
|
| 54 |
|
| 55 |
return x
|
| 56 |
|