Spaces:

prithivMLmods
/

Qwen-Image-Edit-3D-Lighting-Control

Running on Zero

App Files Files Community

prithivMLmods commited on 9 days ago

Commit

3661ca3

1 Parent(s): 1aa5d2f

✅ Temporary fixes for FA3, which is currently broken; fallback to SDPA. (#2)

Browse files

- ✅ Temporary fixes for FA3, which is currently broken; fallback to SDPA. (255c75283331ec54cbc2afa4a94afb932a549e08)

Files changed (1) hide show

qwenimage/qwen_fa3_processor.py +151 -60

qwenimage/qwen_fa3_processor.py CHANGED Viewed

@@ -1,89 +1,183 @@
 """
 Paired with a good language model. Thanks!
 """
 import torch
 from typing import Optional, Tuple
 from diffusers.models.transformers.transformer_qwenimage import apply_rotary_emb_qwen
-try:
-    from kernels import get_kernel
-    _k = get_kernel("kernels-community/vllm-flash-attn3")
-    _flash_attn_func = _k.flash_attn_func
-except Exception as e:
-    _flash_attn_func = None
-    _kernels_err = e
-def _ensure_fa3_available():
-    if _flash_attn_func is None:
-        raise ImportError(
-            "FlashAttention-3 via Hugging Face `kernels` is required. "
-            "Tried `get_kernel('kernels-community/vllm-flash-attn3')` and failed with:\n"
-            f"{_kernels_err}"
         )
-@torch.library.custom_op("flash::flash_attn_func", mutates_args=())
-def flash_attn_func(
-    q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, causal: bool = False
 ) -> torch.Tensor:
-    outputs, lse = _flash_attn_func(q, k, v, causal=causal)
-    return outputs
-@flash_attn_func.register_fake
-def _(q, k, v, **kwargs):
-    # two outputs:
-    # 1. output: (batch, seq_len, num_heads, head_dim)
-    # 2. softmax_lse: (batch, num_heads, seq_len) with dtype=torch.float32
-    meta_q = torch.empty_like(q).contiguous()
-    return meta_q #, q.new_empty((q.size(0), q.size(2), q.size(1)), dtype=torch.float32)
 class QwenDoubleStreamAttnProcessorFA3:
     """
-    FA3-based attention processor for Qwen double-stream architecture.
-    Computes joint attention over concatenated [text, image] streams using vLLM FlashAttention-3
-    accessed via Hugging Face `kernels`.
-    Notes / limitations:
-    - General attention masks are not supported here (FA3 path). `is_causal=False` and no arbitrary mask.
-    - Optional windowed attention / sink tokens / softcap can be plumbed through if you use those features.
-    - Expects an available `apply_rotary_emb_qwen` in scope (same as your non-FA3 processor).
     """
-    _attention_backend = "fa3"  # for parity with your other processors, not used internally
     def __init__(self):
-        _ensure_fa3_available()
     @torch.no_grad()
     def __call__(
         self,
-        attn,  # Attention module with to_q/to_k/to_v/add_*_proj, norms, to_out, to_add_out, and .heads
-        hidden_states: torch.FloatTensor,                 # (B, S_img, D_model)  image stream
-        encoder_hidden_states: torch.FloatTensor = None,  # (B, S_txt, D_model)  text stream
-        encoder_hidden_states_mask: torch.FloatTensor = None,  # unused in FA3 path
-        attention_mask: Optional[torch.FloatTensor] = None,    # unused in FA3 path
-        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,  # (img_freqs, txt_freqs)
     ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
-        if encoder_hidden_states is None:
-            raise ValueError("QwenDoubleStreamAttnProcessorFA3 requires encoder_hidden_states (text stream).")
-        if attention_mask is not None:
-            # FA3 kernel path here does not consume arbitrary masks; fail fast to avoid silent correctness issues.
-            raise NotImplementedError("attention_mask is not supported in this FA3 implementation.")
-        _ensure_fa3_available()
         B, S_img, _ = hidden_states.shape
         S_txt = encoder_hidden_states.shape[1]
-        # ---- QKV projections (image/sample stream) ----
-        img_q = attn.to_q(hidden_states)   # (B, S_img, D)
         img_k = attn.to_k(hidden_states)
         img_v = attn.to_v(hidden_states)
-        # ---- QKV projections (text/context stream) ----
-        txt_q = attn.add_q_proj(encoder_hidden_states)  # (B, S_txt, D)
         txt_k = attn.add_k_proj(encoder_hidden_states)
         txt_v = attn.add_v_proj(encoder_hidden_states)
@@ -97,7 +191,7 @@ class QwenDoubleStreamAttnProcessorFA3:
         txt_k = txt_k.unflatten(-1, (H, -1))
         txt_v = txt_v.unflatten(-1, (H, -1))
-        # ---- Q/K normalization (per your module contract) ----
         if getattr(attn, "norm_q", None) is not None:
             img_q = attn.norm_q(img_q)
         if getattr(attn, "norm_k", None) is not None:
@@ -110,25 +204,22 @@ class QwenDoubleStreamAttnProcessorFA3:
         # ---- RoPE (Qwen variant) ----
         if image_rotary_emb is not None:
             img_freqs, txt_freqs = image_rotary_emb
-            # expects tensors shaped (B, S, H, D_h)
             img_q = apply_rotary_emb_qwen(img_q, img_freqs, use_real=False)
             img_k = apply_rotary_emb_qwen(img_k, img_freqs, use_real=False)
             txt_q = apply_rotary_emb_qwen(txt_q, txt_freqs, use_real=False)
             txt_k = apply_rotary_emb_qwen(txt_k, txt_freqs, use_real=False)
         # ---- Joint attention over [text, image] along sequence axis ----
-        # Shapes: (B, S_total, H, D_h)
-        q = torch.cat([txt_q, img_q], dim=1)
         k = torch.cat([txt_k, img_k], dim=1)
         v = torch.cat([txt_v, img_v], dim=1)
-        # FlashAttention-3 path expects (B, S, H, D_h) and returns (out, softmax_lse)
-        out = flash_attn_func(q, k, v, causal=False)  # out: (B, S_total, H, D_h)
         # ---- Back to (B, S, D_model) ----
         out = out.flatten(2, 3).to(q.dtype)
-        # Split back to text / image segments
         txt_attn_out = out[:, :S_txt, :]
         img_attn_out = out[:, S_txt:, :]

 """
 Paired with a good language model. Thanks!
+FA3 is currently broken on Blackwell (sm_100) GPUs; this module detects that
+at import time and falls back to PyTorch scaled-dot-product attention (SDPA)
+automatically.  The public class name / call signature are unchanged.
 """
 import torch
+import torch.nn.functional as F
 from typing import Optional, Tuple
 from diffusers.models.transformers.transformer_qwenimage import apply_rotary_emb_qwen
+# ---------------------------------------------------------------------------
+# FA3 availability check
+# ---------------------------------------------------------------------------
+def _is_blackwell() -> bool:
+    """Return True when the current default CUDA device is an sm_100 (Blackwell) GPU."""
+    if not torch.cuda.is_available():
+        return False
+    cap = torch.cuda.get_device_capability()
+    # Blackwell → compute capability 10.x  (sm_100)
+    return cap[0] >= 10
+_fa3_available: bool = False
+_fa3_unavailable_reason: str = ""
+_flash_attn_func = None
+if _is_blackwell():
+    _fa3_unavailable_reason = (
+        "FlashAttention-3 is not yet supported on Blackwell (sm_100) GPUs. "
+        "Falling back to scaled-dot-product attention (SDPA)."
+    )
+else:
+    try:
+        from kernels import get_kernel
+        _k = get_kernel("kernels-community/vllm-flash-attn3")
+        _flash_attn_func = _k.flash_attn_func
+        _fa3_available = True
+    except Exception as e:
+        _fa3_unavailable_reason = (
+            "FlashAttention-3 via Hugging Face `kernels` is unavailable. "
+            f"Tried `get_kernel('kernels-community/vllm-flash-attn3')` and failed with:\n{e}\n"
+            "Falling back to scaled-dot-product attention (SDPA)."
         )
+# ---------------------------------------------------------------------------
+# FA3 custom op (registered only when the kernel loaded successfully)
+# ---------------------------------------------------------------------------
+if _fa3_available:
+    @torch.library.custom_op("flash::flash_attn_func", mutates_args=())
+    def flash_attn_func(
+        q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, causal: bool = False
+    ) -> torch.Tensor:
+        # _flash_attn_func returns (output, softmax_lse); we only need output.
+        output, _lse = _flash_attn_func(q, k, v, causal=causal)
+        return output
+    @flash_attn_func.register_fake
+    def _flash_attn_func_fake(q, k, v, causal=False):
+        # output shape mirrors q: (batch, seq_len, num_heads, head_dim)
+        return torch.empty_like(q).contiguous()
+else:
+    # Provide a stub so call-sites that import the symbol don't break at
+    # module load; the processor will route around it at runtime.
+    def flash_attn_func(
+        q: torch.Tensor, k: torch.Tensor, v: torch.Tensor, causal: bool = False
+    ) -> torch.Tensor:
+        raise RuntimeError(_fa3_unavailable_reason)
+# ---------------------------------------------------------------------------
+# SDPA fallback helper
+# ---------------------------------------------------------------------------
+def _sdpa_attention(
+    q: torch.Tensor,
+    k: torch.Tensor,
+    v: torch.Tensor,
+    causal: bool = False,
 ) -> torch.Tensor:
+    """
+    Scaled dot-product attention using torch.nn.functional.scaled_dot_product_attention.
+    Input / output layout: (B, S, H, D_h)  — same as the FA3 kernel.
+    """
+    # SDPA expects (B, H, S, D_h)
+    q = q.transpose(1, 2)
+    k = k.transpose(1, 2)
+    v = v.transpose(1, 2)
+    out = F.scaled_dot_product_attention(q, k, v, is_causal=causal)
+    # Back to (B, S, H, D_h)
+    return out.transpose(1, 2)
+# ---------------------------------------------------------------------------
+# Attention processor
+# ---------------------------------------------------------------------------
 class QwenDoubleStreamAttnProcessorFA3:
     """
+    Attention processor for the Qwen double-stream architecture.
+    Preferred backend: vLLM FlashAttention-3 via Hugging Face ``kernels``.
+    Automatic fallback: PyTorch ``scaled_dot_product_attention`` (SDPA) when
+    FA3 is unavailable — e.g. on Blackwell (sm_100) GPUs where FA3 is not yet
+    supported, or when the ``kernels`` package is absent.
+    Notes / limitations
+    -------------------
+    - Arbitrary attention masks are not supported on the FA3 path.  Pass
+      ``attention_mask=None`` (the default) to stay on the fast path.
+    - On the SDPA path, ``attention_mask`` is likewise ignored; add explicit
+      support here if you need it.
+    - ``encoder_hidden_states`` (text stream) is required.
     """
+    _attention_backend: str  # set in __init__ after capability detection
     def __init__(self):
+        if _fa3_available:
+            self._attention_backend = "fa3"
+        else:
+            import warnings
+            warnings.warn(
+                f"QwenDoubleStreamAttnProcessorFA3: {_fa3_unavailable_reason}",
+                stacklevel=2,
+            )
+            self._attention_backend = "sdpa"
+    def _attend(
+        self,
+        q: torch.Tensor,
+        k: torch.Tensor,
+        v: torch.Tensor,
+        causal: bool = False,
+    ) -> torch.Tensor:
+        """Dispatch to FA3 or SDPA depending on what is available."""
+        if self._attention_backend == "fa3":
+            return flash_attn_func(q, k, v, causal=causal)
+        return _sdpa_attention(q, k, v, causal=causal)
     @torch.no_grad()
     def __call__(
         self,
+        attn,
+        hidden_states: torch.FloatTensor,                          # (B, S_img, D_model)
+        encoder_hidden_states: torch.FloatTensor = None,           # (B, S_txt, D_model)
+        encoder_hidden_states_mask: torch.FloatTensor = None,      # unused
+        attention_mask: Optional[torch.FloatTensor] = None,        # unsupported on FA3 path
+        image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
     ) -> Tuple[torch.FloatTensor, torch.FloatTensor]:
+        if encoder_hidden_states is None:
+            raise ValueError(
+                "QwenDoubleStreamAttnProcessorFA3 requires encoder_hidden_states (text stream)."
+            )
+        if attention_mask is not None and self._attention_backend == "fa3":
+            raise NotImplementedError(
+                "attention_mask is not supported on the FA3 path. "
+                "Either drop the mask or let the processor fall back to SDPA."
+            )
         B, S_img, _ = hidden_states.shape
         S_txt = encoder_hidden_states.shape[1]
+        # ---- QKV projections ----
+        img_q = attn.to_q(hidden_states)
         img_k = attn.to_k(hidden_states)
         img_v = attn.to_v(hidden_states)
+        txt_q = attn.add_q_proj(encoder_hidden_states)
         txt_k = attn.add_k_proj(encoder_hidden_states)
         txt_v = attn.add_v_proj(encoder_hidden_states)
         txt_k = txt_k.unflatten(-1, (H, -1))
         txt_v = txt_v.unflatten(-1, (H, -1))
+        # ---- Q/K normalization ----
         if getattr(attn, "norm_q", None) is not None:
             img_q = attn.norm_q(img_q)
         if getattr(attn, "norm_k", None) is not None:
         # ---- RoPE (Qwen variant) ----
         if image_rotary_emb is not None:
             img_freqs, txt_freqs = image_rotary_emb
             img_q = apply_rotary_emb_qwen(img_q, img_freqs, use_real=False)
             img_k = apply_rotary_emb_qwen(img_k, img_freqs, use_real=False)
             txt_q = apply_rotary_emb_qwen(txt_q, txt_freqs, use_real=False)
             txt_k = apply_rotary_emb_qwen(txt_k, txt_freqs, use_real=False)
         # ---- Joint attention over [text, image] along sequence axis ----
+        q = torch.cat([txt_q, img_q], dim=1)  # (B, S_txt + S_img, H, D_h)
         k = torch.cat([txt_k, img_k], dim=1)
         v = torch.cat([txt_v, img_v], dim=1)
+        out = self._attend(q, k, v, causal=False)  # (B, S_total, H, D_h)
         # ---- Back to (B, S, D_model) ----
         out = out.flatten(2, 3).to(q.dtype)
+        # ---- Split text / image segments ----
         txt_attn_out = out[:, :S_txt, :]
         img_attn_out = out[:, S_txt:, :]