Spaces: Running on T4

fix: patch flash_attention with SDPA fallback for T4 (no flash-attn)

model_manager.py  +55 -1
model_manager.py
CHANGED
@@ -87,11 +87,65 @@ class ModelManager:
 
         print("ModelManager initialized successfully")
 
+    def _patch_attention_sdpa(self, model_name):
+        """Patch flash_attention() to include SDPA fallback for GPUs without flash-attn (e.g., T4)."""
+        import glob
+        import os
+
+        hf_cache = os.path.join(os.path.expanduser("~"), ".cache", "huggingface")
+        patterns = [
+            os.path.join(
+                hf_cache, "hub", "models--" + model_name.replace("/", "--"),
+                "snapshots", "*", "ldf_models", "tools", "attention.py",
+            ),
+            os.path.join(
+                hf_cache, "modules", "transformers_modules", model_name,
+                "*", "ldf_models", "tools", "attention.py",
+            ),
+        ]
+
+        target = '    assert q.device.type == "cuda" and q.size(-1) <= 256'
+        sdpa_fallback = target + "\n" + (
+            "\n"
+            "    # SDPA fallback when flash-attn is not available (e.g., T4 GPU)\n"
+            "    if not FLASH_ATTN_2_AVAILABLE and not FLASH_ATTN_3_AVAILABLE:\n"
+            "        if q_lens is not None or k_lens is not None:\n"
+            '            warnings.warn("Padding mask disabled with scaled_dot_product_attention")\n'
+            "        q = q.transpose(1, 2).to(dtype)\n"
+            "        k = k.transpose(1, 2).to(dtype)\n"
+            "        v = v.transpose(1, 2).to(dtype)\n"
+            "        out = torch.nn.functional.scaled_dot_product_attention(\n"
+            "            q, k, v, attn_mask=None, is_causal=causal, dropout_p=dropout_p\n"
+            "        )\n"
+            "        return out.transpose(1, 2).contiguous()\n"
+        )
+
+        for pattern in patterns:
+            for filepath in glob.glob(pattern):
+                with open(filepath, "r") as f:
+                    content = f.read()
+                if "SDPA fallback" in content:
+                    print(f"Already patched: {filepath}")
+                    continue
+                if target in content:
+                    content = content.replace(target, sdpa_fallback, 1)
+                    with open(filepath, "w") as f:
+                        f.write(content)
+                    print(f"Patched with SDPA fallback: {filepath}")
+
     def _load_models(self, model_name):
         """Load VAE and diffusion models from HF Hub"""
         torch.set_float32_matmul_precision("high")
 
-
+        # Pre-download model files to hub cache
+        print(f"Downloading model from HF Hub: {model_name}")
+        from huggingface_hub import snapshot_download
+        snapshot_download(model_name)
+
+        # Patch flash_attention with SDPA fallback for T4 (no flash-attn)
+        self._patch_attention_sdpa(model_name)
+
+        print("Loading model...")
         from transformers import AutoModel
 
         hf_model = AutoModel.from_pretrained(model_name, trust_remote_code=True)
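
For readability, here is the code the patch injects into the cached attention.py, rendered as plain Python rather than an escaped string. This is exactly the target line plus the block assembled into sdpa_fallback above; the injected code relies on names that the upstream file is assumed to already have in scope inside flash_attention() (FLASH_ATTN_2_AVAILABLE, FLASH_ATTN_3_AVAILABLE, warnings, torch, q_lens, k_lens, dtype, causal, dropout_p):

    # inside flash_attention() in the cached attention.py, after patching:
    assert q.device.type == "cuda" and q.size(-1) <= 256

    # SDPA fallback when flash-attn is not available (e.g., T4 GPU)
    if not FLASH_ATTN_2_AVAILABLE and not FLASH_ATTN_3_AVAILABLE:
        if q_lens is not None or k_lens is not None:
            warnings.warn("Padding mask disabled with scaled_dot_product_attention")
        q = q.transpose(1, 2).to(dtype)
        k = k.transpose(1, 2).to(dtype)
        v = v.transpose(1, 2).to(dtype)
        out = torch.nn.functional.scaled_dot_product_attention(
            q, k, v, attn_mask=None, is_causal=causal, dropout_p=dropout_p
        )
        return out.transpose(1, 2).contiguous()

The transposes matter: flash-attn-style kernels take tensors shaped (batch, seq, heads, dim), while torch.nn.functional.scaled_dot_product_attention expects (batch, heads, seq, dim), so the fallback converts on the way in and back on the way out.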
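
Why the T4 needs this fallback at all: flash-attn 2 ships kernels only for Ampere-class GPUs (compute capability 8.0) and newer, while the T4 is Turing (7.5), so the package is typically absent or unusable on T4 Spaces. A minimal runtime check, not part of the commit (the helper name is illustrative):

    import importlib.util

    import torch

    def flash_attn_usable() -> bool:
        # Package installed at all? (flash-attn is usually not installed on T4 Spaces.)
        if importlib.util.find_spec("flash_attn") is None:
            return False
        if not torch.cuda.is_available():
            return False
        # flash-attn 2 requires compute capability >= 8.0 (Ampere); a T4 reports (7, 5).
        major, _minor = torch.cuda.get_device_capability()
        return major >= 8

    print(flash_attn_usable())  # False on a T4, hence the patched SDPA path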
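
A design note on the glob patterns: since _load_models already calls snapshot_download, whose return value is the local snapshot directory, the first pattern could be replaced by a direct path join. A sketch, assuming attention.py sits at the same relative path used in the patterns above and with model_name as in _load_models (the trust_remote_code copy under modules/transformers_modules would still need the second pattern):

    import os

    from huggingface_hub import snapshot_download

    snap_dir = snapshot_download(model_name)  # local snapshot path under ~/.cache/huggingface/hub
    attn_path = os.path.join(snap_dir, "ldf_models", "tools", "attention.py")
    if os.path.isfile(attn_path):
        print(f"Would patch: {attn_path}")

The glob approach in the commit is the more defensive choice, though: it tolerates multiple cached revisions without assuming which snapshot snapshot_download resolved.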