Hugging Face Space (hardware: T4 GPU)
Commit: "fix: cast SDPA output back to original dtype + idempotent patching"
Files changed: model_manager.py (+13, −4)
|
Before (old side of the diff, model_manager.py lines 104–134). NOTE: the page
extraction collapsed runs of spaces inside the quoted string literals, so the
indentation shown inside the strings is not the original; lines whose content
was lost entirely are marked.

@@ -104,11 +104,18 @@ class ModelManager:
 104 |             ),
 105 |         ]
 106 |
 107 | -        [removed line — content not captured by the extraction]
 108 | -        [removed line — content not captured by the extraction]
 109 |             "\n"
 110 |             " # SDPA fallback when flash-attn is not available (e.g., T4 GPU)\n"
 111 |             " if not FLASH_ATTN_2_AVAILABLE and not FLASH_ATTN_3_AVAILABLE:\n"
 112 |             " if q_lens is not None or k_lens is not None:\n"
 113 |             ' warnings.warn("Padding mask disabled with scaled_dot_product_attention")\n'
 114 |             " q = q.transpose(1, 2).to(dtype)\n"

@@ -117,7 +124,9 @@ class ModelManager:
 117 |             " out = torch.nn.functional.scaled_dot_product_attention(\n"
 118 |             " q, k, v, attn_mask=None, is_causal=causal, dropout_p=dropout_p\n"
 119 |             " )\n"
 120 | -            " return out.transpose(1, 2).contiguous()\n"
 121 |         )
 122 |
 123 |         for pattern in patterns:

@@ -128,7 +137,7 @@ class ModelManager:
 128 |                 print(f"Already patched: {filepath}")
 129 |                 continue
 130 |             if target in content:
 131 | -                content = content.replace(target,  [rest of line not captured by the extraction]
 132 |             with open(filepath, "w") as f:
 133 |                 f.write(content)
 134 |             print(f"Patched with SDPA fallback: {filepath}")
|
|
|
After (new side of the diff, model_manager.py lines 104–143). NOTE: the page
extraction collapsed runs of spaces inside the quoted string literals, so the
indentation shown inside the strings is not the original.

@@ -104,11 +104,18 @@ class ModelManager:
 104 |             ),
 105 |         ]
 106 |
 107 | +        # Use the assert + next line as target to ensure idempotent patching
 108 | +        target = (
 109 | +            ' assert q.device.type == "cuda" and q.size(-1) <= 256\n'
 110 | +            "\n"
 111 | +            " # params\n"
 112 | +        )
 113 | +        replacement = (
 114 | +            ' assert q.device.type == "cuda" and q.size(-1) <= 256\n'
 115 |             "\n"
 116 |             " # SDPA fallback when flash-attn is not available (e.g., T4 GPU)\n"
 117 |             " if not FLASH_ATTN_2_AVAILABLE and not FLASH_ATTN_3_AVAILABLE:\n"
 118 | +            " out_dtype = q.dtype\n"
 119 |             " if q_lens is not None or k_lens is not None:\n"
 120 |             ' warnings.warn("Padding mask disabled with scaled_dot_product_attention")\n'
 121 |             " q = q.transpose(1, 2).to(dtype)\n"

@@ -117,7 +124,9 @@ class ModelManager:
 124 |             " out = torch.nn.functional.scaled_dot_product_attention(\n"
 125 |             " q, k, v, attn_mask=None, is_causal=causal, dropout_p=dropout_p\n"
 126 |             " )\n"
 127 | +            " return out.transpose(1, 2).contiguous().to(out_dtype)\n"
 128 | +            "\n"
 129 | +            " # params\n"
 130 |         )
 131 |
 132 |         for pattern in patterns:

@@ -128,7 +137,7 @@ class ModelManager:
 137 |                 print(f"Already patched: {filepath}")
 138 |                 continue
 139 |             if target in content:
 140 | +                content = content.replace(target, replacement, 1)
 141 |             with open(filepath, "w") as f:
 142 |                 f.write(content)
 143 |             print(f"Patched with SDPA fallback: {filepath}")