Forrest Wargo committed on
Commit
cde6e20
·
1 Parent(s): 98e1622

Fallback to eager attention when FlashAttention2 is unavailable

Browse files
Files changed (1) hide show
  1. handler.py +16 -5
handler.py CHANGED
@@ -60,11 +60,22 @@ class EndpointHandler:
60
  if hub_token:
61
  load_kwargs["token"] = hub_token
62
 
63
- self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
64
- model_id,
65
- attn_implementation="flash_attention_2",
66
- **load_kwargs,
67
- )
 
 
 
 
 
 
 
 
 
 
 
68
  self.processor = AutoProcessor.from_pretrained(
69
  model_id, trust_remote_code=True, token=hub_token
70
  )
 
60
  if hub_token:
61
  load_kwargs["token"] = hub_token
62
 
63
+ # Prefer FA2 when available; gracefully fall back if not installed on the image
64
+ try:
65
+ self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
66
+ model_id,
67
+ attn_implementation="flash_attention_2",
68
+ **load_kwargs,
69
+ )
70
+ except Exception as e:
71
+ try:
72
+ print(f"[gta1-endpoint] FlashAttention2 unavailable, falling back to eager. Reason: {e}")
73
+ except Exception:
74
+ pass
75
+ self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
76
+ model_id,
77
+ **load_kwargs,
78
+ )
79
  self.processor = AutoProcessor.from_pretrained(
80
  model_id, trust_remote_code=True, token=hub_token
81
  )