Forrest Wargo committed on
Commit ·
cde6e20
1
Parent(s): 98e1622
Fallback to eager attention when FlashAttention2 is unavailable
Browse files
- handler.py +16 -5
handler.py
CHANGED
|
@@ -60,11 +60,22 @@ class EndpointHandler:
|
|
| 60 |
if hub_token:
|
| 61 |
load_kwargs["token"] = hub_token
|
| 62 |
|
| 63 |
-
|
| 64 |
-
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 68 |
self.processor = AutoProcessor.from_pretrained(
|
| 69 |
model_id, trust_remote_code=True, token=hub_token
|
| 70 |
)
|
|
|
|
| 60 |
if hub_token:
|
| 61 |
load_kwargs["token"] = hub_token
|
| 62 |
|
| 63 |
+
# Prefer FA2 when available; gracefully fall back if not installed on the image
|
| 64 |
+
try:
|
| 65 |
+
self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 66 |
+
model_id,
|
| 67 |
+
attn_implementation="flash_attention_2",
|
| 68 |
+
**load_kwargs,
|
| 69 |
+
)
|
| 70 |
+
except Exception as e:
|
| 71 |
+
try:
|
| 72 |
+
print(f"[gta1-endpoint] FlashAttention2 unavailable, falling back to eager. Reason: {e}")
|
| 73 |
+
except Exception:
|
| 74 |
+
pass
|
| 75 |
+
self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
|
| 76 |
+
model_id,
|
| 77 |
+
**load_kwargs,
|
| 78 |
+
)
|
| 79 |
self.processor = AutoProcessor.from_pretrained(
|
| 80 |
model_id, trust_remote_code=True, token=hub_token
|
| 81 |
)
|