Forrest Wargo committed on
Commit
cde6e20
·
1 Parent(s): 98e1622

Fallback to eager attention when FlashAttention2 is unavailable

Browse files
Files changed (1) hide show
  1. handler.py +16 -5
handler.py CHANGED
@@ -60,11 +60,22 @@ class EndpointHandler:
60
  if hub_token:
61
  load_kwargs["token"] = hub_token
62
 
63
- self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
64
- model_id,
65
- attn_implementation="flash_attention_2",
66
- **load_kwargs,
67
- )
 
 
 
 
 
 
 
 
 
 
 
68
  self.processor = AutoProcessor.from_pretrained(
69
  model_id, trust_remote_code=True, token=hub_token
70
  )
 
60
  if hub_token:
61
  load_kwargs["token"] = hub_token
62
 
63
+ # Prefer FA2 when available; gracefully fall back if not installed on the image
64
+ try:
65
+ self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
66
+ model_id,
67
+ attn_implementation="flash_attention_2",
68
+ **load_kwargs,
69
+ )
70
+ except Exception as e:
71
+ try:
72
+ print(f"[gta1-endpoint] FlashAttention2 unavailable, falling back to eager. Reason: {e}")
73
+ except Exception:
74
+ pass
75
+ self.model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
76
+ model_id,
77
+ **load_kwargs,
78
+ )
79
  self.processor = AutoProcessor.from_pretrained(
80
  model_id, trust_remote_code=True, token=hub_token
81
  )