Forrest Wargo committed on
Commit
6b2981b
·
1 Parent(s): 7cc210d

Disable FlexAttention decoding for endpoint robustness (avoid BlockMask error)

Browse files
Files changed (1) hide show
  1. handler.py +8 -0
handler.py CHANGED
@@ -81,6 +81,14 @@ class EndpointHandler:
81
  except Exception:
82
  pass
83
 
 
 
 
 
 
 
 
 
84
  def __call__(self, data: Dict[str, Any]) -> Any:
85
  # Accept HF toolkit shapes: { inputs: {...} } or JSON string
86
  if isinstance(data, dict) and "inputs" in data:
 
81
  except Exception:
82
  pass
83
 
84
+ # Prefer robustness over speed on HF endpoints: disable FlexAttention decoding
85
+ # to avoid BlockMask attribute mismatches in some torch builds.
86
+ try:
87
+ if hasattr(self.model, "model") and hasattr(self.model.model, "use_flex_decoding"):
88
+ self.model.model.use_flex_decoding = False
89
+ except Exception:
90
+ pass
91
+
92
  def __call__(self, data: Dict[str, Any]) -> Any:
93
  # Accept HF toolkit shapes: { inputs: {...} } or JSON string
94
  if isinstance(data, dict) and "inputs" in data: