Forrest Wargo committed on
Commit ·
6b2981b
1
Parent(s): 7cc210d
Disable FlexAttention decoding for endpoint robustness (avoid BlockMask error)
Browse files
- handler.py +8 -0
handler.py
CHANGED
|
@@ -81,6 +81,14 @@ class EndpointHandler:
|
|
| 81 |
except Exception:
|
| 82 |
pass
|
| 83 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 84 |
def __call__(self, data: Dict[str, Any]) -> Any:
|
| 85 |
# Accept HF toolkit shapes: { inputs: {...} } or JSON string
|
| 86 |
if isinstance(data, dict) and "inputs" in data:
|
|
|
|
| 81 |
except Exception:
|
| 82 |
pass
|
| 83 |
|
| 84 |
+
# Prefer robustness over speed on HF endpoints: disable FlexAttention decoding
|
| 85 |
+
# to avoid BlockMask attribute mismatches in some torch builds.
|
| 86 |
+
try:
|
| 87 |
+
if hasattr(self.model, "model") and hasattr(self.model.model, "use_flex_decoding"):
|
| 88 |
+
self.model.model.use_flex_decoding = False
|
| 89 |
+
except Exception:
|
| 90 |
+
pass
|
| 91 |
+
|
| 92 |
def __call__(self, data: Dict[str, Any]) -> Any:
|
| 93 |
# Accept HF toolkit shapes: { inputs: {...} } or JSON string
|
| 94 |
if isinstance(data, dict) and "inputs" in data:
|