Forrest Wargo committed on
Commit
6b2981b
·
1 Parent(s): 7cc210d

Disable FlexAttention decoding for endpoint robustness (avoid BlockMask error)

Browse files
Files changed (1) hide show
  1. handler.py +8 -0
handler.py CHANGED
@@ -81,6 +81,14 @@ class EndpointHandler:
81
  except Exception:
82
  pass
83
 
 
 
 
 
 
 
 
 
84
  def __call__(self, data: Dict[str, Any]) -> Any:
85
  # Accept HF toolkit shapes: { inputs: {...} } or JSON string
86
  if isinstance(data, dict) and "inputs" in data:
 
81
  except Exception:
82
  pass
83
 
84
+ # Prefer robustness over speed on HF endpoints: disable FlexAttention decoding
85
+ # to avoid BlockMask attribute mismatches in some torch builds.
86
+ try:
87
+ if hasattr(self.model, "model") and hasattr(self.model.model, "use_flex_decoding"):
88
+ self.model.model.use_flex_decoding = False
89
+ except Exception:
90
+ pass
91
+
92
  def __call__(self, data: Dict[str, Any]) -> Any:
93
  # Accept HF toolkit shapes: { inputs: {...} } or JSON string
94
  if isinstance(data, dict) and "inputs" in data: