Forrest Wargo
committed on
Commit
·
416a2e8
1
Parent(s):
6600256
Accept OpenAI-style input; return annotated_image+boxes
Browse files — handler.py: +75 −17
handler.py
CHANGED
|
@@ -49,17 +49,48 @@ class EndpointHandler:
|
|
| 49 |
self.annotator = BoxAnnotator()
|
| 50 |
|
| 51 |
def __call__(self, data: Dict[str, Any]) -> Any:
|
| 52 |
-
#
|
| 53 |
-
#
|
| 54 |
-
#
|
| 55 |
-
#
|
| 56 |
-
|
| 57 |
-
#
|
| 58 |
-
|
| 59 |
-
data
|
| 60 |
-
|
| 61 |
-
|
| 62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 63 |
|
| 64 |
ocr_texts, ocr_bboxes = self.check_ocr_bboxes(
|
| 65 |
image,
|
|
@@ -68,17 +99,44 @@ class EndpointHandler:
|
|
| 68 |
)
|
| 69 |
annotated_image, filtered_bboxes_out = self.get_som_labeled_img(
|
| 70 |
image,
|
| 71 |
-
image_size=
|
| 72 |
ocr_texts=ocr_texts,
|
| 73 |
ocr_bboxes=ocr_bboxes,
|
| 74 |
-
bbox_threshold=
|
| 75 |
-
iou_threshold=
|
| 76 |
)
|
| 77 |
-
|
| 78 |
-
|
| 79 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 80 |
}
|
| 81 |
|
|
|
|
|
|
|
| 82 |
def check_ocr_bboxes(
|
| 83 |
self,
|
| 84 |
image: ImageType,
|
|
|
|
| 49 |
self.annotator = BoxAnnotator()
|
| 50 |
|
| 51 |
def __call__(self, data: Dict[str, Any]) -> Any:
|
| 52 |
+
# Flexible input contract:
|
| 53 |
+
# 1) OpenAI-style: { messages: [ { role:'user', content:[ {type:'image_url', image_url:{url:data-url}}, {type:'text', text:'...'} ] } ] }
|
| 54 |
+
# 2) Legacy HF inputs: { inputs: { image: <url|data-url> } }
|
| 55 |
+
# 3) PropParse-style: { inputs: { image_b64: <data-url>, ... } }
|
| 56 |
+
|
| 57 |
+
# Normalize payload
|
| 58 |
+
payload: Dict[str, Any]
|
| 59 |
+
if isinstance(data, dict) and "inputs" in data:
|
| 60 |
+
payload = data.get("inputs") or {}
|
| 61 |
+
else:
|
| 62 |
+
payload = data
|
| 63 |
+
|
| 64 |
+
# Extract image source
|
| 65 |
+
img_source: Optional[str] = None
|
| 66 |
+
if "image_b64" in payload and isinstance(payload["image_b64"], str):
|
| 67 |
+
img_source = payload["image_b64"]
|
| 68 |
+
elif "image" in payload and isinstance(payload["image"], str):
|
| 69 |
+
img_source = payload["image"]
|
| 70 |
+
elif isinstance(payload.get("messages"), list):
|
| 71 |
+
for msg in payload["messages"]:
|
| 72 |
+
if isinstance(msg, Dict) and msg.get("role") == "user":
|
| 73 |
+
for part in msg.get("content", []):
|
| 74 |
+
if part.get("type") == "image_url":
|
| 75 |
+
img_source = part.get("image_url", {}).get("url")
|
| 76 |
+
break
|
| 77 |
+
if img_source:
|
| 78 |
+
break
|
| 79 |
+
|
| 80 |
+
if not img_source:
|
| 81 |
+
return {"error": "No image provided (image/image_b64/messages)"}
|
| 82 |
+
|
| 83 |
+
# Load image from data URL or external URL
|
| 84 |
+
try:
|
| 85 |
+
if isinstance(img_source, str) and img_source.startswith("data:"):
|
| 86 |
+
header, b64data = img_source.split(",", 1)
|
| 87 |
+
decoded = base64.b64decode(b64data)
|
| 88 |
+
image = Image.open(io.BytesIO(decoded))
|
| 89 |
+
image.load()
|
| 90 |
+
else:
|
| 91 |
+
image = load_image(img_source)
|
| 92 |
+
except Exception as e:
|
| 93 |
+
return {"error": f"Failed to load image: {e}"}
|
| 94 |
|
| 95 |
ocr_texts, ocr_bboxes = self.check_ocr_bboxes(
|
| 96 |
image,
|
|
|
|
| 99 |
)
|
| 100 |
annotated_image, filtered_bboxes_out = self.get_som_labeled_img(
|
| 101 |
image,
|
| 102 |
+
image_size=payload.get("image_size", None),
|
| 103 |
ocr_texts=ocr_texts,
|
| 104 |
ocr_bboxes=ocr_bboxes,
|
| 105 |
+
bbox_threshold=payload.get("bbox_threshold", 0.05),
|
| 106 |
+
iou_threshold=payload.get("iou_threshold", None),
|
| 107 |
)
|
| 108 |
+
|
| 109 |
+
# Legacy fields
|
| 110 |
+
legacy = {"image": annotated_image, "bboxes": filtered_bboxes_out}
|
| 111 |
+
|
| 112 |
+
# PropParse-style fields
|
| 113 |
+
try:
|
| 114 |
+
w, h = image.size # type: ignore
|
| 115 |
+
except Exception:
|
| 116 |
+
w, h = None, None
|
| 117 |
+
annotated_data_url = (
|
| 118 |
+
f"data:image/png;base64,{annotated_image}"
|
| 119 |
+
if isinstance(annotated_image, str) and not annotated_image.startswith("data:")
|
| 120 |
+
else annotated_image
|
| 121 |
+
)
|
| 122 |
+
elements = [
|
| 123 |
+
{
|
| 124 |
+
"type": box.get("type", "icon"),
|
| 125 |
+
"bbox_xyxy_norm": box.get("bbox"),
|
| 126 |
+
"interactivity": box.get("interactivity", True),
|
| 127 |
+
"content": box.get("content"),
|
| 128 |
+
}
|
| 129 |
+
for box in filtered_bboxes_out
|
| 130 |
+
if isinstance(box.get("bbox"), list)
|
| 131 |
+
]
|
| 132 |
+
propparse_style = {
|
| 133 |
+
"annotated_image": annotated_data_url,
|
| 134 |
+
"boxes": {"elements": elements},
|
| 135 |
+
**({"width": w, "height": h} if w and h else {}),
|
| 136 |
}
|
| 137 |
|
| 138 |
+
return {**legacy, **propparse_style}
|
| 139 |
+
|
| 140 |
def check_ocr_bboxes(
|
| 141 |
self,
|
| 142 |
image: ImageType,
|