1ForrestW1
/

moondream3-endpoint

Model card Files Files and versions

xet

Community

Forrest Wargo committed on Oct 9, 2025

Commit

eb3bdf4

1 Parent(s): 5dca6b2

unity v2

Browse files

Files changed (1) hide show

handler.py +47 -155

handler.py CHANGED Viewed

@@ -21,29 +21,19 @@ def _b64_to_pil(data_url: str) -> Image.Image:
 class EndpointHandler:
-    """HF Inference Endpoint handler for Moondream3 Preview.
-    Input contract (OpenAI-style):
     {
-      "messages": [
-        {
-          "role": "user",
-          "content": [
-            { "type": "image_url", "image_url": { "url": "data:<mime>;base64,<...>" } },
-            { "type": "text", "text": "<object or question>" }
-          ]
-        }
-      ],
-      "task": "point" | "detect" | "query"  // optional, default "point"
-      "max_objects": <int>                     // optional for detect
-      "reasoning": <bool>                      // optional for query
     }
-    Output:
-    - task=="point": { points: [{x, y}], width, height }
-    - task=="detect": { objects: [{x_min, y_min, x_max, y_max}], width, height }
-    - task=="query":  { answer: "...", width?, height? }
-    Coordinates are normalized (0-1). width/height echo source image dims for convenience.
     """
     def __init__(self, path: str = "") -> None:
@@ -105,33 +95,16 @@ class EndpointHandler:
                 except Exception:
                     pass
-        messages = data.get("messages")
-        task = str(data.get("task", "point")).lower()
-        reasoning = bool(data.get("reasoning", True))
-        max_objects = data.get("max_objects")
         prioritize_accuracy = bool(data.get("prioritize_accuracy", True))
-        if not messages:
-            return {"error": "Provide 'messages' with user image and text"}
-        # Extract first user image and text
-        image_data_url: Optional[str] = None
-        text_piece: Optional[str] = None
-        for msg in messages:
-            if msg.get("role") != "user":
-                return {"error": "Only user messages are supported."}
-            for part in msg.get("content", []):
-                if part.get("type") == "image_url" and image_data_url is None:
-                    image_data_url = part.get("image_url", {}).get("url")
-                elif part.get("type") == "text" and text_piece is None:
-                    text_piece = part.get("text")
-            if image_data_url and text_piece:
-                break
-        if not image_data_url or not isinstance(image_data_url, str) or not image_data_url.startswith("data:"):
-            return {"error": "image_url.url must be a data URL (data:...)"}
         if not text_piece:
-            return {"error": "Content must include text."}
         # Decode for dimensions and pass PIL to model
         try:
@@ -147,44 +120,32 @@ class EndpointHandler:
             except Exception:
                 pass
-        # Run selected skill
         try:
-            if task == "point":
-                if prioritize_accuracy:
-                    flipped = pil.transpose(Image.FLIP_LEFT_RIGHT)
-                    res_orig = self.model.point(pil, text_piece)
-                    res_flip = self.model.point(flipped, text_piece)
-                    points = self._tta_points(res_orig.get("points", []), res_flip.get("points", []))
-                    out: Dict[str, Any] = {"points": points}
-                else:
-                    result = self.model.point(pil, text_piece)
-                    out = {"points": result.get("points", [])}
-            elif task == "detect":
-                settings = {"max_objects": int(max_objects)} if max_objects else None
-                if prioritize_accuracy:
-                    flipped = pil.transpose(Image.FLIP_LEFT_RIGHT)
-                    res_orig = self.model.detect(pil, text_piece, settings=settings)
-                    res_flip = self.model.detect(flipped, text_piece, settings=settings)
-                    objects = self._tta_boxes(res_orig.get("objects", []), res_flip.get("objects", []))
-                    out = {"objects": objects}
-                else:
-                    result = self.model.detect(pil, text_piece, settings=settings)
-                    out = {"objects": result.get("objects", [])}
-            elif task == "query":
-                result = self.model.query(pil, question=text_piece, reasoning=reasoning, stream=False)
-                out = {"answer": result.get("answer", "")}
             else:
-                return {"error": f"Unsupported task '{task}'. Use 'point', 'detect', or 'query'."}
         except Exception as e:
             return {"error": f"Model inference failed: {e}"}
-        if width and height:
-            out.update({"width": width, "height": height})
-        out.update({"task": task})
         # Print prompt, dimensions, and raw output
         try:
-            print(f"[moondream-endpoint] Prompt: {text_piece}")
         except Exception:
             pass
         if width and height:
@@ -197,15 +158,17 @@ class EndpointHandler:
         except Exception:
             pass
-        # Ensure strict shape for point task: include points[] and raw
-        if task == "point":
-            # Ensure points array exists
-            if not isinstance(out.get("points"), list) or not out["points"]:
-                return {"error": "No points returned"}
-            # Attach raw for strict client and drop width/height from payload
-            return {"points": out["points"], "raw": out}
-        return out
     @staticmethod
     def _flip_point(p: Dict[str, Any]) -> Dict[str, float]:
@@ -244,77 +207,6 @@ class EndpointHandler:
         merged = list(points_a) + unflipped_b
         return cls._deduplicate_and_average_points(merged)
-    @staticmethod
-    def _flip_box(b: Dict[str, Any]) -> Dict[str, float]:
-        xmin = float(b.get("x_min", 0.0))
-        xmax = float(b.get("x_max", 0.0))
-        ymin = float(b.get("y_min", 0.0))
-        ymax = float(b.get("y_max", 0.0))
-        nxmin = 1.0 - xmax
-        nxmax = 1.0 - xmin
-        nxmin, nxmax = max(0.0, min(1.0, nxmin)), max(0.0, min(1.0, nxmax))
-        ymin, ymax = max(0.0, min(1.0, ymin)), max(0.0, min(1.0, ymax))
-        if nxmin > nxmax:
-            nxmin, nxmax = nxmax, nxmin
-        return {"x_min": nxmin, "y_min": ymin, "x_max": nxmax, "y_max": ymax}
-    @staticmethod
-    def _iou(b1: Dict[str, float], b2: Dict[str, float]) -> float:
-        x1 = max(b1["x_min"], b2["x_min"])
-        y1 = max(b1["y_min"], b2["y_min"])
-        x2 = min(b1["x_max"], b2["x_max"])
-        y2 = min(b1["y_max"], b2["y_max"])
-        inter_w = max(0.0, x2 - x1)
-        inter_h = max(0.0, y2 - y1)
-        inter = inter_w * inter_h
-        a1 = max(0.0, b1["x_max"] - b1["x_min"]) * max(0.0, b1["y_max"] - b1["y_min"])
-        a2 = max(0.0, b2["x_max"] - b2["x_min"]) * max(0.0, b2["y_max"] - b2["y_min"])
-        denom = a1 + a2 - inter
-        return inter / denom if denom > 0 else 0.0
-    @classmethod
-    def _merge_boxes_with_nms(cls, boxes: List[Dict[str, float]], iou_threshold: float = 0.5) -> List[Dict[str, float]]:
-        merged: List[Dict[str, float]] = []
-        used = [False] * len(boxes)
-        for i in range(len(boxes)):
-            if used[i]:
-                continue
-            cluster = [boxes[i]]
-            used[i] = True
-            for j in range(i + 1, len(boxes)):
-                if used[j]:
-                    continue
-                if cls._iou(boxes[i], boxes[j]) >= iou_threshold:
-                    used[j] = True
-                    cluster.append(boxes[j])
-            # Average cluster
-            n = float(len(cluster))
-            avg = {
-                "x_min": sum(b["x_min"] for b in cluster) / n,
-                "y_min": sum(b["y_min"] for b in cluster) / n,
-                "x_max": sum(b["x_max"] for b in cluster) / n,
-                "y_max": sum(b["y_max"] for b in cluster) / n,
-            }
-            # Clamp
-            avg["x_min"] = max(0.0, min(1.0, avg["x_min"]))
-            avg["y_min"] = max(0.0, min(1.0, avg["y_min"]))
-            avg["x_max"] = max(0.0, min(1.0, avg["x_max"]))
-            avg["y_max"] = max(0.0, min(1.0, avg["y_max"]))
-            merged.append(avg)
-        return merged
-    @classmethod
-    def _tta_boxes(cls, boxes_a: List[Dict[str, Any]], boxes_b_flipped: List[Dict[str, Any]]) -> List[Dict[str, float]]:
-        unflipped_b = [cls._flip_box(b) for b in boxes_b_flipped]
-        combined = [
-            {
-                "x_min": float(b.get("x_min", 0.0)),
-                "y_min": float(b.get("y_min", 0.0)),
-                "x_max": float(b.get("x_max", 0.0)),
-                "y_max": float(b.get("y_max", 0.0)),
-            }
-            for b in (list(boxes_a) + unflipped_b)
-        ]
-        return cls._merge_boxes_with_nms(combined, iou_threshold=0.5)

 class EndpointHandler:
+    """HF Inference Endpoint handler for Moondream3 Preview (point only).
+    Input contract (OpenAI-style, simplified):
     {
+      "system": "<system prompt>",
+      "user": "<user prompt>",
+      "image": "data:<mime>;base64,<...>",
+      "prioritize_accuracy": true | false // optional (default true)
     }
+    Output (point only):
+      { points: [{x, y}], raw: <debug payload> }
+    Coordinates are normalized [0,1].
     """
     def __init__(self, path: str = "") -> None:
                 except Exception:
                     pass
+        # New input contract: expect 'system', 'user', 'image' (point task only)
         prioritize_accuracy = bool(data.get("prioritize_accuracy", True))
+        system_prompt: Optional[str] = data.get("system")
+        text_piece: Optional[str] = data.get("user")
+        image_data_url: Optional[str] = data.get("image")
+        if not isinstance(image_data_url, str) or not image_data_url.startswith("data:"):
+            return {"error": "image must be a data URL (data:...)"}
         if not text_piece:
+            return {"error": "user text must be provided"}
         # Decode for dimensions and pass PIL to model
         try:
             except Exception:
                 pass
+        # Point-only inference
         try:
+            if prioritize_accuracy:
+                flipped = pil.transpose(Image.FLIP_LEFT_RIGHT)
+                res_orig = self.model.point(pil, text_piece)
+                res_flip = self.model.point(flipped, text_piece)
+                points = self._tta_points(res_orig.get("points", []), res_flip.get("points", []))
+                out: Dict[str, Any] = {"points": points}
             else:
+                result = self.model.point(pil, text_piece)
+                out = {"points": result.get("points", [])}
         except Exception as e:
             return {"error": f"Model inference failed: {e}"}
         # Print prompt, dimensions, and raw output
+        # Log prompts and timings
+        def _se(s: Optional[str], n: int = 120):
+            if not s:
+                return ("", "")
+            return (s[:n], s[-n:] if len(s) > n else s)
+        sys_start, sys_end = _se(system_prompt)
+        usr_start, usr_end = _se(text_piece)
         try:
+            print(f"[moondream-endpoint] System prompt (start): {sys_start}")
+            print(f"[moondream-endpoint] System prompt (end): {sys_end}")
+            print(f"[moondream-endpoint] User prompt (full): {text_piece}")
         except Exception:
             pass
         if width and height:
         except Exception:
             pass
+        # Ensure points array exists and normalized [0,1]
+        if not isinstance(out.get("points"), list) or not out["points"]:
+            return {"error": "No points returned"}
+        def _to_01(p):
+            x = float(p.get("x", 0.0))
+            y = float(p.get("y", 0.0))
+            if x > 1.0 or y > 1.0:
+                return {"x": x / 1000.0, "y": y / 1000.0}
+            return {"x": x, "y": y}
+        points_01 = [_to_01(p) for p in out["points"]]
+        return {"points": points_01, "raw": out}
     @staticmethod
     def _flip_point(p: Dict[str, Any]) -> Dict[str, float]:
         merged = list(points_a) + unflipped_b
         return cls._deduplicate_and_average_points(merged)
+    # Box-related utilities removed (endpoint is point-only)