Spaces:
Running
Running
| """Output parsing for LocateAnything-3B bounding box responses.""" | |
| from __future__ import annotations | |
| import re | |
| from dataclasses import dataclass, field | |
| from typing import Any | |
| from src.config import COORD_MAX, DEFAULT_CONFIDENCE | |
@dataclass
class BBox:
    """A parsed bounding box in pixel coordinates.

    Declared as a dataclass so it can be built with keyword arguments
    (``BBox(x1=..., y1=..., x2=..., y2=...)``), which is how the parser and
    :meth:`clamp` construct it.
    """

    # Corner coordinates; width/height are computed as x2 - x1 and y2 - y1.
    x1: float
    y1: float
    x2: float
    y2: float
    # Detection score; falls back to the project-wide default when not given.
    confidence: float = DEFAULT_CONFIDENCE
    # Optional label text for the box.
    label: str = ""

    @property
    def width(self) -> float:
        """Horizontal extent, floored at 0 for inverted boxes (x2 < x1)."""
        return max(0.0, self.x2 - self.x1)

    @property
    def height(self) -> float:
        """Vertical extent, floored at 0 for inverted boxes (y2 < y1)."""
        return max(0.0, self.y2 - self.y1)

    # NOTE(review): width/height must be properties because this class reads
    # them as attributes (is_valid, to_dict, area). Nothing visible shows
    # whether area/center are meant to be properties too, so they are kept as
    # plain methods -- confirm against callers.
    def area(self) -> float:
        """Return width * height (0 for degenerate or inverted boxes)."""
        return self.width * self.height

    def center(self) -> tuple[float, float]:
        """Return the (x, y) midpoint of the box."""
        return ((self.x1 + self.x2) / 2, (self.y1 + self.y2) / 2)

    def is_valid(self, img_w: int, img_h: int) -> bool:
        """Check that the box lies inside the image and is larger than 1 px.

        Args:
            img_w: Image width in pixels.
            img_h: Image height in pixels.

        Returns:
            True when x1/y1 are non-negative, x2/y2 do not exceed the image
            size by more than one pixel, and both width and height are
            strictly greater than 1.
        """
        return (
            self.x1 >= 0
            and self.y1 >= 0
            # One pixel of slack on the far edges.
            and self.x2 <= img_w + 1
            and self.y2 <= img_h + 1
            and self.width > 1
            and self.height > 1
        )

    def clamp(self, img_w: int, img_h: int) -> BBox:
        """Return a copy with every coordinate limited to the image bounds.

        Args:
            img_w: Image width in pixels; x coordinates end up in [0, img_w].
            img_h: Image height in pixels; y coordinates end up in [0, img_h].

        Returns:
            A new ``BBox`` carrying the same confidence and label. The
            receiver is left unchanged.
        """
        # float(...) keeps the fields floats even when an int bound wins.
        return BBox(
            x1=float(max(0, min(self.x1, img_w))),
            y1=float(max(0, min(self.y1, img_h))),
            x2=float(max(0, min(self.x2, img_w))),
            y2=float(max(0, min(self.y2, img_h))),
            confidence=self.confidence,
            label=self.label,
        )

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view of the box.

        Coordinates, width and height are rounded to two decimals;
        confidence and label are passed through unchanged.
        """
        return {
            "x1": round(self.x1, 2),
            "y1": round(self.y1, 2),
            "x2": round(self.x2, 2),
            "y2": round(self.y2, 2),
            "width": round(self.width, 2),
            "height": round(self.height, 2),
            "confidence": self.confidence,
            "label": self.label,
        }
@dataclass
class ParseResult:
    """Structured result from parsing model output.

    Declared as a dataclass so that ``ParseResult(raw_output=...)`` works and
    each instance gets its own fresh ``boxes`` / ``parse_errors`` lists.
    """

    # Boxes accepted by the parser.
    boxes: list[BBox] = field(default_factory=list)
    # The unmodified model output the boxes were parsed from.
    raw_output: str = ""
    # Human-readable messages for matches that were rejected or failed.
    parse_errors: list[str] = field(default_factory=list)

    @property
    def num_detections(self) -> int:
        """Number of boxes currently held in ``boxes``."""
        return len(self.boxes)

    def to_dict(self) -> dict[str, Any]:
        """Return a plain-dict view of the result.

        Each box is converted through its own ``to_dict()``; the other
        fields are passed through as-is.
        """
        return {
            "num_detections": self.num_detections,
            "boxes": [b.to_dict() for b in self.boxes],
            "raw_output": self.raw_output,
            "parse_errors": self.parse_errors,
        }
# Primary box syntax: four angle-bracketed integers inside <box>...</box>,
# e.g. "<box><10><20><300><400></box>". parse_boxes reads the groups as
# x1, y1, x2, y2.
BOX_PATTERN_4 = re.compile(r"<box><(\d+)><(\d+)><(\d+)><(\d+)></box>")
# Alternate box syntax: four comma-separated integers with optional
# whitespace, e.g. "<box>10, 20, 300, 400</box>". parse_boxes only tries it
# when the primary pattern produced no boxes.
BOX_PATTERN_4_ALT = re.compile(r"<box>\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*</box>")
# Point syntax: two angle-bracketed integers inside <box>...</box>;
# parse_points reads the groups as x, y.
BOX_PATTERN_2 = re.compile(r"<box><(\d+)><(\d+)></box>")
def _norm_to_pixel(val: int, scale: int) -> float:
    """Map a model coordinate onto a pixel axis of length *scale*.

    The value is first divided by ``COORD_MAX`` to get a fraction of the
    axis, then multiplied by *scale*.
    """
    fraction = val / COORD_MAX
    return fraction * scale
def _collect_boxes(
    pattern: re.Pattern[str],
    raw_output: str,
    image_width: int,
    image_height: int,
    result: ParseResult,
    seen: set[tuple[float, float, float, float]],
    error_prefix: str,
) -> None:
    """Append every valid, not-yet-seen box matched by *pattern* to *result*.

    Rejected boxes and conversion failures are recorded in
    ``result.parse_errors``; repeated coordinates are skipped silently.
    """
    for match in pattern.finditer(raw_output):
        try:
            x1 = _norm_to_pixel(int(match.group(1)), image_width)
            y1 = _norm_to_pixel(int(match.group(2)), image_height)
            x2 = _norm_to_pixel(int(match.group(3)), image_width)
            y2 = _norm_to_pixel(int(match.group(4)), image_height)
        except (ValueError, IndexError) as exc:
            result.parse_errors.append(f"{error_prefix}: {exc}")
            continue

        # De-duplicate at 0.1 px resolution so repeated boxes collapse to one.
        key = (round(x1, 1), round(y1, 1), round(x2, 1), round(y2, 1))
        if key in seen:
            continue
        seen.add(key)

        box = BBox(x1=x1, y1=y1, x2=x2, y2=y2)
        if box.is_valid(image_width, image_height):
            result.boxes.append(box)
        else:
            result.parse_errors.append(f"Out-of-bounds box discarded: {key}")


def parse_boxes(
    raw_output: str,
    image_width: int,
    image_height: int,
) -> ParseResult:
    """Parse model output into structured bounding boxes.

    The model outputs coordinates normalized to [0, 1000].
    This function converts them to pixel coordinates.

    Args:
        raw_output: Text produced by the model.
        image_width: Width in pixels used to scale x coordinates.
        image_height: Height in pixels used to scale y coordinates.

    Returns:
        A ``ParseResult`` holding the accepted boxes, the original text and
        a message for every box that was rejected or failed to convert.
    """
    result = ParseResult(raw_output=raw_output)
    # Shared across both passes so the fallback never re-reports a box.
    seen: set[tuple[float, float, float, float]] = set()

    _collect_boxes(
        BOX_PATTERN_4,
        raw_output,
        image_width,
        image_height,
        result,
        seen,
        "Failed to parse box",
    )

    # Fall back to the comma-separated syntax only when the primary syntax
    # produced nothing usable. Rejected boxes are now reported here as well,
    # matching the primary pass.
    if not result.boxes:
        _collect_boxes(
            BOX_PATTERN_4_ALT,
            raw_output,
            image_width,
            image_height,
            result,
            seen,
            "Failed to parse alt box",
        )

    return result
def parse_points(
    raw_output: str,
    image_width: int,
    image_height: int,
) -> list[dict[str, float]]:
    """Extract two-value ``<box>`` matches from the model output as points.

    Each match is scaled to pixel coordinates and returned as a dict with
    ``"x"`` and ``"y"`` rounded to two decimals. Matches whose numbers
    cannot be converted are skipped without being reported.
    """
    located = []
    for hit in BOX_PATTERN_2.finditer(raw_output):
        try:
            px = _norm_to_pixel(int(hit.group(1)), image_width)
            py = _norm_to_pixel(int(hit.group(2)), image_height)
        except (ValueError, IndexError):
            # Best effort: an unconvertible match is simply left out.
            continue
        located.append({"x": round(px, 2), "y": round(py, 2)})
    return located