File size: 5,517 Bytes
23db765
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
"""Output parsing for LocateAnything-3B bounding box responses."""

from __future__ import annotations

import re
from dataclasses import dataclass, field
from typing import Any

from src.config import COORD_MAX, DEFAULT_CONFIDENCE


@dataclass
class BBox:
    """Axis-aligned box described by two corners in pixel space.

    ``(x1, y1)`` and ``(x2, y2)`` are the two corners; ``confidence`` and
    ``label`` carry optional metadata and fall back to
    ``DEFAULT_CONFIDENCE`` and the empty string.
    """

    x1: float
    y1: float
    x2: float
    y2: float
    confidence: float = DEFAULT_CONFIDENCE
    label: str = ""

    @property
    def width(self) -> float:
        """Horizontal extent; zero when ``x2`` does not exceed ``x1``."""
        span = self.x2 - self.x1
        return span if span > 0.0 else 0.0

    @property
    def height(self) -> float:
        """Vertical extent; zero when ``y2`` does not exceed ``y1``."""
        span = self.y2 - self.y1
        return span if span > 0.0 else 0.0

    @property
    def area(self) -> float:
        """Product of :attr:`width` and :attr:`height`."""
        w, h = self.width, self.height
        return w * h

    @property
    def center(self) -> tuple[float, float]:
        """Midpoint of the two corners as ``(cx, cy)``."""
        cx = (self.x1 + self.x2) / 2
        cy = (self.y1 + self.y2) / 2
        return (cx, cy)

    def is_valid(self, img_w: int, img_h: int) -> bool:
        """Report whether the box fits the image and is not degenerate.

        The first corner must be non-negative, the second corner may exceed
        the image size by at most one pixel, and both sides must be strictly
        longer than one pixel.
        """
        if not (self.x1 >= 0 and self.y1 >= 0):
            return False
        if not (self.x2 <= img_w + 1 and self.y2 <= img_h + 1):
            return False
        return self.width > 1 and self.height > 1

    def clamp(self, img_w: int, img_h: int) -> BBox:
        """Build a new box whose corners are limited to the image rectangle.

        Confidence and label are carried over unchanged; ``self`` is not
        modified.
        """

        def bounded(value, upper):
            # Limit from above first, then from below at zero.
            return max(0, min(value, upper))

        return BBox(
            bounded(self.x1, img_w),
            bounded(self.y1, img_h),
            bounded(self.x2, img_w),
            bounded(self.y2, img_h),
            confidence=self.confidence,
            label=self.label,
        )

    def to_dict(self) -> dict[str, Any]:
        """Serialize to a plain dict with geometry rounded to two decimals."""
        rounded_fields = ("x1", "y1", "x2", "y2", "width", "height")
        payload: dict[str, Any] = {
            name: round(getattr(self, name), 2) for name in rounded_fields
        }
        # Metadata is emitted as-is, after the geometry keys.
        payload["confidence"] = self.confidence
        payload["label"] = self.label
        return payload


@dataclass
class ParseResult:
    """Outcome of parsing one model response.

    Holds the accepted boxes, the untouched response text, and any
    human-readable messages collected while parsing.
    """

    boxes: list[BBox] = field(default_factory=list)
    raw_output: str = ""
    parse_errors: list[str] = field(default_factory=list)

    @property
    def num_detections(self) -> int:
        """Number of boxes currently stored in :attr:`boxes`."""
        count = len(self.boxes)
        return count

    def to_dict(self) -> dict[str, Any]:
        """Serialize the result, converting each box with its ``to_dict``."""
        serialized_boxes = [box.to_dict() for box in self.boxes]
        return dict(
            num_detections=self.num_detections,
            boxes=serialized_boxes,
            raw_output=self.raw_output,
            parse_errors=self.parse_errors,
        )


# Four-value box: "<box><a><b><c><d></box>", each value an unsigned integer
# wrapped in its own angle brackets, with no whitespace allowed.
BOX_PATTERN_4 = re.compile(r"<box><(\d+)><(\d+)><(\d+)><(\d+)></box>")
# Alternative four-value box: "<box>a, b, c, d</box>", comma-separated with
# optional whitespace around each value.
BOX_PATTERN_4_ALT = re.compile(r"<box>\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*,\s*(\d+)\s*</box>")
# Two-value form "<box><a><b></box>"; parse_points reads it as an (x, y) point.
BOX_PATTERN_2 = re.compile(r"<box><(\d+)><(\d+)></box>")


def _norm_to_pixel(val: int, scale: int) -> float:
    """Map a model-normalized coordinate onto a pixel axis of length *scale*.

    ``val`` is divided by ``COORD_MAX`` and the resulting fraction is
    multiplied by ``scale`` (the image width or height in pixels).
    """
    fraction = val / COORD_MAX
    return fraction * scale


def _collect_boxes(
    pattern: re.Pattern[str],
    raw_output: str,
    image_width: int,
    image_height: int,
    seen: set[tuple[float, float, float, float]],
    result: ParseResult,
    error_prefix: str,
) -> None:
    """Append every new, valid box matched by *pattern* to *result*.

    Args:
        pattern: Compiled regex with exactly four integer capture groups
            (x1, y1, x2, y2 in normalized coordinates).
        raw_output: Text to scan.
        image_width: Pixel width used to scale x coordinates.
        image_height: Pixel height used to scale y coordinates.
        seen: Rounded coordinate keys already processed; updated in place so
            duplicates are skipped.
        result: Accumulator whose ``boxes`` and ``parse_errors`` are mutated.
        error_prefix: Message prefix used when integer conversion fails.
    """
    for match in pattern.finditer(raw_output):
        try:
            nx1, ny1, nx2, ny2 = (int(group) for group in match.groups())
        except (ValueError, IndexError) as exc:
            result.parse_errors.append(f"{error_prefix}: {exc}")
            continue

        x1 = _norm_to_pixel(nx1, image_width)
        y1 = _norm_to_pixel(ny1, image_height)
        x2 = _norm_to_pixel(nx2, image_width)
        y2 = _norm_to_pixel(ny2, image_height)

        # Boxes that agree to one decimal place are treated as duplicates.
        key = (round(x1, 1), round(y1, 1), round(x2, 1), round(y2, 1))
        if key in seen:
            continue
        seen.add(key)

        box = BBox(x1=x1, y1=y1, x2=x2, y2=y2)
        if box.is_valid(image_width, image_height):
            result.boxes.append(box)
        else:
            result.parse_errors.append(f"Out-of-bounds box discarded: {key}")


def parse_boxes(
    raw_output: str,
    image_width: int,
    image_height: int,
) -> ParseResult:
    """Parse model output into structured bounding boxes.

    Coordinates in the text are normalized values that are scaled to pixels
    via ``_norm_to_pixel``. The bracketed form (``BOX_PATTERN_4``) is tried
    first; the comma-separated form (``BOX_PATTERN_4_ALT``) is used only when
    the first pass produced no valid box.

    Args:
        raw_output: Raw text emitted by the model.
        image_width: Image width in pixels.
        image_height: Image height in pixels.

    Returns:
        A ``ParseResult`` holding the de-duplicated valid boxes, the original
        text, and a message for every box that was rejected or could not be
        converted. Rejections are reported for both formats.
    """
    result = ParseResult(raw_output=raw_output)
    seen: set[tuple[float, float, float, float]] = set()

    _collect_boxes(
        BOX_PATTERN_4,
        raw_output,
        image_width,
        image_height,
        seen,
        result,
        "Failed to parse box",
    )

    if not result.boxes:
        # Fall back to the comma-separated format only when nothing usable
        # was found in the primary format.
        _collect_boxes(
            BOX_PATTERN_4_ALT,
            raw_output,
            image_width,
            image_height,
            seen,
            result,
            "Failed to parse alt box",
        )

    return result


def parse_points(
    raw_output: str,
    image_width: int,
    image_height: int,
) -> list[dict[str, float]]:
    """Extract two-value ``<box>`` tags from model output as pixel points.

    Each match of ``BOX_PATTERN_2`` is scaled with ``_norm_to_pixel`` and
    returned as ``{"x": ..., "y": ...}`` rounded to two decimals, in the order
    the matches appear. Matches that fail integer conversion are skipped
    without being reported.
    """
    found: list[dict[str, float]] = []
    for hit in BOX_PATTERN_2.finditer(raw_output):
        try:
            raw_x, raw_y = hit.groups()
            px = _norm_to_pixel(int(raw_x), image_width)
            py = _norm_to_pixel(int(raw_y), image_height)
            found.append({"x": round(px, 2), "y": round(py, 2)})
        except (ValueError, IndexError):
            # Best-effort parsing: an unconvertible match is simply dropped.
            continue
    return found