File size: 11,080 Bytes
77da9e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf5eae6
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
"""
Response Builder - Standardized Response Formatting

This module provides utilities for formatting detection results into
standardized response formats for API and UI consumption.
"""

import base64
from collections import Counter
from typing import Any, Dict, List, Optional

import cv2
import numpy as np
from PIL import Image


def build_detection_response(
    analysis: Dict,
    image: Image.Image,
    annotated_image: Optional[np.ndarray] = None,
    confidence_threshold: float = 0.35,
    line_thickness: int = 2,
    enable_clip: bool = False,
    enable_ocr: bool = True,
    enable_blip: bool = False,
    blip_scope: Optional[str] = None,
    ocr_only: bool = False,
    include_annotated_image: bool = True
) -> Dict:
    """
    Assemble the standardized detection response consumed by the API and UI.

    Args:
        analysis: Detection analysis results from DetectionService or OCR handler
        image: Original PIL Image (fallback source for width/height)
        annotated_image: Optional annotated image (numpy array, RGB)
        confidence_threshold: Confidence threshold used
        line_thickness: Bounding-box line thickness used for annotation
        enable_clip: Whether CLIP classification was enabled
        enable_ocr: Whether OCR was enabled
        enable_blip: Whether BLIP was enabled
        blip_scope: BLIP scope ("icons" or "all")
        ocr_only: Whether this was OCR-only mode
        include_annotated_image: Whether to include base64-encoded annotated image

    Returns:
        Standardized response dictionary with detections, metadata, and parameters
    """
    detections = analysis.get("detections", [])

    # Per-class counts are only meaningful when CLIP labels exist.
    distribution = build_type_distribution(detections) if (enable_clip and not ocr_only) else None

    response = {
        "success": True,
        "detections": detections,
        "total_detections": len(detections),
        "image_size": analysis.get("image_size", {"width": image.width, "height": image.height}),
        "parameters": {
            "confidence_threshold": confidence_threshold,
            "line_thickness": line_thickness,
            # In OCR-only mode every model flag is reported as disabled.
            "enable_clip": False if ocr_only else enable_clip,
            "enable_ocr": False if ocr_only else enable_ocr,
            "enable_blip": False if ocr_only else enable_blip,
            "blip_scope": blip_scope if (enable_blip and not ocr_only) else None,
            "ocr_only": ocr_only,
        },
        "type_distribution": distribution,
    }

    if include_annotated_image and annotated_image is not None:
        # OpenCV expects BGR channel order; PNG-encode, then ship as base64 text.
        bgr = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
        encoded_ok, encoded = cv2.imencode(".png", bgr)
        if encoded_ok:
            response["annotated_image"] = {
                "mime": "image/png",
                "base64": base64.b64encode(encoded.tobytes()).decode("ascii"),
            }

    return response


def build_type_distribution(detections: List[Dict]) -> Dict[str, int]:
    """
    Build element type distribution from detections.

    Args:
        detections: List of detection dictionaries with class_name field

    Returns:
        Dictionary mapping class names to counts (empty class names excluded)
    """
    # Counter replaces the hand-rolled get/+1 loop; cast back to a plain
    # dict so the declared return type (and JSON serialization) is unchanged.
    return dict(
        Counter(
            det.get("class_name", "")
            for det in detections
            if det.get("class_name", "")
        )
    )


def format_summary_text(
    detections: List[Dict],
    parameters: Dict,
    ocr_only: bool = False
) -> str:
    """
    Format detection results as markdown summary text for Gradio UI.

    Args:
        detections: List of detection dictionaries
        parameters: Detection parameters used
        ocr_only: Whether this was OCR-only mode

    Returns:
        Markdown-formatted summary string
    """
    lines = []

    if ocr_only:
        lines.append("**OCR-only mode**")
        lines.append(f"**Total OCR texts:** {len(detections)}")
    else:
        lines.append(f"**Total detections:** {len(detections)}")

    lines.append("")
    lines.append("**Settings:**")
    lines.append(f"- Confidence threshold: {parameters.get('confidence_threshold', 0.35):.2f}")

    enable_clip = parameters.get('enable_clip', False)
    enable_ocr = parameters.get('enable_ocr', True)
    enable_blip = parameters.get('enable_blip', False)
    blip_scope = parameters.get('blip_scope')
    line_thickness = parameters.get('line_thickness')

    # Fixed: the "enabled" marker was mojibake ("βœ…") — now a proper ✅ emoji.
    lines.append(f"- CLIP classification: {'✅ Enabled' if enable_clip else '❌ Disabled'}")
    # OCR is effectively active in OCR-only mode regardless of the flag.
    lines.append(f"- OCR text extraction: {'✅ Enabled' if enable_ocr or ocr_only else '❌ Disabled'}")
    if line_thickness is not None:
        lines.append(f"- Box line thickness: {line_thickness}")

    blip_text = f"- BLIP description: {'✅ Enabled' if enable_blip else '❌ Disabled'}"
    if enable_blip and blip_scope:
        scope_display = "All elements" if blip_scope == "all" else "Only image & button"
        blip_text += f" (scope: {scope_display})"
    lines.append(blip_text)

    # Add per-class counts when CLIP labels are available, most frequent first.
    if enable_clip and not ocr_only and len(detections) > 0:
        type_counts = build_type_distribution(detections)
        if type_counts:
            lines.append("")
            lines.append("**Element types:**")
            for typ, count in sorted(type_counts.items(), key=lambda x: -x[1]):
                lines.append(f"- {typ}: {count}")

    return "\n".join(lines)


def build_ocr_only_response(
    detections: List[Dict],
    image_width: int,
    image_height: int,
    annotated_image: Optional[np.ndarray] = None,
    confidence_threshold: float = 0.35,
    line_thickness: int = 2
) -> Dict:
    """
    Build the response payload specifically for OCR-only mode.

    Args:
        detections: List of OCR detections
        image_width: Original image width in pixels
        image_height: Original image height in pixels
        annotated_image: Optional annotated image (numpy array, RGB)
        confidence_threshold: Confidence threshold (echoed for response consistency)
        line_thickness: Bounding-box line thickness (echoed for response consistency)

    Returns:
        OCR-only response dictionary
    """
    # All model flags report as disabled: OCR-only mode bypasses the
    # standard detection/CLIP/BLIP flow entirely.
    parameters = {
        "confidence_threshold": confidence_threshold,
        "line_thickness": line_thickness,
        "enable_clip": False,
        "enable_ocr": False,
        "enable_blip": False,
        "blip_scope": None,
        "ocr_only": True,
    }

    result = {
        "success": True,
        "detections": detections,
        "total_detections": len(detections),
        "image_size": {"width": image_width, "height": image_height},
        "parameters": parameters,
        "type_distribution": None,
    }

    if annotated_image is not None:
        # OpenCV wants BGR channel order before PNG encoding.
        bgr = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
        encoded_ok, buffer = cv2.imencode(".png", bgr)
        if encoded_ok:
            result["annotated_image"] = {
                "mime": "image/png",
                "base64": base64.b64encode(buffer.tobytes()).decode("ascii"),
            }

    return result


def build_simplified_response(
    analysis: Dict,
    image: Image.Image,
    annotated_image: Optional[np.ndarray] = None,
    confidence_threshold: float = 0.35,
    line_thickness: int = 2,
    enable_clip: bool = False,
    enable_ocr: bool = True,
    enable_blip: bool = False,
    blip_scope: Optional[str] = None,
    ocr_only: bool = False
) -> Dict:
    """
    Build simplified detection response for API/UI with format:
    {
      "detections": {
        "icon 0": {"type": "text", "bbox": [x1, y1, x2, y2], "interactivity": false, "content": "..."},
        "icon 1": {"type": "icon", "bbox": [x1, y1, x2, y2], "interactivity": true, "content": "..."}
      },
      "annotated_image": {"mime": "image/png", "base64": "..."}
    }

    Bounding boxes are normalized to 0-1 coordinates and rounded to 4 decimals.

    Args:
        analysis: Detection analysis results from DetectionService or OCR handler
        image: Original PIL Image (fallback source for width/height)
        annotated_image: Optional annotated image (numpy array, RGB)
        confidence_threshold: Confidence threshold used
        line_thickness: Bounding-box line thickness used for annotation
        enable_clip: Whether CLIP classification was enabled
        enable_ocr: Whether OCR was enabled
        enable_blip: Whether BLIP was enabled
        blip_scope: BLIP scope ("icons" or "all")
        ocr_only: Whether this was OCR-only mode

    Returns:
        Simplified response dictionary with detections dict and annotated_image
    """
    detections = analysis.get("detections", [])
    size = analysis.get("image_size", {})
    width = size.get("width", image.width)
    height = size.get("height", image.height)

    # Element classes treated as clickable/interactive.
    clickable = {"button", "input", "icon", "navigation", "list_item"}

    entries = {}
    for index, det in enumerate(detections):
        # Normalize the pixel box to 0-1 coordinates.
        box = det.get("box", {})
        bbox = [
            round(box.get("x1", 0) / width, 4),
            round(box.get("y1", 0) / height, 4),
            round(box.get("x2", 0) / width, 4),
            round(box.get("y2", 0) / height, 4),
        ]

        text = det.get("text", "").strip()
        description = det.get("description", "").strip()

        kind = det.get("class_name", "")
        if not kind:
            # No CLIP label: treat text-bearing regions as "text", others as "icon".
            kind = "text" if text else "icon"

        entries[f"icon {index}"] = {
            "type": kind,
            "bbox": bbox,
            "interactivity": kind in clickable,
            # OCR text takes priority over the BLIP description.
            "content": text or description,
        }

    response = {"detections": entries}

    if annotated_image is not None:
        # BGR for OpenCV, PNG-encode, then base64 for JSON transport.
        bgr = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
        encoded_ok, buffer = cv2.imencode(".png", bgr)
        if encoded_ok:
            response["annotated_image"] = {
                "mime": "image/png",
                "base64": base64.b64encode(buffer.tobytes()).decode("ascii"),
            }

    return response