| """ | |
| Response Builder - Standardized Response Formatting | |
| This module provides utilities for formatting detection results into | |
| standardized response formats for API and UI consumption. | |
| """ | |
| import base64 | |
| import cv2 | |
| import numpy as np | |
| from typing import Dict, List, Optional, Any | |
| from PIL import Image | |
def build_detection_response(
    analysis: Dict,
    image: Image.Image,
    annotated_image: Optional[np.ndarray] = None,
    confidence_threshold: float = 0.35,
    line_thickness: int = 2,
    enable_clip: bool = False,
    enable_ocr: bool = True,
    enable_blip: bool = False,
    blip_scope: Optional[str] = None,
    ocr_only: bool = False,
    include_annotated_image: bool = True
) -> Dict:
    """
    Build a standardized detection response for API/UI consumption.

    Args:
        analysis: Detection analysis results from DetectionService or OCR handler
        image: Original PIL Image
        annotated_image: Optional annotated image (numpy array, RGB)
        confidence_threshold: Confidence threshold used for detection
        line_thickness: Bounding-box line thickness used for annotation
        enable_clip: Whether CLIP classification was enabled
        enable_ocr: Whether OCR was enabled
        enable_blip: Whether BLIP captioning was enabled
        blip_scope: BLIP scope ("icons" or "all")
        ocr_only: Whether this was OCR-only mode
        include_annotated_image: Whether to include a base64-encoded annotated image

    Returns:
        Standardized response dictionary with detections, metadata, and parameters
    """
    # Extract detections
    detections = analysis.get("detections", [])

    # Build type distribution if CLIP is enabled
    type_counts = None
    if enable_clip and not ocr_only:
        type_counts = build_type_distribution(detections)

    # Prepare response
    response = {
        "success": True,
        "detections": detections,
        "total_detections": len(detections),
        "image_size": analysis.get("image_size", {"width": image.width, "height": image.height}),
        "parameters": {
            "confidence_threshold": confidence_threshold,
            "line_thickness": line_thickness,
            "enable_clip": enable_clip if not ocr_only else False,
            "enable_ocr": enable_ocr if not ocr_only else False,
            "enable_blip": enable_blip if not ocr_only else False,
            "blip_scope": blip_scope if enable_blip and not ocr_only else None,
            "ocr_only": ocr_only
        },
        "type_distribution": type_counts
    }

    # Add annotated image if requested
    if include_annotated_image and annotated_image is not None:
        # Encode as base64 PNG (OpenCV expects BGR channel order)
        img_bgr = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
        ok, png_bytes = cv2.imencode(".png", img_bgr)
        if ok:
            annotated_b64 = base64.b64encode(png_bytes.tobytes()).decode("ascii")
            response["annotated_image"] = {
                "mime": "image/png",
                "base64": annotated_b64
            }

    return response
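
# Example usage (a minimal sketch; the `analysis` dict below is a hypothetical
# stand-in for DetectionService output, not a real fixture):
#
#   from PIL import Image
#
#   img = Image.new("RGB", (200, 100))
#   analysis = {
#       "detections": [
#           {"box": {"x1": 10, "y1": 10, "x2": 50, "y2": 30},
#            "class_name": "button", "text": "OK"}
#       ],
#       "image_size": {"width": 200, "height": 100},
#   }
#   resp = build_detection_response(analysis, img, enable_clip=True)
#   print(resp["total_detections"], resp["parameters"]["enable_clip"])  # 1 True
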
def build_type_distribution(detections: List[Dict]) -> Dict[str, int]:
    """
    Build element type distribution from detections.

    Args:
        detections: List of detection dictionaries with a class_name field

    Returns:
        Dictionary mapping class names to counts
    """
    type_counts = {}
    for det in detections:
        class_name = det.get("class_name", "")
        if class_name:  # Only count if class_name is not empty
            type_counts[class_name] = type_counts.get(class_name, 0) + 1
    return type_counts
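
# Example (sketch with made-up detections; entries without a class_name are
# skipped, as the loop above shows):
#
#   build_type_distribution([
#       {"class_name": "button"}, {"class_name": "button"},
#       {"class_name": "icon"}, {"text": "unlabeled"},
#   ])
#   # -> {"button": 2, "icon": 1}
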
def format_summary_text(
    detections: List[Dict],
    parameters: Dict,
    ocr_only: bool = False
) -> str:
    """
    Format detection results as a markdown summary for the Gradio UI.

    Args:
        detections: List of detection dictionaries
        parameters: Detection parameters used
        ocr_only: Whether this was OCR-only mode

    Returns:
        Markdown-formatted summary string
    """
    lines = []

    if ocr_only:
        lines.append("**OCR-only mode**")
        lines.append(f"**Total OCR texts:** {len(detections)}")
    else:
        lines.append(f"**Total detections:** {len(detections)}")

    lines.append("")
    lines.append("**Settings:**")
    lines.append(f"- Confidence threshold: {parameters.get('confidence_threshold', 0.35):.2f}")

    enable_clip = parameters.get('enable_clip', False)
    enable_ocr = parameters.get('enable_ocr', True)
    enable_blip = parameters.get('enable_blip', False)
    blip_scope = parameters.get('blip_scope')
    line_thickness = parameters.get('line_thickness')

    lines.append(f"- CLIP classification: {'✅ Enabled' if enable_clip else '❌ Disabled'}")
    lines.append(f"- OCR text extraction: {'✅ Enabled' if enable_ocr or ocr_only else '❌ Disabled'}")
    if line_thickness is not None:
        lines.append(f"- Box line thickness: {line_thickness}")

    blip_text = f"- BLIP description: {'✅ Enabled' if enable_blip else '❌ Disabled'}"
    if enable_blip and blip_scope:
        scope_display = "All elements" if blip_scope == "all" else "Only image & button"
        blip_text += f" (scope: {scope_display})"
    lines.append(blip_text)

    # Add type distribution if CLIP is enabled
    if enable_clip and not ocr_only and len(detections) > 0:
        type_counts = build_type_distribution(detections)
        if type_counts:
            lines.append("")
            lines.append("**Element types:**")
            for typ, count in sorted(type_counts.items(), key=lambda x: -x[1]):
                lines.append(f"- {typ}: {count}")

    return "\n".join(lines)
def build_ocr_only_response(
    detections: List[Dict],
    image_width: int,
    image_height: int,
    annotated_image: Optional[np.ndarray] = None,
    confidence_threshold: float = 0.35,
    line_thickness: int = 2
) -> Dict:
    """
    Build a response specifically for OCR-only mode.

    Args:
        detections: List of OCR detections
        image_width: Original image width
        image_height: Original image height
        annotated_image: Optional annotated image (numpy array, RGB)
        confidence_threshold: Confidence threshold (for consistency in the response)
        line_thickness: Bounding-box line thickness (for consistency in the response)

    Returns:
        OCR-only response dictionary
    """
    response = {
        "success": True,
        "detections": detections,
        "total_detections": len(detections),
        "image_size": {"width": image_width, "height": image_height},
        "parameters": {
            "confidence_threshold": confidence_threshold,
            "line_thickness": line_thickness,
            "enable_clip": False,
            "enable_ocr": False,  # Not using the standard OCR flow
            "enable_blip": False,
            "blip_scope": None,
            "ocr_only": True
        },
        "type_distribution": None
    }

    # Add annotated image if provided
    if annotated_image is not None:
        img_bgr = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
        ok, png_bytes = cv2.imencode(".png", img_bgr)
        if ok:
            annotated_b64 = base64.b64encode(png_bytes.tobytes()).decode("ascii")
            response["annotated_image"] = {
                "mime": "image/png",
                "base64": annotated_b64
            }

    return response
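
# Example (sketch with hypothetical OCR detections; no annotated image is
# passed, so the "annotated_image" key is omitted from the result):
#
#   resp = build_ocr_only_response(
#       [{"text": "Sign in", "box": {"x1": 5, "y1": 5, "x2": 60, "y2": 20}}],
#       image_width=200, image_height=100,
#   )
#   assert resp["parameters"]["ocr_only"] is True
#   assert "annotated_image" not in resp
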
def build_simplified_response(
    analysis: Dict,
    image: Image.Image,
    annotated_image: Optional[np.ndarray] = None,
    confidence_threshold: float = 0.35,
    line_thickness: int = 2,
    enable_clip: bool = False,
    enable_ocr: bool = True,
    enable_blip: bool = False,
    blip_scope: Optional[str] = None,
    ocr_only: bool = False
) -> Dict:
    """
    Build a simplified detection response for API/UI in the format:

    {
        "detections": {
            "icon 0": {"type": "text", "bbox": [x1, y1, x2, y2], "interactivity": false, "content": "..."},
            "icon 1": {"type": "icon", "bbox": [x1, y1, x2, y2], "interactivity": true, "content": "..."}
        },
        "annotated_image": {"mime": "image/png", "base64": "..."}
    }

    Args:
        analysis: Detection analysis results from DetectionService or OCR handler
        image: Original PIL Image
        annotated_image: Optional annotated image (numpy array, RGB)
        confidence_threshold: Confidence threshold used
        line_thickness: Bounding-box line thickness used for annotation
        enable_clip: Whether CLIP classification was enabled
        enable_ocr: Whether OCR was enabled
        enable_blip: Whether BLIP captioning was enabled
        blip_scope: BLIP scope ("icons" or "all")
        ocr_only: Whether this was OCR-only mode

    Returns:
        Simplified response dictionary with a detections dict and annotated_image
    """
    # Extract detections
    detections = analysis.get("detections", [])
    image_width = analysis.get("image_size", {}).get("width", image.width)
    image_height = analysis.get("image_size", {}).get("height", image.height)

    # Interactive element types (buttons, inputs, icons, navigation, list items)
    interactive_types = {"button", "input", "icon", "navigation", "list_item"}

    # Build simplified detections dict
    simplified_detections = {}
    for idx, det in enumerate(detections):
        # Get bounding box and normalize to 0-1 coordinates
        box = det.get("box", {})
        x1 = box.get("x1", 0) / image_width
        y1 = box.get("y1", 0) / image_height
        x2 = box.get("x2", 0) / image_width
        y2 = box.get("y2", 0) / image_height

        # Get type from CLIP classification
        element_type = det.get("class_name", "")
        if not element_type:
            # Fallback: without a CLIP class, default to "text" if the element
            # has OCR text, else "icon"
            element_type = "text" if det.get("text", "").strip() else "icon"

        # Determine interactivity based on type
        is_interactive = element_type in interactive_types

        # Fuse text and description into content (text takes priority)
        text = det.get("text", "").strip()
        description = det.get("description", "").strip()
        content = text or description

        # Build simplified detection entry
        simplified_detections[f"icon {idx}"] = {
            "type": element_type,
            "bbox": [round(x1, 4), round(y1, 4), round(x2, 4), round(y2, 4)],
            "interactivity": is_interactive,
            "content": content
        }

    # Build response
    response = {
        "detections": simplified_detections
    }

    # Add annotated image if provided
    if annotated_image is not None:
        img_bgr = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR)
        ok, png_bytes = cv2.imencode(".png", img_bgr)
        if ok:
            annotated_b64 = base64.b64encode(png_bytes.tobytes()).decode("ascii")
            response["annotated_image"] = {
                "mime": "image/png",
                "base64": annotated_b64
            }

    return response
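
# Example (sketch with made-up inputs; bbox values come out normalized to the
# 0-1 range, and the detection without a class_name falls back to "text"
# because it carries OCR text):
#
#   analysis = {
#       "detections": [
#           {"box": {"x1": 0, "y1": 0, "x2": 100, "y2": 50},
#            "class_name": "button", "text": "Submit"},
#           {"box": {"x1": 100, "y1": 50, "x2": 200, "y2": 100}, "text": "Hello"},
#       ],
#       "image_size": {"width": 200, "height": 100},
#   }
#   resp = build_simplified_response(analysis, Image.new("RGB", (200, 100)))
#   # resp["detections"]["icon 0"] == {"type": "button",
#   #     "bbox": [0.0, 0.0, 0.5, 0.5], "interactivity": True, "content": "Submit"}
#   # resp["detections"]["icon 1"]["type"] == "text"  # fallback path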