""" Response Builder - Standardized Response Formatting This module provides utilities for formatting detection results into standardized response formats for API and UI consumption. """ import base64 import cv2 import numpy as np from typing import Dict, List, Optional, Any from PIL import Image def build_detection_response( analysis: Dict, image: Image.Image, annotated_image: Optional[np.ndarray] = None, confidence_threshold: float = 0.35, line_thickness: int = 2, enable_clip: bool = False, enable_ocr: bool = True, enable_blip: bool = False, blip_scope: Optional[str] = None, ocr_only: bool = False, include_annotated_image: bool = True ) -> Dict: """ Build standardized detection response for API/UI Args: analysis: Detection analysis results from DetectionService or OCR handler image: Original PIL Image annotated_image: Optional annotated image (numpy array, RGB) confidence_threshold: Confidence threshold used enable_clip: Whether CLIP classification was enabled enable_ocr: Whether OCR was enabled enable_blip: Whether BLIP was enabled blip_scope: BLIP scope ("icons" or "all") ocr_only: Whether this was OCR-only mode include_annotated_image: Whether to include base64-encoded annotated image Returns: Standardized response dictionary with detections, metadata, and parameters """ # Extract detections detections = analysis.get("detections", []) # Build type distribution if CLIP is enabled type_counts = None if enable_clip and not ocr_only: type_counts = build_type_distribution(detections) # Prepare response response = { "success": True, "detections": detections, "total_detections": len(detections), "image_size": analysis.get("image_size", {"width": image.width, "height": image.height}), "parameters": { "confidence_threshold": confidence_threshold, "line_thickness": line_thickness, "enable_clip": enable_clip if not ocr_only else False, "enable_ocr": enable_ocr if not ocr_only else False, "enable_blip": enable_blip if not ocr_only else False, "blip_scope": blip_scope if enable_blip and not ocr_only else None, "ocr_only": ocr_only }, "type_distribution": type_counts } # Add annotated image if requested if include_annotated_image and annotated_image is not None: # Encode as base64 PNG img_bgr = cv2.cvtColor(annotated_image, cv2.COLOR_RGB2BGR) ok, png_bytes = cv2.imencode(".png", img_bgr) if ok: annotated_b64 = base64.b64encode(png_bytes.tobytes()).decode("ascii") response["annotated_image"] = { "mime": "image/png", "base64": annotated_b64 } return response def build_type_distribution(detections: List[Dict]) -> Dict[str, int]: """ Build element type distribution from detections Args: detections: List of detection dictionaries with class_name field Returns: Dictionary mapping class names to counts """ type_counts = {} for det in detections: class_name = det.get("class_name", "") if class_name: # Only count if class_name is not empty type_counts[class_name] = type_counts.get(class_name, 0) + 1 return type_counts def format_summary_text( detections: List[Dict], parameters: Dict, ocr_only: bool = False ) -> str: """ Format detection results as markdown summary text for Gradio UI Args: detections: List of detection dictionaries parameters: Detection parameters used ocr_only: Whether this was OCR-only mode Returns: Markdown-formatted summary string """ lines = [] if ocr_only: lines.append("**OCR-only mode**") lines.append(f"**Total OCR texts:** {len(detections)}") else: lines.append(f"**Total detections:** {len(detections)}") lines.append("") lines.append("**Settings:**") lines.append(f"- Confidence 
def format_summary_text(
    detections: List[Dict],
    parameters: Dict,
    ocr_only: bool = False
) -> str:
    """
    Format detection results as a markdown summary for the Gradio UI.

    Args:
        detections: List of detection dictionaries
        parameters: Detection parameters used
        ocr_only: Whether this was OCR-only mode

    Returns:
        Markdown-formatted summary string
    """
    lines = []

    if ocr_only:
        lines.append("**OCR-only mode**")
        lines.append(f"**Total OCR texts:** {len(detections)}")
    else:
        lines.append(f"**Total detections:** {len(detections)}")

    lines.append("")
    lines.append("**Settings:**")
    lines.append(f"- Confidence threshold: {parameters.get('confidence_threshold', 0.35):.2f}")

    enable_clip = parameters.get('enable_clip', False)
    enable_ocr = parameters.get('enable_ocr', True)
    enable_blip = parameters.get('enable_blip', False)
    blip_scope = parameters.get('blip_scope')
    line_thickness = parameters.get('line_thickness')

    lines.append(f"- CLIP classification: {'✅ Enabled' if enable_clip else '❌ Disabled'}")
    lines.append(f"- OCR text extraction: {'✅ Enabled' if enable_ocr or ocr_only else '❌ Disabled'}")
    if line_thickness is not None:
        lines.append(f"- Box line thickness: {line_thickness}")

    blip_text = f"- BLIP description: {'✅ Enabled' if enable_blip else '❌ Disabled'}"
    if enable_blip and blip_scope:
        scope_display = "All elements" if blip_scope == "all" else "Only image & button"
        blip_text += f" (scope: {scope_display})"
    lines.append(blip_text)

    # Add type distribution if CLIP is enabled
    if enable_clip and not ocr_only and len(detections) > 0:
        type_counts = build_type_distribution(detections)
        if type_counts:
            lines.append("")
            lines.append("**Element types:**")
            for typ, count in sorted(type_counts.items(), key=lambda x: -x[1]):
                lines.append(f"- {typ}: {count}")

    return "\n".join(lines)
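
# Example usage (illustrative sketch; reuses the hypothetical `response` built
# in the build_detection_response example above):
#
#     summary = format_summary_text(response["detections"], response["parameters"])
#     # "**Total detections:** 1\n\n**Settings:**\n- Confidence threshold: 0.35\n..."
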
def build_ocr_only_response(
    detections: List[Dict],
    image_width: int,
    image_height: int,
    annotated_image: Optional[np.ndarray] = None,
    confidence_threshold: float = 0.35,
    line_thickness: int = 2
) -> Dict:
    """
    Build a response specifically for OCR-only mode.

    Args:
        detections: List of OCR detections
        image_width: Original image width
        image_height: Original image height
        annotated_image: Optional annotated image (numpy array, RGB)
        confidence_threshold: Confidence threshold (for consistency in response)
        line_thickness: Bounding-box line thickness (for consistency in response)

    Returns:
        OCR-only response dictionary
    """
    response = {
        "success": True,
        "detections": detections,
        "total_detections": len(detections),
        "image_size": {"width": image_width, "height": image_height},
        "parameters": {
            "confidence_threshold": confidence_threshold,
            "line_thickness": line_thickness,
            "enable_clip": False,
            "enable_ocr": False,  # Not using the standard OCR flow
            "enable_blip": False,
            "blip_scope": None,
            "ocr_only": True
        },
        "type_distribution": None
    }

    # Add annotated image if provided
    if annotated_image is not None:
        encoded = _encode_annotated_image(annotated_image)
        if encoded is not None:
            response["annotated_image"] = encoded

    return response


def build_simplified_response(
    analysis: Dict,
    image: Image.Image,
    annotated_image: Optional[np.ndarray] = None,
    confidence_threshold: float = 0.35,
    line_thickness: int = 2,
    enable_clip: bool = False,
    enable_ocr: bool = True,
    enable_blip: bool = False,
    blip_scope: Optional[str] = None,
    ocr_only: bool = False
) -> Dict:
    """
    Build a simplified detection response for API/UI with the format:

    {
        "detections": {
            "icon 0": {"type": "text", "bbox": [x1, y1, x2, y2], "interactivity": false, "content": "..."},
            "icon 1": {"type": "icon", "bbox": [x1, y1, x2, y2], "interactivity": true, "content": "..."}
        },
        "annotated_image": {"mime": "image/png", "base64": "..."}
    }

    Bounding-box coordinates are normalized to the 0-1 range. The detection
    flags are accepted for signature parity with build_detection_response but
    do not appear in the simplified output.

    Args:
        analysis: Detection analysis results from DetectionService or OCR handler
        image: Original PIL Image
        annotated_image: Optional annotated image (numpy array, RGB)
        confidence_threshold: Confidence threshold used
        line_thickness: Bounding-box line thickness used for annotation
        enable_clip: Whether CLIP classification was enabled
        enable_ocr: Whether OCR was enabled
        enable_blip: Whether BLIP was enabled
        blip_scope: BLIP scope ("icons" or "all")
        ocr_only: Whether this was OCR-only mode

    Returns:
        Simplified response dictionary with detections dict and annotated_image
    """
    # Extract detections
    detections = analysis.get("detections", [])
    image_width = analysis.get("image_size", {}).get("width", image.width)
    image_height = analysis.get("image_size", {}).get("height", image.height)

    # Interactive element types (buttons, inputs, icons, navigation, list items)
    interactive_types = {"button", "input", "icon", "navigation", "list_item"}

    # Build simplified detections dict
    simplified_detections = {}
    for idx, det in enumerate(detections):
        # Get bounding box and normalize to 0-1 coordinates
        box = det.get("box", {})
        x1 = box.get("x1", 0) / image_width
        y1 = box.get("y1", 0) / image_height
        x2 = box.get("x2", 0) / image_width
        y2 = box.get("y2", 0) / image_height

        # Get type from CLIP classification
        element_type = det.get("class_name", "")
        if not element_type:
            # Fallback: without a CLIP classification, default to "text" if the
            # detection carries text, otherwise "icon"
            element_type = "text" if det.get("text", "").strip() else "icon"

        # Determine interactivity based on type
        is_interactive = element_type in interactive_types

        # Fuse text and description into content (text takes priority)
        text = det.get("text", "").strip()
        description = det.get("description", "").strip()
        if text:
            content = text
        elif description:
            content = description
        else:
            content = ""

        # Build simplified detection entry
        simplified_detections[f"icon {idx}"] = {
            "type": element_type,
            "bbox": [round(x1, 4), round(y1, 4), round(x2, 4), round(y2, 4)],
            "interactivity": is_interactive,
            "content": content
        }

    # Build response
    response = {
        "detections": simplified_detections
    }

    # Add annotated image if provided
    if annotated_image is not None:
        encoded = _encode_annotated_image(annotated_image)
        if encoded is not None:
            response["annotated_image"] = encoded

    return response
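

if __name__ == "__main__":
    # Minimal smoke test (illustrative only: the detection below is hypothetical;
    # real analysis dicts come from DetectionService or the OCR handler).
    demo_analysis = {
        "detections": [
            {"class_name": "button", "text": "Submit",
             "box": {"x1": 10, "y1": 20, "x2": 110, "y2": 60}},
        ],
        "image_size": {"width": 800, "height": 600},
    }
    demo_image = Image.new("RGB", (800, 600))
    simplified = build_simplified_response(demo_analysis, demo_image, enable_clip=True)
    # Expect one "icon 0" entry with a normalized bbox and interactivity=True
    print(simplified["detections"]["icon 0"])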