""" Shared Gradio Interface Factory This module provides a reusable Gradio interface factory that works with different detection backends (direct service or API client). This eliminates code duplication between app.py and ui/gradio_interface.py """ import os import gradio as gr from typing import Callable, Optional def _handle_ocr_only_toggle(is_ocr_only: bool): """ Update dependent controls when OCR-only mode is toggled. Returns tuple of updates for: - CLIP checkbox - OCR checkbox - BLIP checkbox - BLIP scope radio """ if is_ocr_only: return ( gr.update(value=False, interactive=False), gr.update(value=True, interactive=False), gr.update(value=False, interactive=False), gr.update(value="Only image & button", visible=False), ) return ( gr.update(interactive=True), gr.update(value=True, interactive=True), gr.update(interactive=True), gr.update(visible=False), ) def create_interface( detection_fn: Callable, title_suffix: str = "", show_api_info: bool = False, api_url: Optional[str] = None ) -> gr.Blocks: """ Create a Gradio interface with a pluggable detection function Args: detection_fn: Function that takes (image, confidence, thickness, clip, ocr, blip, ocr_only, blip_scope) and returns (annotated_image, summary, json_data) title_suffix: Additional text for the title show_api_info: Whether to show API connection info api_url: API URL to display (if show_api_info=True) Returns: Gradio Blocks interface """ with gr.Blocks(title="CU-1 UI Element Detector", theme=gr.themes.Soft()) as interface: # Build title markdown title_parts = [ "# 🎯 CU-1 UI Element Detector", "", "Detect interactive elements in screenshots and UI mockups.", "", "**Multi-Model Pipeline:**", "- 🔍 **RF-DETR** detects all UI elements (single class detection)", "- 🏷️ **CLIP** classifies elements into 6 types (button, input, text, image, list_item, navigation)", "- 📝 **OCR** extracts text content from detected elements", "- 🖼️ **BLIP** generates visual descriptions for icons" ] if title_suffix: title_parts.append("") title_parts.append(f"**{title_suffix}**") if show_api_info and api_url: title_parts.append("") title_parts.append(f"**API:** Connected to `{api_url}`") gr.Markdown("\n".join(title_parts)) with gr.Row(): with gr.Column(scale=1): input_image = gr.Image( type="pil", label="Upload Screenshot", height=400, sources=["upload"] ) with gr.Accordion("Detection Settings", open=True): confidence_slider = gr.Slider( minimum=0.1, maximum=0.9, value=0.35, step=0.05, label="Confidence Threshold", info="Lower = more elements detected" ) thickness_slider = gr.Slider( minimum=1, maximum=6, value=2, step=1, label="Box Line Thickness" ) with gr.Accordion("Feature Settings", open=True): clip_checkbox = gr.Checkbox( value=False, label="Enable CLIP Classification", info="Classify elements into types (slower but more informative)" ) ocr_checkbox = gr.Checkbox( value=True, label="Enable OCR Text Extraction", info="Extract text content from elements" ) blip_checkbox = gr.Checkbox( value=False, label="Enable BLIP Description", info="Generate visual descriptions for icons (slower)" ) ocr_only_checkbox = gr.Checkbox( value=False, label="OCR-only (skip detection/classification)", info="Run OCR across the whole image and return OCR boxes only" ) blip_scope_radio = gr.Radio( choices=["Only image & button", "All elements"], value="Only image & button", label="BLIP Scope", info="When to apply BLIP descriptions", visible=False ) with gr.Accordion("🎨 Preprocessing (Cross-Device Consistency)", open=False): preprocess_checkbox = gr.Checkbox( value=False, 
label="Enable Image Preprocessing", info="Standardize screenshots from different devices (Samsung, Pixel, Oppo, etc.)" ) preprocess_mode_radio = gr.Radio( choices=["RF-DETR Optimized (Recommended)", "Generic (CLIP/OCR Focus)"], value="RF-DETR Optimized (Recommended)", label="Preprocessing Mode", info="RF-DETR: Preserves ImageNet normalization | Generic: Aggressive for OCR", visible=False ) preprocess_preset_dropdown = gr.Dropdown( choices=["gentle", "standard", "aggressive_denoise", "color_only"], value="standard", label="Preprocessing Preset", info="gentle=minimal | standard=balanced | aggressive_denoise=strong | color_only=colors", visible=False ) detect_button = gr.Button("🔍 Detect Elements", variant="primary", size="lg") with gr.Column(scale=1): output_image = gr.Image( type="pil", label="Detected Elements", height=400 ) summary_output = gr.Markdown(label="Detection Summary") with gr.Accordion("Raw Results (JSON)", open=False): json_output = gr.Code(label="Detections JSON", language="json") with gr.Accordion("API Quickstart", open=False): api_docs = gr.Markdown( value="\n".join([ "#### Call the Detection API", "", "```bash", "curl -X POST \"https://your-space.hf.space/detect\" \\", " -H \"Authorization: Bearer \" \\", " -F \"image=@screenshot.png\" \\", " -F \"confidence_threshold=0.35\" \\", " -F \"enable_clip=true\" \\", " -F \"enable_ocr=true\"", "```", "", "```python", "import requests", "", "url = \"https://your-space.hf.space/detect\"", "headers = {\"Authorization\": \"Bearer \"}", "files = {\"image\": open(\"screenshot.png\", \"rb\")}", "data = {", " \"confidence_threshold\": 0.35,", " \"enable_clip\": \"true\",", " \"enable_ocr\": \"true\"", "}", "resp = requests.post(url, files=files, data=data, headers=headers, timeout=120)", "resp.raise_for_status()", "print(resp.json())", "```", "", "- Replace `your-space` with your Hugging Face Space slug.", "- Add the `Authorization` header for private Spaces.", "- Response payload includes bounding boxes, texts, and optional annotated image." 
                        ])
                    )

        # Toggle BLIP scope visibility with the BLIP checkbox
        blip_checkbox.change(
            fn=lambda v: gr.update(visible=v),
            inputs=blip_checkbox,
            outputs=blip_scope_radio
        )

        # Handle OCR-only toggle to disable/enable related controls
        ocr_only_checkbox.change(
            fn=_handle_ocr_only_toggle,
            inputs=ocr_only_checkbox,
            outputs=[clip_checkbox, ocr_checkbox, blip_checkbox, blip_scope_radio]
        )

        # Toggle preprocessing options visibility
        def toggle_preprocess_options(enabled):
            return gr.update(visible=enabled), gr.update(visible=enabled)

        preprocess_checkbox.change(
            fn=toggle_preprocess_options,
            inputs=preprocess_checkbox,
            outputs=[preprocess_mode_radio, preprocess_preset_dropdown]
        )

        # Update preset choices based on mode
        def update_preset_choices(mode):
            if "RF-DETR" in mode:
                return gr.update(
                    choices=["gentle", "standard", "aggressive_denoise", "color_only"],
                    value="standard",
                    info="gentle=minimal | standard=balanced | aggressive_denoise=strong | color_only=colors"
                )
            else:  # Generic mode
                return gr.update(
                    choices=["minimal", "standard", "aggressive", "ocr_optimized"],
                    value="standard",
                    info="minimal=light | standard=balanced | aggressive=maximum | ocr_optimized=best for text"
                )

        preprocess_mode_radio.change(
            fn=update_preset_choices,
            inputs=preprocess_mode_radio,
            outputs=preprocess_preset_dropdown
        )

        # Connect detection button.
        # api_name exposes this function as the /api/predict endpoint on Hugging Face Spaces.
        detect_button.click(
            fn=detection_fn,
            inputs=[
                input_image,
                confidence_slider,
                thickness_slider,
                clip_checkbox,
                ocr_checkbox,
                blip_checkbox,
                ocr_only_checkbox,
                blip_scope_radio,
                preprocess_checkbox,
                preprocess_mode_radio,
                preprocess_preset_dropdown
            ],
            outputs=[output_image, summary_output, json_output],
            api_name="predict",    # Expose as /api/predict endpoint
            show_progress="full"   # Show progress to user during long operations
        )

        # Build footer markdown
        footer_parts = [
            "---",
            "### ⚡ Performance Tips",
            "",
            "- **Fast mode** (CLIP ❌, OCR ✅): ~30-40s - good for text extraction",
            "- **Balanced mode** (CLIP ✅, OCR ✅): ~50-60s - full classification + text",
            "- **Ultra-fast mode** (CLIP ❌, OCR ❌): ~25-35s - just bounding boxes",
            "",
            "### 🎨 Cross-Device Preprocessing",
            "",
            "Testing on multiple devices (Samsung, Pixel, Oppo)? **Enable preprocessing** for consistent results!",
            "",
            "- **RF-DETR Optimized** (recommended): preserves ImageNet normalization, best for detection",
            "- **Generic Mode**: aggressive normalization, best for OCR accuracy",
            "",
            "### 🏗️ Architecture",
            "",
            "**Single-Class Detection:** RF-DETR detects generic \"UI elements\" (one class)",
            "",
            "**Multi-Class Classification:** CLIP classifies detections into 6 specific types",
        ]
        if show_api_info and api_url:
            footer_parts.extend([
                "",
                "### 🔧 API Connection",
                "",
                f"This UI is a **client** of the API server at `{api_url}`.",
                "",
                "**Communication:** HTTP/REST (multipart/form-data)",
                "",
                "**Separation:** the UI layer is completely isolated from detection logic",
                "",
                "To change the API endpoint:",
                "```bash",
                "export CU1_API_URL=http://your-api-server:8000",
                "python app_ui.py",
                "```",
            ])
        else:
            footer_parts.extend([
                "",
                "### 📦 Deployment",
                "",
                "This app uses direct detection-service access (no API layer).",
                "Optimized for Hugging Face Spaces and local testing.",
            ])

        gr.Markdown("\n".join(footer_parts))

    return interface
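
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only; `_stub_detection_fn` is a hypothetical
# backend, not part of app.py or ui/gradio_interface.py): smoke-test the
# layout without loading any models by wiring a stub that matches the
# 11-input signature passed to detect_button.click() above. Once launched,
# the same handler is also reachable remotely via gradio_client at
# api_name="/predict".
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    import json

    def _stub_detection_fn(image, confidence, thickness, enable_clip,
                           enable_ocr, enable_blip, ocr_only, blip_scope,
                           preprocess, preprocess_mode, preprocess_preset):
        # Echo the input image and report the chosen settings instead of
        # running RF-DETR/CLIP/OCR/BLIP.
        summary = (
            f"**Stub run:** confidence={confidence}, thickness={thickness}, "
            f"clip={enable_clip}, ocr={enable_ocr}, blip={enable_blip}, "
            f"ocr_only={ocr_only}"
        )
        return image, summary, json.dumps({"detections": []}, indent=2)

    create_interface(_stub_detection_fn, title_suffix="Stub Backend").launch()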