# Source: Hugging Face Space "AI-DrivenTesting / CU1-X" (status: Sleeping)
# File size: 13,580 bytes
# Commit: 77da9e2

"""
Shared Gradio Interface Factory

This module provides a reusable Gradio interface factory that works with
different detection backends (direct service or API client).

This eliminates code duplication between app.py and ui/gradio_interface.py.
"""

import gradio as gr
from typing import Callable, Optional


def _handle_ocr_only_toggle(is_ocr_only: bool):
    """
    Compute the control updates for a change of the OCR-only checkbox.

    The returned 4-tuple lines up, in order, with these components:
        1. CLIP checkbox
        2. OCR checkbox
        3. BLIP checkbox
        4. BLIP scope radio
    """
    if not is_ocr_only:
        # OCR-only switched off: make the feature checkboxes editable again,
        # turn OCR back on, and keep the BLIP scope radio hidden.
        clip_update = gr.update(interactive=True)
        ocr_update = gr.update(value=True, interactive=True)
        blip_update = gr.update(interactive=True)
        scope_update = gr.update(visible=False)
    else:
        # OCR-only switched on: lock all three feature checkboxes (CLIP and
        # BLIP off, OCR on) and reset + hide the BLIP scope radio.
        clip_update = gr.update(value=False, interactive=False)
        ocr_update = gr.update(value=True, interactive=False)
        blip_update = gr.update(value=False, interactive=False)
        scope_update = gr.update(value="Only image & button", visible=False)

    return clip_update, ocr_update, blip_update, scope_update


def _build_title_markdown(title_suffix: str, show_api_info: bool, api_url: Optional[str]) -> str:
    """Return the Markdown text rendered at the top of the interface."""
    title_parts = [
        "# 🎯 CU-1 UI Element Detector",
        "",
        "Detect interactive elements in screenshots and UI mockups.",
        "",
        "**Multi-Model Pipeline:**",
        "- 🔍 **RF-DETR** detects all UI elements (single class detection)",
        "- 🏷️ **CLIP** classifies elements into 6 types (button, input, text, image, list_item, navigation)",
        "- 📝 **OCR** extracts text content from detected elements",
        "- 🖼️ **BLIP** generates visual descriptions for icons",
    ]

    if title_suffix:
        title_parts.extend(["", f"**{title_suffix}**"])

    if show_api_info and api_url:
        title_parts.extend(["", f"**API:** Connected to `{api_url}`"])

    return "\n".join(title_parts)


def _build_api_quickstart_markdown() -> str:
    """Return the static Markdown for the "API Quickstart" accordion."""
    return "\n".join([
        "#### Call the Detection API",
        "",
        "```bash",
        "curl -X POST \"https://your-space.hf.space/detect\" \\",
        "  -H \"Authorization: Bearer <HF_TOKEN>\" \\",
        "  -F \"image=@screenshot.png\" \\",
        "  -F \"confidence_threshold=0.35\" \\",
        "  -F \"enable_clip=true\" \\",
        "  -F \"enable_ocr=true\"",
        "```",
        "",
        "```python",
        "import requests",
        "",
        "url = \"https://your-space.hf.space/detect\"",
        "headers = {\"Authorization\": \"Bearer <HF_TOKEN>\"}",
        "files = {\"image\": open(\"screenshot.png\", \"rb\")}",
        "data = {",
        "    \"confidence_threshold\": 0.35,",
        "    \"enable_clip\": \"true\",",
        "    \"enable_ocr\": \"true\"",
        "}",
        "resp = requests.post(url, files=files, data=data, headers=headers, timeout=120)",
        "resp.raise_for_status()",
        "print(resp.json())",
        "```",
        "",
        "- Replace `your-space` with your Hugging Face Space slug.",
        "- Add the `Authorization` header for private Spaces.",
        "- Response payload includes bounding boxes, texts, and optional annotated image.",
    ])


def _build_footer_markdown(show_api_info: bool, api_url: Optional[str]) -> str:
    """Return the Markdown text rendered below the main controls."""
    footer_parts = [
        "---",
        "### ⚡ Performance Tips",
        "",
        "- **Fast mode** (CLIP ❌, OCR ✅): ~30-40s - Good for text extraction",
        "- **Balanced mode** (CLIP ✅, OCR ✅): ~50-60s - Full classification + text",
        "- **Ultra-fast mode** (CLIP ❌, OCR ❌): ~25-35s - Just bounding boxes",
        "",
        "### 🎨 Cross-Device Preprocessing",
        "",
        "Testing on multiple devices (Samsung, Pixel, Oppo)? **Enable preprocessing** for consistent results!",
        "",
        "- **RF-DETR Optimized** (Recommended): Preserves ImageNet normalization, best for detection",
        "- **Generic Mode**: Aggressive normalization, best for OCR accuracy",
        "",
        "### 🏗️ Architecture",
        "",
        # Bullet items: a single newline between two plain lines is a soft
        # break in Markdown, which would merge them into one paragraph.
        "- **Single-Class Detection:** RF-DETR detects generic \"UI elements\" (one class)",
        "- **Multi-Class Classification:** CLIP classifies detections into 6 specific types",
    ]

    if show_api_info and api_url:
        footer_parts.extend([
            "",
            "### 🔧 API Connection",
            "",
            f"This UI is a **client** of the API server at `{api_url}`",
            "",
            # Bullet items for the same soft-break reason as above.
            "- **Communication:** HTTP/REST (multipart/form-data)",
            "- **Separation:** UI layer is completely isolated from detection logic",
            "",
            "To change API endpoint:",
            "```bash",
            "export CU1_API_URL=http://your-api-server:8000",
            "python app_ui.py",
            "```",
        ])
    else:
        footer_parts.extend([
            "",
            "### 📦 Deployment",
            "",
            "This app uses direct detection service access (no API layer).",
            "Optimized for Hugging Face Spaces and local testing.",
        ])

    return "\n".join(footer_parts)


def create_interface(
    detection_fn: Callable,
    title_suffix: str = "",
    show_api_info: bool = False,
    api_url: Optional[str] = None
) -> gr.Blocks:
    """
    Create a Gradio interface with a pluggable detection function

    Args:
        detection_fn: Callback wired to the "Detect Elements" button. It is
                     called with 11 positional inputs, in this order:
                     (image, confidence, thickness, clip, ocr, blip, ocr_only,
                     blip_scope, preprocess, preprocess_mode, preprocess_preset)
                     and must return (annotated_image, summary, json_data).
                     The image input is configured with type="pil"; the mode,
                     scope and preset values are the UI label strings.
        title_suffix: Additional text for the title
        show_api_info: Whether to show API connection info
        api_url: API URL to display (if show_api_info=True)

    Returns:
        Gradio Blocks interface
    """
    # Preset catalogue per preprocessing mode. Defined once so the dropdown's
    # initial state and the mode-change handler cannot drift apart.
    rfdetr_mode_label = "RF-DETR Optimized (Recommended)"
    generic_mode_label = "Generic (CLIP/OCR Focus)"
    rfdetr_presets = ("gentle", "standard", "aggressive_denoise", "color_only")
    rfdetr_preset_info = "gentle=minimal | standard=balanced | aggressive_denoise=strong | color_only=colors"
    generic_presets = ("minimal", "standard", "aggressive", "ocr_optimized")
    generic_preset_info = "minimal=light | standard=balanced | aggressive=maximum | ocr_optimized=best for text"

    with gr.Blocks(title="CU-1 UI Element Detector", theme=gr.themes.Soft()) as interface:

        gr.Markdown(_build_title_markdown(title_suffix, show_api_info, api_url))

        with gr.Row():
            # Left column: input image and all settings
            with gr.Column(scale=1):
                input_image = gr.Image(
                    type="pil",
                    label="Upload Screenshot",
                    height=400,
                    sources=["upload"]
                )

                with gr.Accordion("Detection Settings", open=True):
                    confidence_slider = gr.Slider(
                        minimum=0.1,
                        maximum=0.9,
                        value=0.35,
                        step=0.05,
                        label="Confidence Threshold",
                        info="Lower = more elements detected"
                    )

                    thickness_slider = gr.Slider(
                        minimum=1,
                        maximum=6,
                        value=2,
                        step=1,
                        label="Box Line Thickness"
                    )

                with gr.Accordion("Feature Settings", open=True):
                    clip_checkbox = gr.Checkbox(
                        value=False,
                        label="Enable CLIP Classification",
                        info="Classify elements into types (slower but more informative)"
                    )

                    ocr_checkbox = gr.Checkbox(
                        value=True,
                        label="Enable OCR Text Extraction",
                        info="Extract text content from elements"
                    )

                    blip_checkbox = gr.Checkbox(
                        value=False,
                        label="Enable BLIP Description",
                        info="Generate visual descriptions for icons (slower)"
                    )

                    ocr_only_checkbox = gr.Checkbox(
                        value=False,
                        label="OCR-only (skip detection/classification)",
                        info="Run OCR across the whole image and return OCR boxes only"
                    )

                    # Hidden until the BLIP checkbox is enabled (see handler below)
                    blip_scope_radio = gr.Radio(
                        choices=["Only image & button", "All elements"],
                        value="Only image & button",
                        label="BLIP Scope",
                        info="When to apply BLIP descriptions",
                        visible=False
                    )

                with gr.Accordion("🎨 Preprocessing (Cross-Device Consistency)", open=False):
                    preprocess_checkbox = gr.Checkbox(
                        value=False,
                        label="Enable Image Preprocessing",
                        info="Standardize screenshots from different devices (Samsung, Pixel, Oppo, etc.)"
                    )

                    # Mode and preset stay hidden until preprocessing is enabled
                    preprocess_mode_radio = gr.Radio(
                        choices=[rfdetr_mode_label, generic_mode_label],
                        value=rfdetr_mode_label,
                        label="Preprocessing Mode",
                        info="RF-DETR: Preserves ImageNet normalization | Generic: Aggressive for OCR",
                        visible=False
                    )

                    preprocess_preset_dropdown = gr.Dropdown(
                        choices=list(rfdetr_presets),
                        value="standard",
                        label="Preprocessing Preset",
                        info=rfdetr_preset_info,
                        visible=False
                    )

                detect_button = gr.Button("🔍 Detect Elements", variant="primary", size="lg")

            # Right column: results and API help
            with gr.Column(scale=1):
                output_image = gr.Image(
                    type="pil",
                    label="Detected Elements",
                    height=400
                )

                summary_output = gr.Markdown(label="Detection Summary")

                with gr.Accordion("Raw Results (JSON)", open=False):
                    json_output = gr.JSON(label="Detections JSON")

                with gr.Accordion("API Quickstart", open=False):
                    gr.Markdown(value=_build_api_quickstart_markdown())

        # Toggle BLIP scope visibility
        blip_checkbox.change(
            fn=lambda v: gr.update(visible=v),
            inputs=blip_checkbox,
            outputs=blip_scope_radio
        )

        # Handle OCR-only toggle to disable/enable related controls
        ocr_only_checkbox.change(
            fn=_handle_ocr_only_toggle,
            inputs=ocr_only_checkbox,
            outputs=[clip_checkbox, ocr_checkbox, blip_checkbox, blip_scope_radio]
        )

        # Toggle preprocessing options visibility
        def toggle_preprocess_options(enabled):
            return gr.update(visible=enabled), gr.update(visible=enabled)

        preprocess_checkbox.change(
            fn=toggle_preprocess_options,
            inputs=preprocess_checkbox,
            outputs=[preprocess_mode_radio, preprocess_preset_dropdown]
        )

        # Update preset choices based on mode; the selection is reset to
        # "standard", which exists in both preset lists.
        def update_preset_choices(mode):
            if "RF-DETR" in mode:
                return gr.update(
                    choices=list(rfdetr_presets),
                    value="standard",
                    info=rfdetr_preset_info
                )
            # Generic mode
            return gr.update(
                choices=list(generic_presets),
                value="standard",
                info=generic_preset_info
            )

        preprocess_mode_radio.change(
            fn=update_preset_choices,
            inputs=preprocess_mode_radio,
            outputs=preprocess_preset_dropdown
        )

        # Connect detection button; the order of `inputs` is the positional
        # argument order detection_fn is called with.
        detect_button.click(
            fn=detection_fn,
            inputs=[
                input_image,
                confidence_slider,
                thickness_slider,
                clip_checkbox,
                ocr_checkbox,
                blip_checkbox,
                ocr_only_checkbox,
                blip_scope_radio,
                preprocess_checkbox,
                preprocess_mode_radio,
                preprocess_preset_dropdown
            ],
            outputs=[output_image, summary_output, json_output]
        )

        gr.Markdown(_build_footer_markdown(show_api_info, api_url))

    return interface