File size: 13,580 Bytes
77da9e2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
"""
Shared Gradio Interface Factory

This module provides a reusable Gradio interface factory that works with
different detection backends (direct service or API client).

This eliminates code duplication between app.py and ui/gradio_interface.py
"""

import gradio as gr
from typing import Callable, Optional


def _handle_ocr_only_toggle(is_ocr_only: bool):
    """
    Compute control updates for a change of the OCR-only checkbox.

    With OCR-only switched on, the CLIP and BLIP checkboxes are unchecked
    and made non-interactive, the OCR checkbox is checked and made
    non-interactive, and the BLIP scope radio is reset to its first choice
    and hidden. With OCR-only switched off, the three checkboxes become
    interactive again (OCR is also re-checked) and the BLIP scope radio is
    kept hidden.

    Returns:
        4-tuple of ``gr.update`` payloads, in this order:
            - CLIP checkbox
            - OCR checkbox
            - BLIP checkbox
            - BLIP scope radio
    """
    if not is_ocr_only:
        # Leaving OCR-only mode: unlock the feature checkboxes.
        clip_update = gr.update(interactive=True)
        ocr_update = gr.update(value=True, interactive=True)
        blip_update = gr.update(interactive=True)
        scope_update = gr.update(visible=False)
    else:
        # Entering OCR-only mode: force OCR on, everything else off, and lock.
        clip_update = gr.update(value=False, interactive=False)
        ocr_update = gr.update(value=True, interactive=False)
        blip_update = gr.update(value=False, interactive=False)
        scope_update = gr.update(value="Only image & button", visible=False)

    return clip_update, ocr_update, blip_update, scope_update


def _build_title_markdown(
    title_suffix: str,
    show_api_info: bool,
    api_url: Optional[str],
) -> str:
    """Return the header markdown (title, pipeline overview, optional suffix/API line)."""
    title_parts = [
        "# 🎯 CU-1 UI Element Detector",
        "",
        "Detect interactive elements in screenshots and UI mockups.",
        "",
        "**Multi-Model Pipeline:**",
        "- 🔍 **RF-DETR** detects all UI elements (single class detection)",
        "- 🏷️ **CLIP** classifies elements into 6 types (button, input, text, image, list_item, navigation)",
        "- 📝 **OCR** extracts text content from detected elements",
        "- 🖼️ **BLIP** generates visual descriptions for icons",
    ]

    if title_suffix:
        title_parts.append("")
        title_parts.append(f"**{title_suffix}**")

    if show_api_info and api_url:
        title_parts.append("")
        title_parts.append(f"**API:** Connected to `{api_url}`")

    return "\n".join(title_parts)


def _build_footer_markdown(show_api_info: bool, api_url: Optional[str]) -> str:
    """Return the footer markdown (tips, preprocessing notes, architecture, deployment/API info)."""
    footer_parts = [
        "---",
        "### ⚡ Performance Tips",
        "",
        "- **Fast mode** (CLIP ❌, OCR ✅): ~30-40s - Good for text extraction",
        "- **Balanced mode** (CLIP ✅, OCR ✅): ~50-60s - Full classification + text",
        "- **Ultra-fast mode** (CLIP ❌, OCR ❌): ~25-35s - Just bounding boxes",
        "",
        "### 🎨 Cross-Device Preprocessing",
        "",
        "Testing on multiple devices (Samsung, Pixel, Oppo)? **Enable preprocessing** for consistent results!",
        "",
        "- **RF-DETR Optimized** (Recommended): Preserves ImageNet normalization, best for detection",
        "- **Generic Mode**: Aggressive normalization, best for OCR accuracy",
        "",
        "### 🏗️ Architecture",
        "",
        "**Single-Class Detection:** RF-DETR detects generic \"UI elements\" (one class)",
        "**Multi-Class Classification:** CLIP classifies detections into 6 specific types",
    ]

    if show_api_info and api_url:
        footer_parts.extend([
            "",
            "### 🔧 API Connection",
            "",
            f"This UI is a **client** of the API server at `{api_url}`",
            "",
            "**Communication:** HTTP/REST (multipart/form-data)",
            "**Separation:** UI layer is completely isolated from detection logic",
            "",
            "To change API endpoint:",
            "```bash",
            "export CU1_API_URL=http://your-api-server:8000",
            "python app_ui.py",
            "```",
        ])
    else:
        footer_parts.extend([
            "",
            "### 📦 Deployment",
            "",
            "This app uses direct detection service access (no API layer).",
            "Optimized for Hugging Face Spaces and local testing.",
        ])

    return "\n".join(footer_parts)


def create_interface(
    detection_fn: Callable,
    title_suffix: str = "",
    show_api_info: bool = False,
    api_url: Optional[str] = None
) -> gr.Blocks:
    """
    Create a Gradio interface with a pluggable detection function

    Args:
        detection_fn: Callback wired to the detect button. It is invoked with
                     eleven positional inputs, in this order:
                     (image, confidence, thickness, clip, ocr, blip, ocr_only,
                     blip_scope, preprocess_enabled, preprocess_mode,
                     preprocess_preset)
                     and must return (annotated_image, summary, json_data).
        title_suffix: Additional text for the title
        show_api_info: Whether to show API connection info
        api_url: API URL to display (only used when show_api_info=True)

    Returns:
        Gradio Blocks interface
    """

    with gr.Blocks(title="CU-1 UI Element Detector", theme=gr.themes.Soft()) as interface:

        # Header
        gr.Markdown(_build_title_markdown(title_suffix, show_api_info, api_url))

        with gr.Row():
            # Left column: input image and all settings
            with gr.Column(scale=1):
                input_image = gr.Image(
                    type="pil",
                    label="Upload Screenshot",
                    height=400,
                    sources=["upload"]
                )

                with gr.Accordion("Detection Settings", open=True):
                    confidence_slider = gr.Slider(
                        minimum=0.1,
                        maximum=0.9,
                        value=0.35,
                        step=0.05,
                        label="Confidence Threshold",
                        info="Lower = more elements detected"
                    )

                    thickness_slider = gr.Slider(
                        minimum=1,
                        maximum=6,
                        value=2,
                        step=1,
                        label="Box Line Thickness"
                    )

                with gr.Accordion("Feature Settings", open=True):
                    clip_checkbox = gr.Checkbox(
                        value=False,
                        label="Enable CLIP Classification",
                        info="Classify elements into types (slower but more informative)"
                    )

                    ocr_checkbox = gr.Checkbox(
                        value=True,
                        label="Enable OCR Text Extraction",
                        info="Extract text content from elements"
                    )

                    blip_checkbox = gr.Checkbox(
                        value=False,
                        label="Enable BLIP Description",
                        info="Generate visual descriptions for icons (slower)"
                    )

                    ocr_only_checkbox = gr.Checkbox(
                        value=False,
                        label="OCR-only (skip detection/classification)",
                        info="Run OCR across the whole image and return OCR boxes only"
                    )

                    # Hidden until the BLIP checkbox is ticked (see wiring below).
                    blip_scope_radio = gr.Radio(
                        choices=["Only image & button", "All elements"],
                        value="Only image & button",
                        label="BLIP Scope",
                        info="When to apply BLIP descriptions",
                        visible=False
                    )

                with gr.Accordion("🎨 Preprocessing (Cross-Device Consistency)", open=False):
                    preprocess_checkbox = gr.Checkbox(
                        value=False,
                        label="Enable Image Preprocessing",
                        info="Standardize screenshots from different devices (Samsung, Pixel, Oppo, etc.)"
                    )

                    # Mode and preset stay hidden until preprocessing is enabled.
                    preprocess_mode_radio = gr.Radio(
                        choices=["RF-DETR Optimized (Recommended)", "Generic (CLIP/OCR Focus)"],
                        value="RF-DETR Optimized (Recommended)",
                        label="Preprocessing Mode",
                        info="RF-DETR: Preserves ImageNet normalization | Generic: Aggressive for OCR",
                        visible=False
                    )

                    preprocess_preset_dropdown = gr.Dropdown(
                        choices=["gentle", "standard", "aggressive_denoise", "color_only"],
                        value="standard",
                        label="Preprocessing Preset",
                        info="gentle=minimal | standard=balanced | aggressive_denoise=strong | color_only=colors",
                        visible=False
                    )

                detect_button = gr.Button("🔍 Detect Elements", variant="primary", size="lg")

            # Right column: results and static API documentation
            with gr.Column(scale=1):
                output_image = gr.Image(
                    type="pil",
                    label="Detected Elements",
                    height=400
                )

                summary_output = gr.Markdown(label="Detection Summary")

                with gr.Accordion("Raw Results (JSON)", open=False):
                    json_output = gr.JSON(label="Detections JSON")

                with gr.Accordion("API Quickstart", open=False):
                    # Static help text; the component is never referenced again,
                    # so it is not bound to a name.
                    gr.Markdown(
                        value="\n".join([
                            "#### Call the Detection API",
                            "",
                            "```bash",
                            "curl -X POST \"https://your-space.hf.space/detect\" \\",
                            "  -H \"Authorization: Bearer <HF_TOKEN>\" \\",
                            "  -F \"image=@screenshot.png\" \\",
                            "  -F \"confidence_threshold=0.35\" \\",
                            "  -F \"enable_clip=true\" \\",
                            "  -F \"enable_ocr=true\"",
                            "```",
                            "",
                            "```python",
                            "import requests",
                            "",
                            "url = \"https://your-space.hf.space/detect\"",
                            "headers = {\"Authorization\": \"Bearer <HF_TOKEN>\"}",
                            "files = {\"image\": open(\"screenshot.png\", \"rb\")}",
                            "data = {",
                            "    \"confidence_threshold\": 0.35,",
                            "    \"enable_clip\": \"true\",",
                            "    \"enable_ocr\": \"true\"",
                            "}",
                            "resp = requests.post(url, files=files, data=data, headers=headers, timeout=120)",
                            "resp.raise_for_status()",
                            "print(resp.json())",
                            "```",
                            "",
                            "- Replace `your-space` with your Hugging Face Space slug.",
                            "- Add the `Authorization` header for private Spaces.",
                            "- Response payload includes bounding boxes, texts, and optional annotated image."
                        ])
                    )

        # Toggle BLIP scope visibility
        blip_checkbox.change(
            fn=lambda v: gr.update(visible=v),
            inputs=blip_checkbox,
            outputs=blip_scope_radio
        )

        # Handle OCR-only toggle to disable/enable related controls
        ocr_only_checkbox.change(
            fn=_handle_ocr_only_toggle,
            inputs=ocr_only_checkbox,
            outputs=[clip_checkbox, ocr_checkbox, blip_checkbox, blip_scope_radio]
        )

        # Toggle preprocessing options visibility
        def toggle_preprocess_options(enabled):
            """Show or hide the mode radio and preset dropdown together."""
            return gr.update(visible=enabled), gr.update(visible=enabled)

        preprocess_checkbox.change(
            fn=toggle_preprocess_options,
            inputs=preprocess_checkbox,
            outputs=[preprocess_mode_radio, preprocess_preset_dropdown]
        )

        # Update preset choices based on mode
        def update_preset_choices(mode):
            """Swap the preset list to match the selected mode and reset it to "standard"."""
            if "RF-DETR" in mode:
                return gr.update(
                    choices=["gentle", "standard", "aggressive_denoise", "color_only"],
                    value="standard",
                    info="gentle=minimal | standard=balanced | aggressive_denoise=strong | color_only=colors"
                )
            # Generic mode
            return gr.update(
                choices=["minimal", "standard", "aggressive", "ocr_optimized"],
                value="standard",
                info="minimal=light | standard=balanced | aggressive=maximum | ocr_optimized=best for text"
            )

        preprocess_mode_radio.change(
            fn=update_preset_choices,
            inputs=preprocess_mode_radio,
            outputs=preprocess_preset_dropdown
        )

        # Connect detection button; the input order here is the positional
        # argument order detection_fn receives.
        detect_button.click(
            fn=detection_fn,
            inputs=[
                input_image,
                confidence_slider,
                thickness_slider,
                clip_checkbox,
                ocr_checkbox,
                blip_checkbox,
                ocr_only_checkbox,
                blip_scope_radio,
                preprocess_checkbox,
                preprocess_mode_radio,
                preprocess_preset_dropdown
            ],
            outputs=[output_image, summary_output, json_output]
        )

        # Footer
        gr.Markdown(_build_footer_markdown(show_api_info, api_url))

    return interface