"""
Shared Gradio Interface Factory
This module provides a reusable Gradio interface factory that works with
different detection backends (direct service or API client).
This eliminates code duplication between app.py and ui/gradio_interface.py
"""
import gradio as gr
from typing import Callable, Optional
def _handle_ocr_only_toggle(is_ocr_only: bool):
    """
    Update dependent controls when OCR-only mode is toggled.

    Returns a tuple of updates for:
    - CLIP checkbox
    - OCR checkbox
    - BLIP checkbox
    - BLIP scope radio
    """
    if is_ocr_only:
        # OCR-only mode: lock CLIP and BLIP off, force OCR on, and hide
        # the BLIP scope radio.
        return (
            gr.update(value=False, interactive=False),
            gr.update(value=True, interactive=False),
            gr.update(value=False, interactive=False),
            gr.update(value="Only image & button", visible=False),
        )
    # Normal mode: re-enable all checkboxes (OCR stays on by default).
    # The BLIP scope radio remains hidden because BLIP was forced off;
    # it reappears when the user re-checks the BLIP checkbox.
    return (
        gr.update(interactive=True),
        gr.update(value=True, interactive=True),
        gr.update(interactive=True),
        gr.update(visible=False),
    )
def create_interface(
detection_fn: Callable,
title_suffix: str = "",
show_api_info: bool = False,
api_url: Optional[str] = None
) -> gr.Blocks:
"""
Create a Gradio interface with a pluggable detection function
Args:
detection_fn: Function that takes (image, confidence, thickness, clip, ocr, blip, ocr_only, blip_scope)
and returns (annotated_image, summary, json_data)
title_suffix: Additional text for the title
show_api_info: Whether to show API connection info
api_url: API URL to display (if show_api_info=True)
Returns:
Gradio Blocks interface
"""
with gr.Blocks(title="CU-1 UI Element Detector", theme=gr.themes.Soft()) as interface:
# Build title markdown
title_parts = [
"# 🎯 CU-1 UI Element Detector",
"",
"Detect interactive elements in screenshots and UI mockups.",
"",
"**Multi-Model Pipeline:**",
"- πŸ” **RF-DETR** detects all UI elements (single class detection)",
"- 🏷️ **CLIP** classifies elements into 6 types (button, input, text, image, list_item, navigation)",
"- πŸ“ **OCR** extracts text content from detected elements",
"- πŸ–ΌοΈ **BLIP** generates visual descriptions for icons"
]
if title_suffix:
title_parts.append("")
title_parts.append(f"**{title_suffix}**")
if show_api_info and api_url:
title_parts.append("")
title_parts.append(f"**API:** Connected to `{api_url}`")
gr.Markdown("\n".join(title_parts))
with gr.Row():
with gr.Column(scale=1):
input_image = gr.Image(
type="pil",
label="Upload Screenshot",
height=400,
sources=["upload"]
)
with gr.Accordion("Detection Settings", open=True):
confidence_slider = gr.Slider(
minimum=0.1,
maximum=0.9,
value=0.35,
step=0.05,
label="Confidence Threshold",
info="Lower = more elements detected"
)
thickness_slider = gr.Slider(
minimum=1,
maximum=6,
value=2,
step=1,
label="Box Line Thickness"
)
with gr.Accordion("Feature Settings", open=True):
clip_checkbox = gr.Checkbox(
value=False,
label="Enable CLIP Classification",
info="Classify elements into types (slower but more informative)"
)
ocr_checkbox = gr.Checkbox(
value=True,
label="Enable OCR Text Extraction",
info="Extract text content from elements"
)
blip_checkbox = gr.Checkbox(
value=False,
label="Enable BLIP Description",
info="Generate visual descriptions for icons (slower)"
)
ocr_only_checkbox = gr.Checkbox(
value=False,
label="OCR-only (skip detection/classification)",
info="Run OCR across the whole image and return OCR boxes only"
)
blip_scope_radio = gr.Radio(
choices=["Only image & button", "All elements"],
value="Only image & button",
label="BLIP Scope",
info="When to apply BLIP descriptions",
visible=False
)
with gr.Accordion("🎨 Preprocessing (Cross-Device Consistency)", open=False):
preprocess_checkbox = gr.Checkbox(
value=False,
label="Enable Image Preprocessing",
info="Standardize screenshots from different devices (Samsung, Pixel, Oppo, etc.)"
)
preprocess_mode_radio = gr.Radio(
choices=["RF-DETR Optimized (Recommended)", "Generic (CLIP/OCR Focus)"],
value="RF-DETR Optimized (Recommended)",
label="Preprocessing Mode",
info="RF-DETR: Preserves ImageNet normalization | Generic: Aggressive for OCR",
visible=False
)
preprocess_preset_dropdown = gr.Dropdown(
choices=["gentle", "standard", "aggressive_denoise", "color_only"],
value="standard",
label="Preprocessing Preset",
info="gentle=minimal | standard=balanced | aggressive_denoise=strong | color_only=colors",
visible=False
)
                detect_button = gr.Button("🔍 Detect Elements", variant="primary", size="lg")
with gr.Column(scale=1):
output_image = gr.Image(
type="pil",
label="Detected Elements",
height=400
)
summary_output = gr.Markdown(label="Detection Summary")
with gr.Accordion("Raw Results (JSON)", open=False):
json_output = gr.Code(label="Detections JSON", language="json")
with gr.Accordion("API Quickstart", open=False):
api_docs = gr.Markdown(
value="\n".join([
"#### Call the Detection API",
"",
"```bash",
"curl -X POST \"https://your-space.hf.space/detect\" \\",
" -H \"Authorization: Bearer <HF_TOKEN>\" \\",
" -F \"image=@screenshot.png\" \\",
" -F \"confidence_threshold=0.35\" \\",
" -F \"enable_clip=true\" \\",
" -F \"enable_ocr=true\"",
"```",
"",
"```python",
"import requests",
"",
"url = \"https://your-space.hf.space/detect\"",
"headers = {\"Authorization\": \"Bearer <HF_TOKEN>\"}",
"files = {\"image\": open(\"screenshot.png\", \"rb\")}",
"data = {",
" \"confidence_threshold\": 0.35,",
" \"enable_clip\": \"true\",",
" \"enable_ocr\": \"true\"",
"}",
"resp = requests.post(url, files=files, data=data, headers=headers, timeout=120)",
"resp.raise_for_status()",
"print(resp.json())",
"```",
"",
"- Replace `your-space` with your Hugging Face Space slug.",
"- Add the `Authorization` header for private Spaces.",
"- Response payload includes bounding boxes, texts, and optional annotated image."
])
)
# Toggle BLIP scope visibility
blip_checkbox.change(
fn=lambda v: gr.update(visible=v),
inputs=blip_checkbox,
outputs=blip_scope_radio
)
# Handle OCR-only toggle to disable/enable related controls
ocr_only_checkbox.change(
fn=_handle_ocr_only_toggle,
inputs=ocr_only_checkbox,
outputs=[clip_checkbox, ocr_checkbox, blip_checkbox, blip_scope_radio]
)
# Toggle preprocessing options visibility
def toggle_preprocess_options(enabled):
return gr.update(visible=enabled), gr.update(visible=enabled)
preprocess_checkbox.change(
fn=toggle_preprocess_options,
inputs=preprocess_checkbox,
outputs=[preprocess_mode_radio, preprocess_preset_dropdown]
)
# Update preset choices based on mode
def update_preset_choices(mode):
if "RF-DETR" in mode:
return gr.update(
choices=["gentle", "standard", "aggressive_denoise", "color_only"],
value="standard",
info="gentle=minimal | standard=balanced | aggressive_denoise=strong | color_only=colors"
)
else: # Generic mode
return gr.update(
choices=["minimal", "standard", "aggressive", "ocr_optimized"],
value="standard",
info="minimal=light | standard=balanced | aggressive=maximum | ocr_optimized=best for text"
)
preprocess_mode_radio.change(
fn=update_preset_choices,
inputs=preprocess_mode_radio,
outputs=preprocess_preset_dropdown
)
        # Connect the detection button.
        # api_name registers this handler as a named API endpoint ("predict"),
        # so it can be invoked programmatically (e.g. via gradio_client) when
        # hosted on Hugging Face Spaces.
detect_button.click(
fn=detection_fn,
inputs=[
input_image,
confidence_slider,
thickness_slider,
clip_checkbox,
ocr_checkbox,
blip_checkbox,
ocr_only_checkbox,
blip_scope_radio,
preprocess_checkbox,
preprocess_mode_radio,
preprocess_preset_dropdown
],
outputs=[output_image, summary_output, json_output],
api_name="predict", # Expose as /api/predict endpoint
show_progress="full" # Show progress to user during long operations
)
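        # Example client call (a sketch, assuming a recent gradio_client
        # release; positional arguments mirror the inputs list above):
        #
        #     from gradio_client import Client, handle_file
        #
        #     client = Client("https://your-space.hf.space")
        #     annotated, summary, detections_json = client.predict(
        #         handle_file("screenshot.png"),      # input image
        #         0.35, 2,                            # confidence, thickness
        #         False, True, False,                 # CLIP, OCR, BLIP
        #         False, "Only image & button",       # OCR-only, BLIP scope
        #         False,                              # preprocessing off
        #         "RF-DETR Optimized (Recommended)", "standard",
        #         api_name="/predict",
        #     )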
# Build footer markdown
footer_parts = [
"---",
"### ⚑ Performance Tips",
"",
"- **Fast mode** (CLIP ❌, OCR βœ…): ~30-40s - Good for text extraction",
"- **Balanced mode** (CLIP βœ…, OCR βœ…): ~50-60s - Full classification + text",
"- **Ultra-fast mode** (CLIP ❌, OCR ❌): ~25-35s - Just bounding boxes",
"",
"### 🎨 Cross-Device Preprocessing",
"",
"Testing on multiple devices (Samsung, Pixel, Oppo)? **Enable preprocessing** for consistent results!",
"",
"- **RF-DETR Optimized** (Recommended): Preserves ImageNet normalization, best for detection",
"- **Generic Mode**: Aggressive normalization, best for OCR accuracy",
"",
"### πŸ—οΈ Architecture",
"",
"**Single-Class Detection:** RF-DETR detects generic \"UI elements\" (one class)",
"**Multi-Class Classification:** CLIP classifies detections into 6 specific types"
]
if show_api_info and api_url:
footer_parts.extend([
"",
"### πŸ”§ API Connection",
"",
f"This UI is a **client** of the API server at `{api_url}`",
"",
"**Communication:** HTTP/REST (multipart/form-data)",
"**Separation:** UI layer is completely isolated from detection logic",
"",
"To change API endpoint:",
"```bash",
"export CU1_API_URL=http://your-api-server:8000",
"python app_ui.py",
"```"
])
else:
footer_parts.extend([
"",
"### πŸ“¦ Deployment",
"",
"This app uses direct detection service access (no API layer).",
"Optimized for Hugging Face Spaces and local testing."
])
gr.Markdown("\n".join(footer_parts))
return interface
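
# A minimal smoke test (a sketch; the real entry points for this module are
# app.py and app_ui.py). It plugs a stub detection function into the factory
# so the layout can be previewed without loading any models; the stub's names
# are illustrative and not part of the production code.
if __name__ == "__main__":
    import json

    def _stub_detection(image, confidence, thickness, enable_clip,
                        enable_ocr, enable_blip, ocr_only, blip_scope,
                        preprocess, preprocess_mode, preprocess_preset):
        # Echo the input image, summarize the settings, and return the
        # detections JSON as a string (required by the gr.Code output).
        payload = {
            "detections": [],
            "settings": {"confidence": confidence, "ocr_only": ocr_only},
        }
        summary = "**Stub backend** - no models loaded."
        return image, summary, json.dumps(payload, indent=2)

    create_interface(_stub_detection, title_suffix="Stub Backend").launch()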