"""
Shared Gradio Interface Factory
This module provides a reusable Gradio interface factory that works with
different detection backends (direct service or API client).
This eliminates code duplication between app.py and ui/gradio_interface.py
"""
import gradio as gr
from typing import Callable, Optional
def _handle_ocr_only_toggle(is_ocr_only: bool):
    """
    Update dependent controls when OCR-only mode is toggled.

    Returns a tuple of updates for:
    - CLIP checkbox
    - OCR checkbox
    - BLIP checkbox
    - BLIP scope radio
    """
    if is_ocr_only:
        # OCR-only mode: lock CLIP and BLIP off, force OCR on, and hide
        # the BLIP scope radio.
        return (
            gr.update(value=False, interactive=False),
            gr.update(value=True, interactive=False),
            gr.update(value=False, interactive=False),
            gr.update(value="Only image & button", visible=False),
        )
    # Normal mode: re-enable all checkboxes (OCR stays on by default).
    # The BLIP scope radio remains hidden because BLIP was forced off;
    # it reappears when the user re-checks the BLIP checkbox.
    return (
        gr.update(interactive=True),
        gr.update(value=True, interactive=True),
        gr.update(interactive=True),
        gr.update(visible=False),
    )
def create_interface(
detection_fn: Callable,
title_suffix: str = "",
show_api_info: bool = False,
api_url: Optional[str] = None
) -> gr.Blocks:
"""
Create a Gradio interface with a pluggable detection function
Args:
detection_fn: Function that takes (image, confidence, thickness, clip, ocr, blip, ocr_only, blip_scope)
and returns (annotated_image, summary, json_data)
title_suffix: Additional text for the title
show_api_info: Whether to show API connection info
api_url: API URL to display (if show_api_info=True)
Returns:
Gradio Blocks interface
"""
with gr.Blocks(title="CU-1 UI Element Detector", theme=gr.themes.Soft()) as interface:
# Build title markdown
title_parts = [
"# 🎯 CU-1 UI Element Detector",
"",
"Detect interactive elements in screenshots and UI mockups.",
"",
"**Multi-Model Pipeline:**",
"- πŸ” **RF-DETR** detects all UI elements (single class detection)",
"- 🏷️ **CLIP** classifies elements into 6 types (button, input, text, image, list_item, navigation)",
"- πŸ“ **OCR** extracts text content from detected elements",
"- πŸ–ΌοΈ **BLIP** generates visual descriptions for icons"
]
if title_suffix:
title_parts.append("")
title_parts.append(f"**{title_suffix}**")
if show_api_info and api_url:
title_parts.append("")
title_parts.append(f"**API:** Connected to `{api_url}`")
gr.Markdown("\n".join(title_parts))
with gr.Row():
with gr.Column(scale=1):
input_image = gr.Image(
type="pil",
label="Upload Screenshot",
height=400,
sources=["upload"]
)
with gr.Accordion("Detection Settings", open=True):
confidence_slider = gr.Slider(
minimum=0.1,
maximum=0.9,
value=0.35,
step=0.05,
label="Confidence Threshold",
info="Lower = more elements detected"
)
thickness_slider = gr.Slider(
minimum=1,
maximum=6,
value=2,
step=1,
label="Box Line Thickness"
)
with gr.Accordion("Feature Settings", open=True):
clip_checkbox = gr.Checkbox(
value=False,
label="Enable CLIP Classification",
info="Classify elements into types (slower but more informative)"
)
ocr_checkbox = gr.Checkbox(
value=True,
label="Enable OCR Text Extraction",
info="Extract text content from elements"
)
blip_checkbox = gr.Checkbox(
value=False,
label="Enable BLIP Description",
info="Generate visual descriptions for icons (slower)"
)
ocr_only_checkbox = gr.Checkbox(
value=False,
label="OCR-only (skip detection/classification)",
info="Run OCR across the whole image and return OCR boxes only"
)
blip_scope_radio = gr.Radio(
choices=["Only image & button", "All elements"],
value="Only image & button",
label="BLIP Scope",
info="When to apply BLIP descriptions",
visible=False
)
with gr.Accordion("🎨 Preprocessing (Cross-Device Consistency)", open=False):
preprocess_checkbox = gr.Checkbox(
value=False,
label="Enable Image Preprocessing",
info="Standardize screenshots from different devices (Samsung, Pixel, Oppo, etc.)"
)
preprocess_mode_radio = gr.Radio(
choices=["RF-DETR Optimized (Recommended)", "Generic (CLIP/OCR Focus)"],
value="RF-DETR Optimized (Recommended)",
label="Preprocessing Mode",
info="RF-DETR: Preserves ImageNet normalization | Generic: Aggressive for OCR",
visible=False
)
preprocess_preset_dropdown = gr.Dropdown(
choices=["gentle", "standard", "aggressive_denoise", "color_only"],
value="standard",
label="Preprocessing Preset",
info="gentle=minimal | standard=balanced | aggressive_denoise=strong | color_only=colors",
visible=False
)
                detect_button = gr.Button("🔍 Detect Elements", variant="primary", size="lg")
with gr.Column(scale=1):
output_image = gr.Image(
type="pil",
label="Detected Elements",
height=400
)
summary_output = gr.Markdown(label="Detection Summary")
with gr.Accordion("Raw Results (JSON)", open=False):
json_output = gr.Code(label="Detections JSON", language="json")
with gr.Accordion("API Quickstart", open=False):
api_docs = gr.Markdown(
value="\n".join([
"#### Call the Detection API",
"",
"```bash",
"curl -X POST \"https://your-space.hf.space/detect\" \\",
" -H \"Authorization: Bearer <HF_TOKEN>\" \\",
" -F \"image=@screenshot.png\" \\",
" -F \"confidence_threshold=0.35\" \\",
" -F \"enable_clip=true\" \\",
" -F \"enable_ocr=true\"",
"```",
"",
"```python",
"import requests",
"",
"url = \"https://your-space.hf.space/detect\"",
"headers = {\"Authorization\": \"Bearer <HF_TOKEN>\"}",
"files = {\"image\": open(\"screenshot.png\", \"rb\")}",
"data = {",
" \"confidence_threshold\": 0.35,",
" \"enable_clip\": \"true\",",
" \"enable_ocr\": \"true\"",
"}",
"resp = requests.post(url, files=files, data=data, headers=headers, timeout=120)",
"resp.raise_for_status()",
"print(resp.json())",
"```",
"",
"- Replace `your-space` with your Hugging Face Space slug.",
"- Add the `Authorization` header for private Spaces.",
"- Response payload includes bounding boxes, texts, and optional annotated image."
])
)
# Toggle BLIP scope visibility
blip_checkbox.change(
fn=lambda v: gr.update(visible=v),
inputs=blip_checkbox,
outputs=blip_scope_radio
)
# Handle OCR-only toggle to disable/enable related controls
ocr_only_checkbox.change(
fn=_handle_ocr_only_toggle,
inputs=ocr_only_checkbox,
outputs=[clip_checkbox, ocr_checkbox, blip_checkbox, blip_scope_radio]
)
# Toggle preprocessing options visibility
def toggle_preprocess_options(enabled):
return gr.update(visible=enabled), gr.update(visible=enabled)
preprocess_checkbox.change(
fn=toggle_preprocess_options,
inputs=preprocess_checkbox,
outputs=[preprocess_mode_radio, preprocess_preset_dropdown]
)
# Update preset choices based on mode
def update_preset_choices(mode):
if "RF-DETR" in mode:
return gr.update(
choices=["gentle", "standard", "aggressive_denoise", "color_only"],
value="standard",
info="gentle=minimal | standard=balanced | aggressive_denoise=strong | color_only=colors"
)
else: # Generic mode
return gr.update(
choices=["minimal", "standard", "aggressive", "ocr_optimized"],
value="standard",
info="minimal=light | standard=balanced | aggressive=maximum | ocr_optimized=best for text"
)
preprocess_mode_radio.change(
fn=update_preset_choices,
inputs=preprocess_mode_radio,
outputs=preprocess_preset_dropdown
)
        # Connect the detection button.
        # api_name registers this handler as a named API endpoint ("predict"),
        # so it can be invoked programmatically (e.g. via gradio_client) when
        # hosted on Hugging Face Spaces.
detect_button.click(
fn=detection_fn,
inputs=[
input_image,
confidence_slider,
thickness_slider,
clip_checkbox,
ocr_checkbox,
blip_checkbox,
ocr_only_checkbox,
blip_scope_radio,
preprocess_checkbox,
preprocess_mode_radio,
preprocess_preset_dropdown
],
outputs=[output_image, summary_output, json_output],
api_name="predict", # Expose as /api/predict endpoint
show_progress="full" # Show progress to user during long operations
)
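        # Example client call (a sketch, assuming a recent gradio_client
        # release; positional arguments mirror the inputs list above):
        #
        #     from gradio_client import Client, handle_file
        #
        #     client = Client("https://your-space.hf.space")
        #     annotated, summary, detections_json = client.predict(
        #         handle_file("screenshot.png"),      # input image
        #         0.35, 2,                            # confidence, thickness
        #         False, True, False,                 # CLIP, OCR, BLIP
        #         False, "Only image & button",       # OCR-only, BLIP scope
        #         False,                              # preprocessing off
        #         "RF-DETR Optimized (Recommended)", "standard",
        #         api_name="/predict",
        #     )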
# Build footer markdown
footer_parts = [
"---",
"### ⚑ Performance Tips",
"",
"- **Fast mode** (CLIP ❌, OCR βœ…): ~30-40s - Good for text extraction",
"- **Balanced mode** (CLIP βœ…, OCR βœ…): ~50-60s - Full classification + text",
"- **Ultra-fast mode** (CLIP ❌, OCR ❌): ~25-35s - Just bounding boxes",
"",
"### 🎨 Cross-Device Preprocessing",
"",
"Testing on multiple devices (Samsung, Pixel, Oppo)? **Enable preprocessing** for consistent results!",
"",
"- **RF-DETR Optimized** (Recommended): Preserves ImageNet normalization, best for detection",
"- **Generic Mode**: Aggressive normalization, best for OCR accuracy",
"",
"### πŸ—οΈ Architecture",
"",
"**Single-Class Detection:** RF-DETR detects generic \"UI elements\" (one class)",
"**Multi-Class Classification:** CLIP classifies detections into 6 specific types"
]
if show_api_info and api_url:
footer_parts.extend([
"",
"### πŸ”§ API Connection",
"",
f"This UI is a **client** of the API server at `{api_url}`",
"",
"**Communication:** HTTP/REST (multipart/form-data)",
"**Separation:** UI layer is completely isolated from detection logic",
"",
"To change API endpoint:",
"```bash",
"export CU1_API_URL=http://your-api-server:8000",
"python app_ui.py",
"```"
])
else:
footer_parts.extend([
"",
"### πŸ“¦ Deployment",
"",
"This app uses direct detection service access (no API layer).",
"Optimized for Hugging Face Spaces and local testing."
])
gr.Markdown("\n".join(footer_parts))
return interface
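
# A minimal smoke test (a sketch; the real entry points for this module are
# app.py and app_ui.py). It plugs a stub detection function into the factory
# so the layout can be previewed without loading any models; the stub's names
# are illustrative and not part of the production code.
if __name__ == "__main__":
    import json

    def _stub_detection(image, confidence, thickness, enable_clip,
                        enable_ocr, enable_blip, ocr_only, blip_scope,
                        preprocess, preprocess_mode, preprocess_preset):
        # Echo the input image, summarize the settings, and return the
        # detections JSON as a string (required by the gr.Code output).
        payload = {
            "detections": [],
            "settings": {"confidence": confidence, "ocr_only": ocr_only},
        }
        summary = "**Stub backend** - no models loaded."
        return image, summary, json.dumps(payload, indent=2)

    create_interface(_stub_detection, title_suffix="Stub Backend").launch()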