Spaces:

PSynx
/

widget-detector-demo

Running

App Files Files Community

widget-detector-demo / app.py

PSynx

Upload app.py with huggingface_hub

9a9783f verified 12 days ago

raw

history blame contribute delete

16.3 kB

	"""
	Widget Detector — Hugging Face Spaces Demo
	==========================================
	Drag-and-drop a PDF or image to detect form widgets (text inputs,
	checkboxes, signatures) using YOLO11m fine-tuned on CommonForms.

	Features:
	- Visual bounding box overlay (Tab 1)
	- Raw JSON output for developers (Tab 2)
	- Download Fillable PDF — converts detections into interactive PDF form fields
	"""

	from __future__ import annotations

	import io
	import json
	import tempfile
	from pathlib import Path

	import cv2
	import gradio as gr
	import numpy as np
	from PIL import Image

	# ─── Colour palette ────────────────────────────────────────────────────────────
	# Box / label colour per widget class. Tuples are in (B, G, R) order because the
	# drawing helper renders through OpenCV; the hex value in each trailing comment
	# is the same colour written as RGB. Classes missing from this map are drawn in
	# grey (128, 128, 128) by _draw_boxes.
	CLASS_COLORS_BGR = {
	"text_input": (217, 144, 74), # blue (#4A90D9 → BGR)
	"choice_button": (60, 76, 231), # red (#E74C3C → BGR)
	"signature": (18, 156, 243), # gold (#F39C12 → BGR)
	}
	# Marker shown next to each class in the Markdown summary table built by
	# run_inference, which falls back to "•" for classes not listed here.
	CLASS_EMOJIS = {
	"text_input": "🟦",
	"choice_button": "🟥",
	"signature": "🟨",
	}

	# Render DPI used for both visualization and fillable PDF coordinate mapping:
	# run_inference rasterises PDF pages with pdf_to_images(dpi=RENDER_DPI), and
	# create_fillable_pdf converts pixel boxes to PDF points with 72 / RENDER_DPI.
	# NOTE(review): this assumes the detector reports boxes at the same DPI —
	# confirm against widget_detector.
	RENDER_DPI = 200

	# ─── Global model (loaded once per worker) ─────────────────────────────────────
	# Stays None until _get_detector() builds the detector on first use; later
	# calls reuse this instance.
	_detector = None


	def _get_detector(conf: float):
	    """Return the shared WidgetDetector, building it on first use.

	    The first call constructs the detector (640 px input, CPU) with the
	    requested confidence threshold. Every later call reuses that instance
	    and only pushes the new threshold into it.

	    Args:
	        conf: Confidence threshold to apply to the detector.

	    Returns:
	        The module-wide detector instance.
	    """
	    global _detector

	    # Fast path: the detector already exists, so just refresh its threshold.
	    if _detector is not None:
	        _detector.conf = conf
	        _detector.model.overrides["conf"] = conf
	        return _detector

	    # Imported lazily so the package is only loaded when inference is needed.
	    from widget_detector import WidgetDetector

	    _detector = WidgetDetector(conf=conf, imgsz=640, device="cpu")
	    return _detector


	# ─── Drawing helper ────────────────────────────────────────────────────────────
	def _draw_boxes(pil_img: Image.Image, widgets: list) -> Image.Image:
	    """Draw coloured bounding boxes and labels on a copy of a PIL image.

	    Args:
	        pil_img: Page or document image. Any PIL mode is accepted; it is
	            converted to RGB before drawing.
	        widgets: Detections for this image. Each item must expose
	            ``class_name``, ``confidence`` and ``bbox.x1/y1/x2/y2`` in the
	            pixel space of ``pil_img``.

	    Returns:
	        A new RGB image with one outline and one filled label tag
	        ("<class> <confidence %>") per widget. The input is not modified.
	    """
	    # cv2.COLOR_RGB2BGR needs a colour array; greyscale ("L") or palette
	    # ("P") images convert to a 2-D array and would make cvtColor raise.
	    if pil_img.mode != "RGB":
	        pil_img = pil_img.convert("RGB")

	    img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
	    width = img.shape[1]

	    # Scale text and stroke with the image width so labels stay legible on
	    # high-resolution renders, with floors for small images.
	    font = cv2.FONT_HERSHEY_SIMPLEX
	    font_scale = max(0.45, width / 2200)
	    thickness = max(2, width // 800)

	    for widget in widgets:
	        cls = widget.class_name
	        x1, y1, x2, y2 = (
	            int(widget.bbox.x1), int(widget.bbox.y1),
	            int(widget.bbox.x2), int(widget.bbox.y2),
	        )
	        # Unknown classes are drawn in neutral grey.
	        color = CLASS_COLORS_BGR.get(cls, (128, 128, 128))

	        cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness)

	        label = f"{cls} {widget.confidence:.0%}"
	        (tw, th), baseline = cv2.getTextSize(label, font, font_scale, 1)

	        # The tag sits on top of the box; clamp it so a box touching the top
	        # edge still gets a fully visible label.
	        label_y = max(y1, th + baseline + 4)
	        cv2.rectangle(
	            img,
	            (x1, label_y - th - baseline - 4),
	            (x1 + tw + 4, label_y),
	            color, -1,
	        )
	        cv2.putText(
	            img, label, (x1 + 2, label_y - baseline - 2),
	            font, font_scale,
	            (255, 255, 255), 1, cv2.LINE_AA,
	        )

	    return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))


	# ─── Fillable PDF generator ────────────────────────────────────────────────────
	def _open_base_pdf(fitz, file_path, is_pdf: bool, scale: float):
	    """Open the source PDF, or wrap a source image in a new one-page PDF."""
	    if is_pdf:
	        return fitz.open(str(file_path))

	    # Close the image file as soon as its pixels have been copied.
	    with Image.open(str(file_path)) as src:
	        pil_img = src.convert("RGB")
	    w_px, h_px = pil_img.size
	    buf = io.BytesIO()
	    pil_img.save(buf, format="PNG")

	    doc = fitz.open()
	    try:
	        # Size the page so that pixel * scale lands on the same spot in points.
	        page = doc.new_page(width=w_px * scale, height=h_px * scale)
	        page.insert_image(page.rect, stream=buf.getvalue())
	    except Exception:
	        doc.close()
	        raise
	    return doc


	def _new_form_widget(fitz, cls: str, page_idx: int, index: int, rect):
	    """Build the form widget for one detection, or None for an unknown class."""
	    widget = fitz.Widget()
	    widget.rect = rect

	    if cls == "text_input":
	        widget.field_type = fitz.PDF_WIDGET_TYPE_TEXT
	        widget.field_name = f"text_p{page_idx}_{index}"
	        widget.field_flags = 0  # single-line
	        widget.text_fontsize = 9
	        widget.fill_color = (0.94, 0.97, 1.0)  # light blue
	        widget.border_color = (0.29, 0.56, 0.89)
	        widget.border_width = 1.0
	    elif cls == "choice_button":
	        widget.field_type = fitz.PDF_WIDGET_TYPE_CHECKBOX
	        widget.field_name = f"check_p{page_idx}_{index}"
	        widget.field_value = "Off"
	        widget.fill_color = (1.0, 1.0, 1.0)
	        widget.border_color = (0.91, 0.30, 0.24)
	        widget.border_width = 1.5
	    elif cls == "signature":
	        # NOTE(review): confirm the installed PyMuPDF version supports
	        # creating signature widgets.
	        widget.field_type = fitz.PDF_WIDGET_TYPE_SIGNATURE
	        widget.field_name = f"sig_p{page_idx}_{index}"
	        widget.fill_color = (1.0, 0.98, 0.90)  # light gold
	        widget.border_color = (0.95, 0.61, 0.07)
	        widget.border_width = 1.0
	    else:
	        return None

	    return widget


	def create_fillable_pdf(state: dict | None) -> str | None:
	    """Convert detected widgets into a fillable PDF with interactive fields.

	    Class mapping:
	    - text_input    → PDF TextField (blue tint, typeable)
	    - choice_button → PDF CheckBox (red border, clickable)
	    - signature     → PDF Signature (gold tint)

	    Coordinate mapping: bboxes are in pixels at RENDER_DPI. PDF uses points
	    (1 pt = 1/72 inch), so scale = 72 / RENDER_DPI.

	    Args:
	        state: Dict produced by ``run_inference`` with the keys ``"result"``
	            (detection result exposing ``pages[*].widgets``),
	            ``"file_path"`` (uploaded file) and ``"is_pdf"``. May be None.

	    Returns:
	        Path of the generated PDF in the temp directory, or None when there
	        is no inference result yet or PyMuPDF is not installed.
	    """
	    if state is None or "result" not in state:
	        return None

	    try:
	        import fitz  # PyMuPDF
	    except ImportError:
	        # Deliberate best-effort: without PyMuPDF the export is unavailable.
	        return None

	    result = state["result"]
	    file_path = Path(state["file_path"])
	    is_pdf_ = state["is_pdf"]
	    scale = 72.0 / RENDER_DPI  # pixel → PDF point

	    doc = _open_base_pdf(fitz, file_path, is_pdf_, scale)
	    try:
	        # ── Add form widgets to each page ─────────────────────────────────
	        for page_idx, page_result in enumerate(result.pages):
	            if page_idx >= len(doc):
	                break
	            page = doc[page_idx]

	            for i, w in enumerate(page_result.widgets):
	                rect = fitz.Rect(
	                    w.bbox.x1 * scale, w.bbox.y1 * scale,
	                    w.bbox.x2 * scale, w.bbox.y2 * scale,
	                )
	                # A zero-area box cannot hold a form field; skip it rather
	                # than let one bad detection abort the whole export.
	                if rect.is_empty or rect.is_infinite:
	                    continue

	                widget = _new_form_widget(fitz, w.class_name, page_idx, i, rect)
	                if widget is not None:
	                    page.add_widget(widget)

	        # ── Save to a named temp file (Gradio serves it as download) ──────
	        # Reserve the path, then release the handle before PyMuPDF writes
	        # to it so no descriptor stays open.
	        with tempfile.NamedTemporaryFile(
	            suffix=".pdf", delete=False, prefix="fillable_form_"
	        ) as tmp:
	            out_path = tmp.name
	        doc.save(out_path, garbage=4, deflate=True)
	    finally:
	        doc.close()

	    return out_path


	# ─── Core inference ────────────────────────────────────────────────────────────
	def run_inference(file_obj, conf: float, high_quality: bool):
	    """Run widget detection on an uploaded file; called by Gradio.

	    Args:
	        file_obj: Path of the uploaded PDF or image (falsy when nothing has
	            been uploaded).
	        conf: Confidence threshold for the detector.
	        high_quality: Use 1024 px inference instead of 640 px.

	    Returns:
	        A 4-tuple ``(gallery, summary_md, json_str, state)``:
	        annotated page images, a Markdown summary table, the raw result as
	        JSON text, and the state dict consumed by ``create_fillable_pdf``
	        (``None`` when nothing was uploaded or inference failed).
	    """
	    if not file_obj:
	        return [], "No file uploaded.", "{}", None

	    detector = _get_detector(conf)
	    detector.model.overrides["imgsz"] = 1024 if high_quality else 640

	    file_path = Path(file_obj)

	    # UI boundary: report any detector failure in the summary pane instead
	    # of surfacing a traceback to the user.
	    try:
	        result = detector.detect_path(str(file_path))
	    except Exception as exc:
	        return [], f"❌ Inference error: {exc}", "{}", None

	    from widget_detector.pdf_utils import is_pdf, pdf_to_images, image_to_pil

	    is_pdf_flag = is_pdf(file_path)

	    if is_pdf_flag:
	        source_images = [img for img, _ in pdf_to_images(file_path, dpi=RENDER_DPI)]
	    else:
	        source_images = [image_to_pil(file_path)]

	    # ── Visualizations ────────────────────────────────────────────────────
	    gallery_images = [
	        _draw_boxes(pil_img, page_result.widgets)
	        for page_result, pil_img in zip(result.pages, source_images)
	    ]

	    # ── Summary ───────────────────────────────────────────────────────────
	    # Seed the known classes so each one gets a row even at zero detections.
	    counts = {"text_input": 0, "choice_button": 0, "signature": 0}
	    for page in result.pages:
	        for w in page.widgets:
	            counts[w.class_name] = counts.get(w.class_name, 0) + 1

	    # Plain "|" delimiters: a backslash-escaped pipe is literal text in
	    # Markdown and would stop the table from rendering.
	    summary_lines = [
	        f"### ✅ Detected {result.total_widgets} widgets across {result.total_pages} page(s)\n",
	        "| Class | Count |",
	        "|---|---|",
	    ]
	    for cls, count in counts.items():
	        emoji = CLASS_EMOJIS.get(cls, "•")
	        summary_lines.append(f"| {emoji} `{cls}` | {count} |")
	    summary_md = "\n".join(summary_lines)

	    # ── JSON ──────────────────────────────────────────────────────────────
	    json_str = json.dumps(result.model_dump(), indent=2)

	    # ── State (passed to fillable PDF generator) ──────────────────────────
	    state = {
	        "result": result,
	        "file_path": str(file_path),
	        "is_pdf": is_pdf_flag,
	    }

	    return gallery_images, summary_md, json_str, state


	# ─── Gradio UI ─────────────────────────────────────────────────────────────────
	# HTML header rendered at the top of the demo: title, one-line pitch, the
	# colour legend for the three widget classes, and package / model links.
	# Separators are plain "|" characters; a backslash before the pipe would be
	# shown literally in the rendered HTML.
	DESCRIPTION = """
	<div style="text-align:center; padding: 12px 0 4px 0">
	<h1 style="font-size:2rem; margin-bottom:4px">📄 Widget Detector</h1>
	<p style="font-size:1.05rem; color:#666; margin-top:0">
	Detect form fields in scanned PDFs and document images using <b>YOLO11m</b>
	fine-tuned on the <a href="https://huggingface.co/datasets/jbarrow/CommonForms" target="_blank">CommonForms</a> dataset.
	</p>
	<p style="font-size:0.9rem; margin-top:6px">
	🟦 <code>text_input</code>  |
	🟥 <code>choice_button</code> (checkboxes / radio)  |
	🟨 <code>signature</code>
	</p>
	<p style="font-size:0.85rem; color:#888">
	📦 <a href="https://pypi.org/project/psynx-widget-detector/" target="_blank">pip install psynx-widget-detector</a>  |
	🤗 <a href="https://huggingface.co/PSynx/widget-detector-yolo" target="_blank">Model Card</a>
	</p>
	</div>
	"""

	# Page-level CSS: centred content column, rounded gallery images, tinted
	# summary / export panels, and a hidden Gradio footer.
	_CUSTOM_CSS = """
	.contain { max-width: 1100px; margin: 0 auto; }
	#output-gallery img { border-radius: 8px; }
	.summary-box { background: #f8f9ff; border-radius: 8px; padding: 12px; }
	.fillable-section { background: #f0fdf4; border-radius: 8px; padding: 12px;
	border: 1px solid #bbf7d0; margin-top: 8px; }
	footer { display: none !important; }
	"""

	# Upload filter for the file picker.
	_ACCEPTED_FILE_TYPES = [".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"]

	# Bundled samples: (file, confidence threshold, high-quality flag).
	_EXAMPLE_ROWS = [
	    ["examples/tt.pdf", 0.35, False],
	    ["examples/mvatform1.pdf", 0.35, False],
	    ["examples/new.pdf", 0.35, False],
	]

	_THEME = gr.themes.Soft(
	    primary_hue=gr.themes.colors.blue,
	    secondary_hue=gr.themes.colors.indigo,
	    font=[gr.themes.GoogleFont("Inter"), "sans-serif"],
	)

	# NOTE(review): the nesting below was reconstructed from the section
	# comments — confirm the column / tab structure against the deployed app.
	with gr.Blocks(title="Widget Detector Demo", theme=_THEME, css=_CUSTOM_CSS) as demo:

	    # Carries the last inference result over to the fillable-PDF export.
	    inference_state = gr.State(None)

	    gr.HTML(DESCRIPTION)

	    with gr.Row(equal_height=False):
	        # ── Left column: inputs and export controls ───────────────────────
	        with gr.Column(scale=1, min_width=280):
	            file_input = gr.File(
	                label="Upload PDF or Image",
	                file_types=_ACCEPTED_FILE_TYPES,
	                type="filepath",
	            )
	            conf_slider = gr.Slider(
	                minimum=0.10,
	                maximum=0.90,
	                step=0.05,
	                value=0.35,
	                label="Confidence Threshold",
	                info="Lower = more detections (may include false positives)",
	            )
	            hq_checkbox = gr.Checkbox(
	                label="⚡ High Quality (1024px — slower on CPU)",
	                value=False,
	            )
	            run_btn = gr.Button("🔍 Detect Widgets", variant="primary", size="lg")

	            # ── Fillable PDF export ───────────────────────────────────────
	            gr.HTML("""
	<div class="fillable-section">
	<b>📥 Fillable PDF Export</b><br>
	<span style="font-size:0.85rem;color:#555">
	After detecting widgets, click below to download a fillable PDF
	with interactive text boxes, checkboxes, and signature fields
	placed exactly over the detected widget locations.
	</span>
	</div>
	""")
	            # Starts disabled; enabled once an inference run yields a state.
	            pdf_btn = gr.Button(
	                "📥 Download Fillable PDF",
	                variant="secondary",
	                size="lg",
	                interactive=False,
	            )
	            # Hidden until a file has actually been generated.
	            pdf_output = gr.File(label="Fillable PDF", visible=False)

	            gr.Examples(
	                examples=_EXAMPLE_ROWS,
	                inputs=[file_input, conf_slider, hq_checkbox],
	                label="📂 Example Files (click to load)",
	            )

	        # ── Right column: results ─────────────────────────────────────────
	        with gr.Column(scale=2):
	            with gr.Tabs():
	                with gr.TabItem("🖼️ Visual Output"):
	                    summary_md = gr.Markdown(
	                        "Upload a file and click Detect Widgets to see results.",
	                        elem_classes=["summary-box"],
	                    )
	                    gallery = gr.Gallery(
	                        label="Detected Widgets",
	                        elem_id="output-gallery",
	                        columns=1,
	                        object_fit="contain",
	                        height=700,
	                        show_label=False,
	                    )

	                with gr.TabItem("{ } JSON Output"):
	                    gr.Markdown(
	                        "The raw JSON response — copy this to integrate the detector into your own app.",
	                        elem_classes=["summary-box"],
	                    )
	                    json_output = gr.Code(
	                        language="json",
	                        label="Detection Result",
	                        lines=35,
	                        interactive=False,
	                    )

	    # ── Detect: run inference, then unlock the export button ──────────────
	    detect_event = run_btn.click(
	        fn=run_inference,
	        inputs=[file_input, conf_slider, hq_checkbox],
	        outputs=[gallery, summary_md, json_output, inference_state],
	    )
	    detect_event.then(
	        # The state is None when nothing was uploaded or inference failed.
	        fn=lambda state: gr.update(interactive=state is not None),
	        inputs=[inference_state],
	        outputs=[pdf_btn],
	    )

	    # ── Export: build the fillable PDF, then reveal the download ──────────
	    export_event = pdf_btn.click(
	        fn=create_fillable_pdf,
	        inputs=[inference_state],
	        outputs=[pdf_output],
	    )
	    export_event.then(
	        fn=lambda f: gr.update(visible=f is not None),
	        inputs=[pdf_output],
	        outputs=[pdf_output],
	    )

	if __name__ == "__main__":
	    demo.launch()