""" Widget Detector — Hugging Face Spaces Demo ========================================== Drag-and-drop a PDF or image to detect form widgets (text inputs, checkboxes, signatures) using YOLO11m fine-tuned on CommonForms. Features: - Visual bounding box overlay (Tab 1) - Raw JSON output for developers (Tab 2) - Download Fillable PDF — converts detections into interactive PDF form fields """ from __future__ import annotations import io import json import tempfile from pathlib import Path import cv2 import gradio as gr import numpy as np from PIL import Image # ─── Colour palette ──────────────────────────────────────────────────────────── CLASS_COLORS_BGR = { "text_input": (217, 144, 74), # blue (#4A90D9 → BGR) "choice_button": (60, 76, 231), # red (#E74C3C → BGR) "signature": (18, 156, 243), # gold (#F39C12 → BGR) } CLASS_EMOJIS = { "text_input": "🟦", "choice_button": "🟥", "signature": "🟨", } # Render DPI used for both visualization and fillable PDF coordinate mapping RENDER_DPI = 200 # ─── Global model (loaded once per worker) ───────────────────────────────────── _detector = None def _get_detector(conf: float): """Return a cached WidgetDetector instance.""" global _detector if _detector is None: from widget_detector import WidgetDetector _detector = WidgetDetector(conf=conf, imgsz=640, device="cpu") else: _detector.conf = conf _detector.model.overrides["conf"] = conf return _detector # ─── Drawing helper ──────────────────────────────────────────────────────────── def _draw_boxes(pil_img: Image.Image, widgets: list) -> Image.Image: """Draw coloured bounding boxes + labels on a PIL image.""" img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR) h, w = img.shape[:2] font_scale = max(0.45, w / 2200) thickness = max(2, w // 800) for widget in widgets: cls = widget.class_name conf = widget.confidence x1, y1, x2, y2 = ( int(widget.bbox.x1), int(widget.bbox.y1), int(widget.bbox.x2), int(widget.bbox.y2), ) color = CLASS_COLORS_BGR.get(cls, (128, 128, 128)) cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness) label = f"{cls} {conf:.0%}" (tw, th), baseline = cv2.getTextSize( label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, 1 ) label_y = max(y1, th + baseline + 4) cv2.rectangle( img, (x1, label_y - th - baseline - 4), (x1 + tw + 4, label_y), color, -1, ) cv2.putText( img, label, (x1 + 2, label_y - baseline - 2), cv2.FONT_HERSHEY_SIMPLEX, font_scale, (255, 255, 255), 1, cv2.LINE_AA, ) return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB)) # ─── Fillable PDF generator ──────────────────────────────────────────────────── def create_fillable_pdf(state: dict | None) -> str | None: """ Convert detected widgets into a fillable PDF with interactive form fields: - text_input → PDF TextField (blue tint, typeable) - choice_button → PDF CheckBox (red border, clickable) - signature → PDF Signature (gold tint) Coordinate mapping: bboxes are in pixels at RENDER_DPI. PDF uses points (1 pt = 1/72 inch), so scale = 72 / RENDER_DPI. """ if state is None or "result" not in state: return None try: import fitz # PyMuPDF except ImportError: return None result = state["result"] file_path = Path(state["file_path"]) is_pdf_ = state["is_pdf"] scale = 72.0 / RENDER_DPI # pixel → PDF point # ── Open or create the base PDF ─────────────────────────────────────────── if is_pdf_: doc = fitz.open(str(file_path)) else: # Build a PDF page from the image, sized to match the image pixels pil_img = Image.open(str(file_path)).convert("RGB") w_px, h_px = pil_img.size doc = fitz.open() page = doc.new_page(width=w_px * scale, height=h_px * scale) buf = io.BytesIO() pil_img.save(buf, format="PNG") buf.seek(0) page.insert_image(page.rect, stream=buf.read()) # ── Add form widgets to each page ───────────────────────────────────────── for page_idx, page_result in enumerate(result.pages): if page_idx >= len(doc): break page = doc[page_idx] for i, w in enumerate(page_result.widgets): cls = w.class_name x1 = w.bbox.x1 * scale y1 = w.bbox.y1 * scale x2 = w.bbox.x2 * scale y2 = w.bbox.y2 * scale rect = fitz.Rect(x1, y1, x2, y2) widget = fitz.Widget() widget.rect = rect if cls == "text_input": widget.field_type = fitz.PDF_WIDGET_TYPE_TEXT widget.field_name = f"text_p{page_idx}_{i}" widget.field_flags = 0 # single-line widget.text_fontsize = 9 widget.fill_color = (0.94, 0.97, 1.0) # light blue widget.border_color = (0.29, 0.56, 0.89) widget.border_width = 1.0 elif cls == "choice_button": widget.field_type = fitz.PDF_WIDGET_TYPE_CHECKBOX widget.field_name = f"check_p{page_idx}_{i}" widget.field_value = "Off" widget.fill_color = (1.0, 1.0, 1.0) widget.border_color = (0.91, 0.30, 0.24) widget.border_width = 1.5 elif cls == "signature": widget.field_type = fitz.PDF_WIDGET_TYPE_SIGNATURE widget.field_name = f"sig_p{page_idx}_{i}" widget.fill_color = (1.0, 0.98, 0.90) # light gold widget.border_color = (0.95, 0.61, 0.07) widget.border_width = 1.0 else: continue page.add_widget(widget) # ── Save to a named temp file (Gradio serves it as download) ────────────── tmp = tempfile.NamedTemporaryFile( suffix=".pdf", delete=False, prefix="fillable_form_" ) doc.save(tmp.name, garbage=4, deflate=True) doc.close() return tmp.name # ─── Core inference ──────────────────────────────────────────────────────────── def run_inference(file_obj, conf: float, high_quality: bool): """Main inference function called by Gradio. Returns: gallery, summary_md, json_str, state_dict """ if not file_obj: return [], "No file uploaded.", "{}", None detector = _get_detector(conf) detector.model.overrides["imgsz"] = 1024 if high_quality else 640 file_path = Path(file_obj) try: result = detector.detect_path(str(file_path)) except Exception as exc: return [], f"❌ Inference error: {exc}", "{}", None from widget_detector.pdf_utils import is_pdf, pdf_to_images, image_to_pil is_pdf_flag = is_pdf(file_path) if is_pdf_flag: source_images = [img for img, _ in pdf_to_images(file_path, dpi=RENDER_DPI)] else: source_images = [image_to_pil(file_path)] # ── Visualizations ──────────────────────────────────────────────────────── gallery_images = [] for page_result, pil_img in zip(result.pages, source_images): gallery_images.append(_draw_boxes(pil_img, page_result.widgets)) # ── Summary ─────────────────────────────────────────────────────────────── counts = {"text_input": 0, "choice_button": 0, "signature": 0} for page in result.pages: for w in page.widgets: counts[w.class_name] = counts.get(w.class_name, 0) + 1 summary_lines = [ f"### ✅ Detected **{result.total_widgets}** widgets across **{result.total_pages}** page(s)\n", "| Class | Count |", "|---|---|", ] for cls, count in counts.items(): emoji = CLASS_EMOJIS.get(cls, "•") summary_lines.append(f"| {emoji} `{cls}` | **{count}** |") summary_md = "\n".join(summary_lines) # ── JSON ────────────────────────────────────────────────────────────────── json_str = json.dumps(result.model_dump(), indent=2) # ── State (passed to fillable PDF generator) ────────────────────────────── state = { "result": result, "file_path": str(file_path), "is_pdf": is_pdf_flag, } return gallery_images, summary_md, json_str, state # ─── Gradio UI ───────────────────────────────────────────────────────────────── DESCRIPTION = """
Detect form fields in scanned PDFs and document images using YOLO11m fine-tuned on the CommonForms dataset.
🟦 text_input |
🟥 choice_button (checkboxes / radio) |
🟨 signature