# Uploaded by: PSynx
# Commit message: Upload app.py with huggingface_hub
# Commit: 9a9783f (verified)
"""
Widget Detector — Hugging Face Spaces Demo
==========================================
Drag-and-drop a PDF or image to detect form widgets (text inputs,
checkboxes, signatures) using YOLO11m fine-tuned on CommonForms.
Features:
- Visual bounding box overlay (Tab 1)
- Raw JSON output for developers (Tab 2)
- Download Fillable PDF — converts detections into interactive PDF form fields
"""
from __future__ import annotations
import io
import json
import tempfile
from pathlib import Path
import cv2
import gradio as gr
import numpy as np
from PIL import Image
# ─── Colour palette ────────────────────────────────────────────────────────────
CLASS_COLORS_BGR = {
"text_input": (217, 144, 74), # blue (#4A90D9 β†’ BGR)
"choice_button": (60, 76, 231), # red (#E74C3C β†’ BGR)
"signature": (18, 156, 243), # gold (#F39C12 β†’ BGR)
}
CLASS_EMOJIS = {
"text_input": "🟦",
"choice_button": "πŸŸ₯",
"signature": "🟨",
}
# Render DPI used for both visualization and fillable PDF coordinate mapping
RENDER_DPI = 200
# ─── Global model (loaded once per worker) ─────────────────────────────────────
_detector = None
def _get_detector(conf: float):
"""Return a cached WidgetDetector instance."""
global _detector
if _detector is None:
from widget_detector import WidgetDetector
_detector = WidgetDetector(conf=conf, imgsz=640, device="cpu")
else:
_detector.conf = conf
_detector.model.overrides["conf"] = conf
return _detector
# ─── Drawing helper ────────────────────────────────────────────────────────────
def _draw_boxes(pil_img: Image.Image, widgets: list) -> Image.Image:
"""Draw coloured bounding boxes + labels on a PIL image."""
img = cv2.cvtColor(np.array(pil_img), cv2.COLOR_RGB2BGR)
h, w = img.shape[:2]
font_scale = max(0.45, w / 2200)
thickness = max(2, w // 800)
for widget in widgets:
cls = widget.class_name
conf = widget.confidence
x1, y1, x2, y2 = (
int(widget.bbox.x1), int(widget.bbox.y1),
int(widget.bbox.x2), int(widget.bbox.y2),
)
color = CLASS_COLORS_BGR.get(cls, (128, 128, 128))
cv2.rectangle(img, (x1, y1), (x2, y2), color, thickness)
label = f"{cls} {conf:.0%}"
(tw, th), baseline = cv2.getTextSize(
label, cv2.FONT_HERSHEY_SIMPLEX, font_scale, 1
)
label_y = max(y1, th + baseline + 4)
cv2.rectangle(
img,
(x1, label_y - th - baseline - 4),
(x1 + tw + 4, label_y),
color, -1,
)
cv2.putText(
img, label, (x1 + 2, label_y - baseline - 2),
cv2.FONT_HERSHEY_SIMPLEX, font_scale,
(255, 255, 255), 1, cv2.LINE_AA,
)
return Image.fromarray(cv2.cvtColor(img, cv2.COLOR_BGR2RGB))
# ─── Fillable PDF generator ────────────────────────────────────────────────────
def create_fillable_pdf(state: dict | None) -> str | None:
"""
Convert detected widgets into a fillable PDF with interactive form fields:
- text_input β†’ PDF TextField (blue tint, typeable)
- choice_button β†’ PDF CheckBox (red border, clickable)
- signature β†’ PDF Signature (gold tint)
Coordinate mapping: bboxes are in pixels at RENDER_DPI.
PDF uses points (1 pt = 1/72 inch), so scale = 72 / RENDER_DPI.
"""
if state is None or "result" not in state:
return None
try:
import fitz # PyMuPDF
except ImportError:
return None
result = state["result"]
file_path = Path(state["file_path"])
is_pdf_ = state["is_pdf"]
scale = 72.0 / RENDER_DPI # pixel β†’ PDF point
# ── Open or create the base PDF ───────────────────────────────────────────
if is_pdf_:
doc = fitz.open(str(file_path))
else:
# Build a PDF page from the image, sized to match the image pixels
pil_img = Image.open(str(file_path)).convert("RGB")
w_px, h_px = pil_img.size
doc = fitz.open()
page = doc.new_page(width=w_px * scale, height=h_px * scale)
buf = io.BytesIO()
pil_img.save(buf, format="PNG")
buf.seek(0)
page.insert_image(page.rect, stream=buf.read())
# ── Add form widgets to each page ─────────────────────────────────────────
for page_idx, page_result in enumerate(result.pages):
if page_idx >= len(doc):
break
page = doc[page_idx]
for i, w in enumerate(page_result.widgets):
cls = w.class_name
x1 = w.bbox.x1 * scale
y1 = w.bbox.y1 * scale
x2 = w.bbox.x2 * scale
y2 = w.bbox.y2 * scale
rect = fitz.Rect(x1, y1, x2, y2)
widget = fitz.Widget()
widget.rect = rect
if cls == "text_input":
widget.field_type = fitz.PDF_WIDGET_TYPE_TEXT
widget.field_name = f"text_p{page_idx}_{i}"
widget.field_flags = 0 # single-line
widget.text_fontsize = 9
widget.fill_color = (0.94, 0.97, 1.0) # light blue
widget.border_color = (0.29, 0.56, 0.89)
widget.border_width = 1.0
elif cls == "choice_button":
widget.field_type = fitz.PDF_WIDGET_TYPE_CHECKBOX
widget.field_name = f"check_p{page_idx}_{i}"
widget.field_value = "Off"
widget.fill_color = (1.0, 1.0, 1.0)
widget.border_color = (0.91, 0.30, 0.24)
widget.border_width = 1.5
elif cls == "signature":
widget.field_type = fitz.PDF_WIDGET_TYPE_SIGNATURE
widget.field_name = f"sig_p{page_idx}_{i}"
widget.fill_color = (1.0, 0.98, 0.90) # light gold
widget.border_color = (0.95, 0.61, 0.07)
widget.border_width = 1.0
else:
continue
page.add_widget(widget)
# ── Save to a named temp file (Gradio serves it as download) ──────────────
tmp = tempfile.NamedTemporaryFile(
suffix=".pdf", delete=False, prefix="fillable_form_"
)
doc.save(tmp.name, garbage=4, deflate=True)
doc.close()
return tmp.name
# ─── Core inference ────────────────────────────────────────────────────────────
def run_inference(file_obj, conf: float, high_quality: bool):
"""Main inference function called by Gradio.
Returns: gallery, summary_md, json_str, state_dict
"""
if not file_obj:
return [], "No file uploaded.", "{}", None
detector = _get_detector(conf)
detector.model.overrides["imgsz"] = 1024 if high_quality else 640
file_path = Path(file_obj)
try:
result = detector.detect_path(str(file_path))
except Exception as exc:
return [], f"❌ Inference error: {exc}", "{}", None
from widget_detector.pdf_utils import is_pdf, pdf_to_images, image_to_pil
is_pdf_flag = is_pdf(file_path)
if is_pdf_flag:
source_images = [img for img, _ in pdf_to_images(file_path, dpi=RENDER_DPI)]
else:
source_images = [image_to_pil(file_path)]
# ── Visualizations ────────────────────────────────────────────────────────
gallery_images = []
for page_result, pil_img in zip(result.pages, source_images):
gallery_images.append(_draw_boxes(pil_img, page_result.widgets))
# ── Summary ───────────────────────────────────────────────────────────────
counts = {"text_input": 0, "choice_button": 0, "signature": 0}
for page in result.pages:
for w in page.widgets:
counts[w.class_name] = counts.get(w.class_name, 0) + 1
summary_lines = [
f"### βœ… Detected **{result.total_widgets}** widgets across **{result.total_pages}** page(s)\n",
"| Class | Count |",
"|---|---|",
]
for cls, count in counts.items():
emoji = CLASS_EMOJIS.get(cls, "β€’")
summary_lines.append(f"| {emoji} `{cls}` | **{count}** |")
summary_md = "\n".join(summary_lines)
# ── JSON ──────────────────────────────────────────────────────────────────
json_str = json.dumps(result.model_dump(), indent=2)
# ── State (passed to fillable PDF generator) ──────────────────────────────
state = {
"result": result,
"file_path": str(file_path),
"is_pdf": is_pdf_flag,
}
return gallery_images, summary_md, json_str, state
# ─── Gradio UI ─────────────────────────────────────────────────────────────────
DESCRIPTION = """
<div style="text-align:center; padding: 12px 0 4px 0">
<h1 style="font-size:2rem; margin-bottom:4px">πŸ“„ Widget Detector</h1>
<p style="font-size:1.05rem; color:#666; margin-top:0">
Detect form fields in scanned PDFs and document images using <b>YOLO11m</b>
fine-tuned on the <a href="https://huggingface.co/datasets/jbarrow/CommonForms" target="_blank">CommonForms</a> dataset.
</p>
<p style="font-size:0.9rem; margin-top:6px">
🟦 <code>text_input</code> &nbsp;|&nbsp;
πŸŸ₯ <code>choice_button</code> (checkboxes / radio) &nbsp;|&nbsp;
🟨 <code>signature</code>
</p>
<p style="font-size:0.85rem; color:#888">
πŸ“¦ <a href="https://pypi.org/project/psynx-widget-detector/" target="_blank">pip install psynx-widget-detector</a> &nbsp;|&nbsp;
πŸ€— <a href="https://huggingface.co/PSynx/widget-detector-yolo" target="_blank">Model Card</a>
</p>
</div>
"""
def _sync_pdf_controls(state):
    """Refresh the export controls after an inference run.

    The download button is enabled only when the run produced a state, and
    any fillable PDF left over from a previous run is cleared and hidden so
    it cannot be mistaken for an export of the new file.
    """
    return (
        gr.update(interactive=state is not None),
        gr.update(value=None, visible=False),
    )


with gr.Blocks(
    title="Widget Detector Demo",
    theme=gr.themes.Soft(
        primary_hue=gr.themes.colors.blue,
        secondary_hue=gr.themes.colors.indigo,
        font=[gr.themes.GoogleFont("Inter"), "sans-serif"],
    ),
    css="""
.contain { max-width: 1100px; margin: 0 auto; }
#output-gallery img { border-radius: 8px; }
.summary-box { background: #f8f9ff; border-radius: 8px; padding: 12px; }
.fillable-section { background: #f0fdf4; border-radius: 8px; padding: 12px;
border: 1px solid #bbf7d0; margin-top: 8px; }
footer { display: none !important; }
""",
) as demo:
    # Shared state between inference run and fillable PDF generation
    inference_state = gr.State(None)
    gr.HTML(DESCRIPTION)

    with gr.Row(equal_height=False):
        # ── Left column: Inputs ───────────────────────────────────────────────
        with gr.Column(scale=1, min_width=280):
            file_input = gr.File(
                label="Upload PDF or Image",
                file_types=[".pdf", ".png", ".jpg", ".jpeg", ".tiff", ".bmp"],
                type="filepath",
            )
            conf_slider = gr.Slider(
                minimum=0.10, maximum=0.90, step=0.05,
                value=0.35, label="Confidence Threshold",
                info="Lower = more detections (may include false positives)",
            )
            hq_checkbox = gr.Checkbox(
                label="⚡ High Quality (1024px — slower on CPU)",
                value=False,
            )
            run_btn = gr.Button("🔍 Detect Widgets", variant="primary", size="lg")

            # ── Fillable PDF section ──────────────────────────────────────────
            gr.HTML("""
<div class="fillable-section">
<b>📥 Fillable PDF Export</b><br>
<span style="font-size:0.85rem;color:#555">
After detecting widgets, click below to download a fillable PDF
with interactive text boxes, checkboxes, and signature fields
placed exactly over the detected widget locations.
</span>
</div>
""")
            # Disabled until an inference run has produced a state to export.
            pdf_btn = gr.Button(
                "📥 Download Fillable PDF",
                variant="secondary",
                size="lg",
                interactive=False,
            )
            # Hidden until a fillable PDF has actually been generated.
            pdf_output = gr.File(
                label="Fillable PDF",
                visible=False,
            )
            gr.Examples(
                examples=[
                    ["examples/tt.pdf", 0.35, False],
                    ["examples/mvatform1.pdf", 0.35, False],
                    ["examples/new.pdf", 0.35, False],
                ],
                inputs=[file_input, conf_slider, hq_checkbox],
                label="📂 Example Files (click to load)",
            )

        # ── Right column: Outputs ─────────────────────────────────────────────
        with gr.Column(scale=2):
            with gr.Tabs():
                with gr.TabItem("🖼️ Visual Output"):
                    summary_md = gr.Markdown(
                        "Upload a file and click **Detect Widgets** to see results.",
                        elem_classes=["summary-box"],
                    )
                    gallery = gr.Gallery(
                        label="Detected Widgets",
                        elem_id="output-gallery",
                        columns=1,
                        object_fit="contain",
                        height=700,
                        show_label=False,
                    )
                with gr.TabItem("{ } JSON Output"):
                    gr.Markdown(
                        "The raw JSON response — copy this to integrate the detector into your own app.",
                        elem_classes=["summary-box"],
                    )
                    json_output = gr.Code(
                        language="json",
                        label="Detection Result",
                        lines=35,
                        interactive=False,
                    )

    # ── Inference click ───────────────────────────────────────────────────────
    run_btn.click(
        fn=run_inference,
        inputs=[file_input, conf_slider, hq_checkbox],
        outputs=[gallery, summary_md, json_output, inference_state],
    ).then(
        # Enable the PDF button after successful inference and hide any
        # download that belongs to a previous run.
        fn=_sync_pdf_controls,
        inputs=[inference_state],
        outputs=[pdf_btn, pdf_output],
    )

    # ── Fillable PDF click ────────────────────────────────────────────────────
    pdf_btn.click(
        fn=create_fillable_pdf,
        inputs=[inference_state],
        outputs=[pdf_output],
    ).then(
        # Reveal the download only when a file was actually produced.
        fn=lambda f: gr.update(visible=f is not None),
        inputs=[pdf_output],
        outputs=[pdf_output],
    )

if __name__ == "__main__":
    demo.launch()