"""Gradio app that pre-labels PDF layout regions and scores them against gold.

Running this module has side effects, in this order: a best-effort
``pip install docling``, an insertion of the sibling ``pipeline`` directory
at the front of ``sys.path``, construction of the UI, and ``demo.launch()``.
"""

import json
import subprocess
import sys
from pathlib import Path

# Best-effort dependency bootstrap: ``check=False`` means a failed install is
# ignored. Kept ahead of the ``prelabeler`` import on purpose — presumably
# that module probes for docling when imported (TODO confirm).
subprocess.run(
    [sys.executable, "-m", "pip", "install", "docling", "-q"],
    check=False,
)

import gradio as gr  # noqa: E402

# ``prelabeler`` is resolved from the ``pipeline`` directory next to this file.
sys.path.insert(0, str(Path(__file__).parent / "pipeline"))

from prelabeler import (  # noqa: E402
    LABEL_COLORS,
    LABELS,
    detect_doc_type,
    process_pdf,
)

DOC_TYPES = ["auto", "math", "newspaper", "research", "presentation"]

# Markdown literals are kept at column 0 so that tables, lists and fenced code
# render correctly regardless of how the UI component treats indentation.
_HEADER_MD = """
# 🏷️ PreLabel — Document Layout Pre-Labeling
**iMerit Annotation Pipeline** · 10 label classes · Math · Newspaper · Presentation · Research

Automatically pre-labels PDF regions as:
`Text` · `Equation` · `Title` · `Section Header` · `Table` · `Caption` · `Figure` · `Page Number` · `Footer` · `Page-footer`
"""

_SCORING_HELP_MD = """
Paste your pre-label JSON and gold JSON to compute Precision / Recall / F1.

**Gold JSON format:**
```json
[{"label": "Text", "bbox": {"x0":72,"y0":110,"x1":540,"y1":200}, "page": 1}]
```
"""

_LABEL_GUIDE_MD = """
## Label schema

| Label | Description |
|---|---|
| **Text** | Body text, paragraphs, bullet points, list items |
| **Equation** | Standalone math formulas (inline equations in math/research docs are ignored) |
| **Title** | Document or article title |
| **Section Header** | Section/subsection headings |
| **Table** | Data tables (not captions) |
| **Caption** | Figure/table captions — starts with Fig., Table., etc. |
| **Figure** | Images, charts, diagrams, illustrations |
| **Page Number** | Standalone page numbers |
| **Footer** | Copyright lines, source notes at bottom |
| **Page-footer** | Running footer with page number / journal reference |

## Guidelines applied
- **Math & Research:** inline equations inside text are not labeled — only standalone equations
- **Newspapers:** multiple paragraphs in the same column/section can share one bounding box
- **Footer vs Page-footer:** these are two distinct labels, never merged
"""


def _summarize_result(result):
    """Return the Markdown summary lines for one ``process_pdf`` result dict.

    Reads the keys ``file``, ``doc_type``, ``pages``, ``error`` and
    ``label_counts``; every one of them is treated as optional so a partial
    result still produces a summary instead of raising ``KeyError``.
    """
    lines = [
        f"📄 **{result.get('file', 'unknown')}** "
        f"({result.get('doc_type', 'auto')}) — {result.get('pages', 0)} page(s)"
    ]
    error = result.get("error")
    if error and error != "docling_not_installed":
        lines.append(f" ⚠️ Error: {error}")
    else:
        # "docling_not_installed" is not treated as fatal: the label counts
        # are still listed, preceded by a notice.
        if error == "docling_not_installed":
            lines.append(
                " ⚠️ Running in demo mode — Docling still installing, try again in 2 mins"
            )
        counts = result.get("label_counts") or {}
        # Iterate LABELS (not the dict) so rows follow the schema order.
        lines.extend(
            f" • **{label}**: {counts[label]} region(s)"
            for label in LABELS
            if counts.get(label, 0) > 0
        )
    lines.append("")  # blank separator between files
    return lines


def run_prelabel(pdf_files, doc_type_choice):
    """Pre-label every uploaded PDF and build the UI outputs.

    Args:
        pdf_files: The value of the multi-file upload component; each item is
            passed unchanged to ``process_pdf``. May be ``None`` or empty.
        doc_type_choice: One of ``DOC_TYPES``. ``"auto"`` is forwarded to
            ``process_pdf`` as ``None``.

    Returns:
        A ``(summary_markdown, results_json)`` pair. With no files the pair is
        ``("No files uploaded.", "{}")``.
    """
    if not pdf_files:
        return "No files uploaded.", "{}"

    doc_type = None if doc_type_choice == "auto" else doc_type_choice
    all_results = [process_pdf(pdf_path, doc_type) for pdf_path in pdf_files]

    summary_lines = []
    for result in all_results:
        summary_lines.extend(_summarize_result(result))

    return "\n".join(summary_lines), json.dumps(all_results, indent=2)


def _extract_regions(payload):
    """Normalise decoded JSON into the object handed to the scorer.

    Accepted shapes:
      * ``{"regions": [...]}``              -> the ``regions`` value
      * ``[{"regions": [...]}, ...]``       -> all ``regions`` concatenated
      * anything else                       -> returned unchanged
    """
    if isinstance(payload, dict) and "regions" in payload:
        return payload["regions"]
    if (
        isinstance(payload, list)
        and payload
        and isinstance(payload[0], dict)
        and "regions" in payload[0]
    ):
        regions = []
        for result in payload:
            # ``or []`` tolerates an explicit JSON null for "regions".
            regions.extend(result.get("regions") or [])
        return regions
    return payload


def score_gold(pred_json_text, gold_json_text):
    """Score predicted regions against gold regions and render Markdown.

    Args:
        pred_json_text: JSON text of predictions (see ``_extract_regions`` for
            the accepted shapes).
        gold_json_text: JSON text of gold annotations, same accepted shapes.

    Returns:
        A Markdown string: an overall P/R/F1 heading plus a per-label table,
        or a line starting with ``⚠️`` when input is missing or malformed.
    """
    if not (pred_json_text or "").strip() or not (gold_json_text or "").strip():
        return "⚠️ Paste both predictions JSON and gold JSON before scoring."

    try:
        regions = _extract_regions(json.loads(pred_json_text))
        gold = _extract_regions(json.loads(gold_json_text))
    except (TypeError, ValueError, AttributeError) as e:
        # json.JSONDecodeError is a ValueError; the other two cover payloads
        # whose items are not the dicts the normaliser expects.
        return f"⚠️ JSON parse error: {e}"

    # Imported lazily, as in the original, so the rest of the app does not
    # depend on this symbol at import time.
    from prelabeler import score_against_gold

    m = score_against_gold(regions, gold)
    ov = m["overall"]
    lines = [
        f"## Overall: P={ov['precision']:.1%} R={ov['recall']:.1%} F1={ov['f1']:.1%}",
        "",
        "| Label | Precision | Recall | F1 |",
        "|---|---|---|---|",
    ]
    for lbl in LABELS:
        d = m["per_label"].get(lbl, {})
        precision = d.get("precision", 0)
        recall = d.get("recall", 0)
        f1 = d.get("f1", 0)
        # Only labels with a non-zero precision or F1 get a table row.
        if f1 > 0 or precision > 0:
            lines.append(f"| {lbl} | {precision:.1%} | {recall:.1%} | {f1:.1%} |")
    return "\n".join(lines)


with gr.Blocks(
    title="PreLabel · Document Layout Annotator",
    theme=gr.themes.Soft(primary_hue="blue"),
) as demo:
    gr.Markdown(_HEADER_MD)

    with gr.Tabs():
        with gr.Tab("📄 Pre-Label PDFs"):
            with gr.Row():
                with gr.Column(scale=1):
                    pdf_input = gr.File(
                        label="Upload PDF files",
                        file_types=[".pdf"],
                        file_count="multiple",
                    )
                    doc_type_input = gr.Dropdown(
                        choices=DOC_TYPES,
                        value="auto",
                        label="Document type",
                        info="'auto' detects from filename. Override if needed.",
                    )
                    run_btn = gr.Button(
                        "▶ Run Pre-Labeling", variant="primary", size="lg"
                    )
                with gr.Column(scale=2):
                    summary_out = gr.Markdown(
                        value="*Results will appear here after running.*"
                    )
                    gr.Markdown("### 📥 JSON output")
                    json_out = gr.Code(
                        label="Pre-label JSON", language="json", lines=20
                    )
            run_btn.click(
                fn=run_prelabel,
                inputs=[pdf_input, doc_type_input],
                outputs=[summary_out, json_out],
            )

        with gr.Tab("📊 Score Against Gold"):
            gr.Markdown(_SCORING_HELP_MD)
            with gr.Row():
                pred_input = gr.Code(
                    label="Predictions JSON", language="json", lines=12
                )
                gold_input = gr.Code(label="Gold JSON", language="json", lines=12)
            score_btn = gr.Button("📊 Compute Metrics", variant="primary")
            metrics_out = gr.Markdown()
            score_btn.click(
                fn=score_gold,
                inputs=[pred_input, gold_input],
                outputs=metrics_out,
            )

        with gr.Tab("📖 Label Guide"):
            gr.Markdown(_LABEL_GUIDE_MD)

demo.launch()