Spaces:
Paused
Paused
# Startup bootstrap: pip-install Docling with the current interpreter before
# the pipeline import further down. check=False means a failed install does
# not raise here.
import subprocess, sys
subprocess.run([sys.executable, "-m", "pip", "install", "docling", "-q"], check=False)
import json
import sys
from pathlib import Path
import gradio as gr
# Put the sibling "pipeline" directory first on sys.path; presumably that is
# where the prelabeler module imported on the next line lives.
sys.path.insert(0, str(Path(__file__).parent / "pipeline"))
from prelabeler import process_pdf, LABELS, LABEL_COLORS, detect_doc_type
# Choices offered by the document-type dropdown. run_prelabel translates
# "auto" into None before calling process_pdf.
DOC_TYPES = ["auto", "math", "newspaper", "research", "presentation"]
def run_prelabel(pdf_files, doc_type_choice):
    """Pre-label every uploaded PDF and build the two UI outputs.

    Args:
        pdf_files: Uploaded files as delivered by the file-upload component;
            each item is handed unchanged to ``process_pdf``.
        doc_type_choice: Dropdown value. ``"auto"`` is forwarded to
            ``process_pdf`` as ``None``; any other value is forwarded as-is.

    Returns:
        A ``(summary, json_text)`` tuple: a Markdown summary with one header
        line per file followed by either its error or its per-label region
        counts, and the full result list serialized as indented JSON. With no
        files the tuple is ``("No files uploaded.", "{}")``.
    """
    if not pdf_files:
        return "No files uploaded.", "{}"

    forced_type = doc_type_choice if doc_type_choice != "auto" else None
    results = [process_pdf(path, forced_type) for path in pdf_files]

    report = []
    for entry in results:
        report.append(f"📄 **{entry['file']}** ({entry.get('doc_type','auto')}) — {entry['pages']} page(s)")
        error = entry.get("error")
        if error and error != "docling_not_installed":
            # A real failure: show the error and skip the label counts.
            report.append(f" ⚠️ Error: {error}")
        else:
            # "docling_not_installed" is treated as a soft warning; whatever
            # counts the result carries are still listed below it.
            if error == "docling_not_installed":
                report.append(" ⚠️ Running in demo mode — Docling still installing, try again in 2 mins")
            tallies = entry.get("label_counts", {})
            report.extend(
                f" • **{label}**: {tallies[label]} region(s)"
                for label in LABELS
                if tallies.get(label, 0) > 0
            )
        # Blank line separates consecutive files in the rendered Markdown.
        report.append("")

    return "\n".join(report), json.dumps(results, indent=2)
def score_gold(pred_json_text, gold_json_text):
    """Score pasted prediction JSON against gold JSON and render a Markdown report.

    Args:
        pred_json_text: JSON text holding one of: a flat list of region
            dicts; a list of per-file result dicts that each carry a
            ``"regions"`` list (the shape produced by the pre-label tab); or
            a single result dict with a ``"regions"`` list.
        gold_json_text: JSON text holding a list of gold region dicts, or a
            dict with a ``"regions"`` list.

    Returns:
        A Markdown string with the overall precision/recall/F1 line and a
        per-label table (labels whose F1 and precision are both zero are
        omitted). If the text cannot be parsed, or does not reduce to a list
        of regions, a one-line "⚠️ ..." message is returned instead.
    """
    try:
        preds = json.loads(pred_json_text)
        # Accept a single result object the same way the gold side does.
        if isinstance(preds, dict) and "regions" in preds:
            preds = [preds]
        if isinstance(preds, list) and preds and "regions" in preds[0]:
            regions = []
            for result in preds:
                regions.extend(result.get("regions", []))
        else:
            regions = preds
        gold = json.loads(gold_json_text)
        if isinstance(gold, dict) and "regions" in gold:
            gold = gold["regions"]
    except Exception as e:
        # UI boundary: any failure while decoding or unwrapping is reported
        # to the user rather than raised.
        return f"⚠️ JSON parse error: {e}"

    # Reject payloads that are valid JSON but not a list of regions before
    # they reach the scorer.
    if not isinstance(regions, list) or not isinstance(gold, list):
        return (
            "⚠️ JSON format error: predictions and gold must each be a list "
            'of regions (or an object with a "regions" list).'
        )

    from prelabeler import score_against_gold

    m = score_against_gold(regions, gold)
    ov = m["overall"]
    lines = [
        f"## Overall: P={ov['precision']:.1%} R={ov['recall']:.1%} F1={ov['f1']:.1%}",
        "",
        "| Label | Precision | Recall | F1 |",
        "|---|---|---|---|",
    ]
    for lbl in LABELS:
        d = m["per_label"].get(lbl, {})
        if d.get("f1", 0) > 0 or d.get("precision", 0) > 0:
            lines.append(f"| {lbl} | {d['precision']:.1%} | {d['recall']:.1%} | {d['f1']:.1%} |")
    return "\n".join(lines)
# Top-level UI: one Blocks app with three tabs (pre-labeling, scoring against
# gold, and a static label guide).
with gr.Blocks(
    title="PreLabel · Document Layout Annotator",
    theme=gr.themes.Soft(primary_hue="blue"),
) as demo:
    # Page header shown above the tabs.
    gr.Markdown("""
# 🏷️ PreLabel — Document Layout Pre-Labeling
**iMerit Annotation Pipeline** · 10 label classes · Math · Newspaper · Presentation · Research
Automatically pre-labels PDF regions as: `Text` · `Equation` · `Title` · `Section Header` · `Table` · `Caption` · `Figure` · `Page Number` · `Footer` · `Page-footer`
""")
    with gr.Tabs():
        # Tab 1: upload PDFs, pick a document type, run the pre-labeler.
        with gr.Tab("📄 Pre-Label PDFs"):
            with gr.Row():
                # Left column: inputs and the run button.
                with gr.Column(scale=1):
                    pdf_input = gr.File(
                        label="Upload PDF files",
                        file_types=[".pdf"],
                        file_count="multiple",
                    )
                    doc_type_input = gr.Dropdown(
                        choices=DOC_TYPES,
                        value="auto",
                        label="Document type",
                        info="'auto' detects from filename. Override if needed."
                    )
                    run_btn = gr.Button("▶ Run Pre-Labeling", variant="primary", size="lg")
                # Right column: Markdown summary plus the raw JSON result.
                with gr.Column(scale=2):
                    summary_out = gr.Markdown(value="*Results will appear here after running.*")
                    gr.Markdown("### 📥 JSON output")
                    json_out = gr.Code(label="Pre-label JSON", language="json", lines=20)
            # run_prelabel returns (summary, json_text), matching the two outputs.
            run_btn.click(fn=run_prelabel, inputs=[pdf_input, doc_type_input], outputs=[summary_out, json_out])
        # Tab 2: paste predictions and gold JSON, get precision/recall/F1.
        with gr.Tab("📊 Score Against Gold"):
            gr.Markdown("""
Paste your pre-label JSON and gold JSON to compute Precision / Recall / F1.
**Gold JSON format:**
```json
[{"label": "Text", "bbox": {"x0":72,"y0":110,"x1":540,"y1":200}, "page": 1}]
```
""")
            with gr.Row():
                pred_input = gr.Code(label="Predictions JSON", language="json", lines=12)
                gold_input = gr.Code(label="Gold JSON", language="json", lines=12)
            score_btn = gr.Button("📊 Compute Metrics", variant="primary")
            metrics_out = gr.Markdown()
            # score_gold returns a single Markdown string rendered in metrics_out.
            score_btn.click(fn=score_gold, inputs=[pred_input, gold_input], outputs=metrics_out)
        # Tab 3: static reference describing each label and the guidelines.
        with gr.Tab("📖 Label Guide"):
            gr.Markdown("""
## Label schema
| Label | Description |
|---|---|
| **Text** | Body text, paragraphs, bullet points, list items |
| **Equation** | Standalone math formulas (inline equations in math/research docs are ignored) |
| **Title** | Document or article title |
| **Section Header** | Section/subsection headings |
| **Table** | Data tables (not captions) |
| **Caption** | Figure/table captions — starts with Fig., Table., etc. |
| **Figure** | Images, charts, diagrams, illustrations |
| **Page Number** | Standalone page numbers |
| **Footer** | Copyright lines, source notes at bottom |
| **Page-footer** | Running footer with page number / journal reference |
## Guidelines applied
- **Math & Research:** inline equations inside text are not labeled — only standalone equations
- **Newspapers:** multiple paragraphs in the same column/section can share one bounding box
- **Footer vs Page-footer:** these are two distinct labels, never merged
""")
# Start the Gradio app; this runs at module level, with no __main__ guard.
demo.launch()