prelabel / app.py
brodowski7784's picture
Upload app.py
124725f verified
# Module setup. Statement order matters here: the pip install runs before the
# pipeline import, and the sys.path tweak runs before `prelabeler` is imported.
import subprocess, sys
# Install Docling quietly ("-q") with the interpreter running this script.
# check=False means a failed install does not raise; presumably the pipeline
# then reports the "docling_not_installed" error code that run_prelabel()
# turns into a demo-mode notice — TODO confirm against prelabeler.
subprocess.run([sys.executable, "-m", "pip", "install", "docling", "-q"], check=False)
import json
# NOTE: `sys` is already imported above; this repeated import is harmless.
import sys
from pathlib import Path
import gradio as gr
# Put the "pipeline" directory that sits next to this file at the front of the
# import path so the `prelabeler` import below can be resolved from it.
sys.path.insert(0, str(Path(__file__).parent / "pipeline"))
# LABEL_COLORS and detect_doc_type are not referenced in the visible part of
# this file.
from prelabeler import process_pdf, LABELS, LABEL_COLORS, detect_doc_type
# Choices offered by the document-type dropdown; run_prelabel() forwards
# "auto" to the pipeline as None.
DOC_TYPES = ["auto", "math", "newspaper", "research", "presentation"]
def run_prelabel(pdf_files, doc_type_choice):
    """Pre-label every uploaded PDF and build the two outputs shown in the UI.

    Args:
        pdf_files: Uploaded files as delivered by the file input. Each entry
            is handed to ``process_pdf`` unchanged.
        doc_type_choice: Dropdown selection. ``"auto"`` is forwarded to
            ``process_pdf`` as ``None``; any other value is passed through.

    Returns:
        A ``(summary_markdown, results_json)`` tuple. When nothing was
        uploaded the tuple is ``("No files uploaded.", "{}")``.
    """
    if not pdf_files:
        return "No files uploaded.", "{}"

    forced_type = doc_type_choice if doc_type_choice != "auto" else None

    # Run the whole batch first; the report is only assembled afterwards.
    results = [process_pdf(path, forced_type) for path in pdf_files]

    report = []
    for res in results:
        report.append(f"📄 **{res['file']}** ({res.get('doc_type','auto')}) — {res['pages']} page(s)")

        error = res.get("error")
        if error and error != "docling_not_installed":
            # Hard failure: show the error and skip the label breakdown.
            report.append(f" ⚠️ Error: {error}")
        else:
            if error == "docling_not_installed":
                report.append(" ⚠️ Running in demo mode — Docling still installing, try again in 2 mins")
            counts = res.get("label_counts", {})
            # One bullet per label with a positive count, in LABELS order.
            report.extend(
                f" • **{lbl}**: {counts[lbl]} region(s)"
                for lbl in LABELS
                if lbl in counts and counts[lbl] > 0
            )

        # Blank line separates one file's section from the next.
        report.append("")

    return "\n".join(report), json.dumps(results, indent=2)
def _collect_regions(payload, source):
    """Flatten a decoded JSON payload into one flat list of regions.

    Accepted shapes:
      * a flat list of regions (returned unchanged);
      * a single result object carrying a ``"regions"`` key;
      * a list of result objects, whose ``"regions"`` lists are concatenated
        (objects without the key contribute nothing).

    Args:
        payload: Value produced by ``json.loads``.
        source: Human-readable name of the input, used in error messages.

    Returns:
        A list of regions.

    Raises:
        ValueError: If the payload matches none of the accepted shapes.
    """
    if isinstance(payload, dict) and "regions" in payload:
        payload = [payload]
    if not isinstance(payload, list):
        raise ValueError(
            f"{source} must be a list of regions or an object with a 'regions' key"
        )

    # Look at every element rather than only the first one: a batch whose
    # first result has no "regions" key must still be treated as a batch.
    if not any(isinstance(item, dict) and "regions" in item for item in payload):
        return payload

    regions = []
    for item in payload:
        if not isinstance(item, dict):
            raise ValueError(f"{source} mixes result objects with other entries")
        found = item.get("regions")
        if found is None:
            continue
        if not isinstance(found, list):
            raise ValueError(f"{source} has a 'regions' value that is not a list")
        regions.extend(found)
    return regions


def score_gold(pred_json_text, gold_json_text):
    """Score pasted predictions against pasted gold and render a Markdown report.

    Both inputs go through the same normalisation, so either side may be a
    flat region list, a single result object with a ``"regions"`` key, or a
    list of such result objects.

    Args:
        pred_json_text: JSON text of the predictions.
        gold_json_text: JSON text of the gold annotations.

    Returns:
        Markdown text: an overall precision/recall/F1 headline followed by a
        per-label table restricted to labels with a non-zero F1 or precision.
        If either input cannot be decoded or has an unsupported shape, a
        single ``"⚠️ JSON parse error: ..."`` line is returned instead.
    """
    try:
        regions = _collect_regions(json.loads(pred_json_text), "Predictions JSON")
        gold = _collect_regions(json.loads(gold_json_text), "Gold JSON")
    except (TypeError, ValueError, RecursionError) as e:
        # TypeError: input was None / not text; ValueError: malformed JSON
        # (json.JSONDecodeError) or unsupported shape; RecursionError: nesting
        # deeper than the decoder can handle.
        return f"⚠️ JSON parse error: {e}"

    # Imported lazily, only once the inputs are known to be usable.
    from prelabeler import score_against_gold

    m = score_against_gold(regions, gold)
    ov = m["overall"]
    lines = [
        f"## Overall: P={ov['precision']:.1%} R={ov['recall']:.1%} F1={ov['f1']:.1%}",
        "",
        "| Label | Precision | Recall | F1 |",
        "|---|---|---|---|",
    ]
    for lbl in LABELS:
        d = m["per_label"].get(lbl, {})
        if d.get("f1", 0) > 0 or d.get("precision", 0) > 0:
            # .get() keeps a partially filled metrics entry from raising.
            lines.append(
                f"| {lbl} | {d.get('precision', 0):.1%} | {d.get('recall', 0):.1%} | {d.get('f1', 0):.1%} |"
            )
    return "\n".join(lines)
# Gradio UI: three tabs (pre-labeling, scoring against gold, label guide).
# NOTE(review): the nesting below is reconstructed from the order of the
# components — confirm it against the deployed layout.
# The triple-quoted Markdown blocks are kept flush-left so their contents stay
# exactly as written.
with gr.Blocks(
    title="PreLabel · Document Layout Annotator",
    theme=gr.themes.Soft(primary_hue="blue"),
) as demo:
    # Page header shown above the tabs.
    gr.Markdown("""
# 🏷️ PreLabel — Document Layout Pre-Labeling
**iMerit Annotation Pipeline** · 10 label classes · Math · Newspaper · Presentation · Research
Automatically pre-labels PDF regions as: `Text` · `Equation` · `Title` · `Section Header` · `Table` · `Caption` · `Figure` · `Page Number` · `Footer` · `Page-footer`
""")
    with gr.Tabs():
        # Tab 1: upload PDFs, pick a document type, run the pre-labeler.
        with gr.Tab("📄 Pre-Label PDFs"):
            with gr.Row():
                # Left column: inputs and the run button.
                with gr.Column(scale=1):
                    pdf_input = gr.File(
                        label="Upload PDF files",
                        file_types=[".pdf"],
                        file_count="multiple",
                    )
                    doc_type_input = gr.Dropdown(
                        choices=DOC_TYPES,
                        value="auto",
                        label="Document type",
                        info="'auto' detects from filename. Override if needed."
                    )
                    run_btn = gr.Button("▶ Run Pre-Labeling", variant="primary", size="lg")
                # Right column: Markdown summary and the raw JSON results.
                with gr.Column(scale=2):
                    summary_out = gr.Markdown(value="*Results will appear here after running.*")
                    gr.Markdown("### 📥 JSON output")
                    json_out = gr.Code(label="Pre-label JSON", language="json", lines=20)
            # run_prelabel returns (summary, json); outputs are listed in that order.
            run_btn.click(fn=run_prelabel, inputs=[pdf_input, doc_type_input], outputs=[summary_out, json_out])
        # Tab 2: paste predictions and gold JSON, get precision/recall/F1.
        with gr.Tab("📊 Score Against Gold"):
            gr.Markdown("""
Paste your pre-label JSON and gold JSON to compute Precision / Recall / F1.
**Gold JSON format:**
```json
[{"label": "Text", "bbox": {"x0":72,"y0":110,"x1":540,"y1":200}, "page": 1}]
```
""")
            with gr.Row():
                pred_input = gr.Code(label="Predictions JSON", language="json", lines=12)
                gold_input = gr.Code(label="Gold JSON", language="json", lines=12)
            score_btn = gr.Button("📊 Compute Metrics", variant="primary")
            metrics_out = gr.Markdown()
            # score_gold returns one Markdown string rendered into metrics_out.
            score_btn.click(fn=score_gold, inputs=[pred_input, gold_input], outputs=metrics_out)
        # Tab 3: static reference text describing the label schema.
        with gr.Tab("📖 Label Guide"):
            gr.Markdown("""
## Label schema
| Label | Description |
|---|---|
| **Text** | Body text, paragraphs, bullet points, list items |
| **Equation** | Standalone math formulas (inline equations in math/research docs are ignored) |
| **Title** | Document or article title |
| **Section Header** | Section/subsection headings |
| **Table** | Data tables (not captions) |
| **Caption** | Figure/table captions — starts with Fig., Table., etc. |
| **Figure** | Images, charts, diagrams, illustrations |
| **Page Number** | Standalone page numbers |
| **Footer** | Copyright lines, source notes at bottom |
| **Page-footer** | Running footer with page number / journal reference |
## Guidelines applied
- **Math & Research:** inline equations inside text are not labeled — only standalone equations
- **Newspapers:** multiple paragraphs in the same column/section can share one bounding box
- **Footer vs Page-footer:** these are two distinct labels, never merged
""")
# Start the app with Gradio's default launch settings. There is no
# `if __name__ == "__main__"` guard, so this also runs when the module is
# imported.
demo.launch()