Spaces:

brodowski7784
/

prelabel

Paused

App Files Files Community

prelabel / app.py

brodowski7784

Upload app.py

124725f verified about 1 month ago

raw

history blame contribute delete

5.95 kB

	# Best-effort runtime install of Docling with the interpreter running this app.
	# check=False means a failed install does not raise here; run_prelabel later
	# reports the "docling_not_installed" error value as "demo mode".
	import subprocess, sys
	subprocess.run([sys.executable, "-m", "pip", "install", "docling", "-q"], check=False)

	import json
	import sys
	from pathlib import Path

	import gradio as gr

	# Put the "pipeline" directory that sits next to this file at the front of
	# sys.path so the `prelabeler` module below can be imported by bare name.
	sys.path.insert(0, str(Path(__file__).parent / "pipeline"))
	from prelabeler import process_pdf, LABELS, LABEL_COLORS, detect_doc_type

	# Choices shown in the "Document type" dropdown. "auto" is translated to None
	# by run_prelabel before it calls process_pdf.
	DOC_TYPES = ["auto", "math", "newspaper", "research", "presentation"]

	def _summarize_result(report):
	    """Build the summary lines for one process_pdf result dict."""
	    block = [f"📄 {report['file']} ({report.get('doc_type','auto')}) — {report['pages']} page(s)"]
	    failure = report.get("error")
	    if failure and failure != "docling_not_installed":
	        # Any error other than the missing-Docling marker replaces the label counts.
	        block.append(f" ⚠️ Error: {failure}")
	        return block
	    if failure == "docling_not_installed":
	        block.append(" ⚠️ Running in demo mode — Docling still installing, try again in 2 mins")
	    tally = report.get("label_counts", {})
	    # Follow the order of LABELS and skip labels with no detected regions.
	    block.extend(
	        f" • {name}: {tally[name]} region(s)"
	        for name in LABELS
	        if name in tally and tally[name] > 0
	    )
	    return block


	def run_prelabel(pdf_files, doc_type_choice):
	    """Pre-label the uploaded PDFs and report the outcome.

	    Args:
	        pdf_files: Iterable of uploaded files as delivered by the upload
	            widget; each item is handed unchanged to ``process_pdf``.
	        doc_type_choice: Selected document type. ``"auto"`` is passed to
	            ``process_pdf`` as ``None``; any other value is passed through.

	    Returns:
	        A ``(summary, json_text)`` tuple: a newline-joined, human-readable
	        summary with one section per file, and the full list of results
	        serialised as indented JSON. With no files the result is
	        ``("No files uploaded.", "{}")``.
	    """
	    if not pdf_files:
	        return "No files uploaded.", "{}"

	    forced_type = doc_type_choice if doc_type_choice != "auto" else None

	    # Process every file before any summary text is produced.
	    reports = [process_pdf(upload, forced_type) for upload in pdf_files]

	    text_lines = []
	    for report in reports:
	        text_lines.extend(_summarize_result(report))
	        text_lines.append("")  # blank separator after each file

	    return "\n".join(text_lines), json.dumps(reports, indent=2)


	def _flatten_regions(payload):
	    """Return the flat region list carried by a decoded JSON payload."""
	    # A single result object: {"regions": [...]}.
	    if isinstance(payload, dict) and "regions" in payload:
	        return payload["regions"]
	    # A list of per-file results, each carrying its own "regions" list.
	    if isinstance(payload, list) and payload and "regions" in payload[0]:
	        return [region for entry in payload for region in entry.get("regions", [])]
	    # Otherwise assume the payload already is the region list.
	    return payload


	def score_gold(pred_json_text, gold_json_text):
	    """Score pre-label predictions against gold annotations.

	    Both inputs accept the same three JSON shapes: a flat list of regions,
	    a single ``{"regions": [...]}`` object, or a list of per-file results
	    that each hold a ``"regions"`` list.

	    Args:
	        pred_json_text: JSON text with the predicted regions.
	        gold_json_text: JSON text with the gold regions.

	    Returns:
	        A Markdown string: an overall precision/recall/F1 heading followed by
	        a per-label table, or a ``⚠️ JSON parse error: ...`` message when an
	        input cannot be decoded or does not resolve to a list of regions.
	    """
	    try:
	        regions = _flatten_regions(json.loads(pred_json_text))
	        gold = _flatten_regions(json.loads(gold_json_text))
	    except Exception as e:
	        # UI boundary: malformed or unexpectedly shaped input becomes a
	        # message in the output panel instead of an unhandled error.
	        return f"⚠️ JSON parse error: {e}"

	    # Reject non-list payloads (e.g. "{}") before they reach the scorer.
	    for role, value in (("predictions", regions), ("gold", gold)):
	        if not isinstance(value, list):
	            return (
	                f"⚠️ JSON parse error: {role} must be a list of regions, "
	                f"got {type(value).__name__}"
	            )

	    # Imported here, after validation, exactly where the original imported it.
	    from prelabeler import score_against_gold
	    m = score_against_gold(regions, gold)
	    ov = m["overall"]
	    # Plain "|" delimiters: a backslash-escaped pipe is rendered as a literal
	    # character rather than a cell boundary, and "\|" is not a valid Python
	    # string escape.
	    lines = [
	        f"## Overall: P={ov['precision']:.1%} R={ov['recall']:.1%} F1={ov['f1']:.1%}",
	        "",
	        "| Label | Precision | Recall | F1 |",
	        "|---|---|---|---|",
	    ]
	    for lbl in LABELS:
	        d = m["per_label"].get(lbl, {})
	        precision = d.get("precision", 0)
	        recall = d.get("recall", 0)
	        f1 = d.get("f1", 0)
	        # Only labels with a non-zero precision or F1 get a row.
	        if f1 > 0 or precision > 0:
	            lines.append(f"| {lbl} | {precision:.1%} | {recall:.1%} | {f1:.1%} |")
	    return "\n".join(lines)


	# Three-tab Gradio UI: run pre-labeling, score against gold, and a label guide.
	with gr.Blocks(
	    title="PreLabel · Document Layout Annotator",
	    theme=gr.themes.Soft(primary_hue="blue"),
	) as demo:

	    # Page header shown above the tabs.
	    gr.Markdown("""
	# 🏷️ PreLabel — Document Layout Pre-Labeling
	iMerit Annotation Pipeline · 10 label classes · Math · Newspaper · Presentation · Research

	Automatically pre-labels PDF regions as: `Text` · `Equation` · `Title` · `Section Header` · `Table` · `Caption` · `Figure` · `Page Number` · `Footer` · `Page-footer`
	""")

	    with gr.Tabs():
	        # Tab 1: upload PDFs, choose a document type, run the pre-labeler.
	        with gr.Tab("📄 Pre-Label PDFs"):
	            with gr.Row():
	                with gr.Column(scale=1):
	                    pdf_input = gr.File(
	                        label="Upload PDF files",
	                        file_types=[".pdf"],
	                        file_count="multiple",
	                    )
	                    doc_type_input = gr.Dropdown(
	                        choices=DOC_TYPES,
	                        value="auto",
	                        label="Document type",
	                        info="'auto' detects from filename. Override if needed."
	                    )
	                    run_btn = gr.Button("▶ Run Pre-Labeling", variant="primary", size="lg")
	                with gr.Column(scale=2):
	                    summary_out = gr.Markdown(value="Results will appear here after running.")
	                    gr.Markdown("### 📥 JSON output")
	                    json_out = gr.Code(label="Pre-label JSON", language="json", lines=20)
	            # run_prelabel returns (summary, json_text), matching the two outputs.
	            run_btn.click(fn=run_prelabel, inputs=[pdf_input, doc_type_input], outputs=[summary_out, json_out])

	        # Tab 2: paste predictions and gold JSON, get precision/recall/F1.
	        with gr.Tab("📊 Score Against Gold"):
	            gr.Markdown("""
	Paste your pre-label JSON and gold JSON to compute Precision / Recall / F1.

	Gold JSON format:
	```json
	[{"label": "Text", "bbox": {"x0":72,"y0":110,"x1":540,"y1":200}, "page": 1}]
	```
	""")
	            with gr.Row():
	                pred_input = gr.Code(label="Predictions JSON", language="json", lines=12)
	                gold_input = gr.Code(label="Gold JSON", language="json", lines=12)
	            score_btn = gr.Button("📊 Compute Metrics", variant="primary")
	            metrics_out = gr.Markdown()
	            score_btn.click(fn=score_gold, inputs=[pred_input, gold_input], outputs=metrics_out)

	        # Tab 3: static reference for the label schema.
	        with gr.Tab("📖 Label Guide"):
	            # The table uses plain "|" delimiters: a backslash-escaped pipe is
	            # rendered as a literal character instead of a cell boundary, and
	            # "\|" is not a valid Python string escape.
	            gr.Markdown("""
	## Label schema

	| Label | Description |
	|---|---|
	| Text | Body text, paragraphs, bullet points, list items |
	| Equation | Standalone math formulas (inline equations in math/research docs are ignored) |
	| Title | Document or article title |
	| Section Header | Section/subsection headings |
	| Table | Data tables (not captions) |
	| Caption | Figure/table captions — starts with Fig., Table., etc. |
	| Figure | Images, charts, diagrams, illustrations |
	| Page Number | Standalone page numbers |
	| Footer | Copyright lines, source notes at bottom |
	| Page-footer | Running footer with page number / journal reference |

	## Guidelines applied
	- Math & Research: inline equations inside text are not labeled — only standalone equations
	- Newspapers: multiple paragraphs in the same column/section can share one bounding box
	- Footer vs Page-footer: these are two distinct labels, never merged
	""")

	# Start the app as soon as the module runs (kept as an unguarded call).
	demo.launch()