Spaces:

Samarth0710
/

peer-review-env

Sleeping

App Files Files Community

peer-review-env / scripts /modal_pdf_markdown.py

Samarth0710

Update 5-paper compliant benchmark and inference

f71c9be verified about 2 months ago

raw

history blame contribute delete

1.7 kB

	"""Parallel PDF-to-markdown extraction with Modal.

	Usage:
	1. Authenticate Modal once:
	uv run modal token new

	2. Run a batch extraction over cached PDFs:
	uv run modal run scripts/modal_pdf_markdown.py::extract_batch --input-manifest artifacts/pdf_manifest.json --output artifacts/pdf_markdown.json

	The manifest format is a JSON array of objects like:
	[{"forum_id": "abc123", "pdf_path": "artifacts/openreview_pdfs/abc123.pdf"}]
	"""

	from __future__ import annotations

	import json
	import tempfile
	from pathlib import Path

	import modal


	# Modal application object; the remote function and the local entrypoint
	# defined further down in this file are registered on it via decorators.
	app = modal.App("peer-review-pdf-markdown")

	# Container image used by the remote function: Debian slim with Python 3.11,
	# the `tesseract-ocr` system package, and the `pymupdf4llm` / `pymupdf`
	# Python packages that perform the PDF-to-markdown conversion.
	image = (
	modal.Image.debian_slim(python_version="3.11")
	.apt_install("tesseract-ocr")
	.pip_install("pymupdf4llm", "pymupdf")
	)


	@app.function(image=image, cpu=2, memory=4096, timeout=1800)
	def extract_pdf_markdown(
	    item: dict[str, str], pdf_bytes: bytes | None = None
	) -> dict[str, str]:
	    """Convert a single PDF to markdown inside the Modal container.

	    Args:
	        item: Manifest entry with at least ``"forum_id"``. ``"pdf_path"`` is
	            only consulted when ``pdf_bytes`` is not supplied.
	        pdf_bytes: Raw contents of the PDF. The batch entrypoint reads the
	            file on the local machine and ships the bytes here, because the
	            manifest paths point at the local cache and are not part of the
	            container image. When ``None``, the function falls back to
	            reading ``item["pdf_path"]`` from the container filesystem.

	    Returns:
	        ``{"forum_id": ..., "paper_markdown": ...}`` for this PDF.

	    Raises:
	        KeyError: If ``"forum_id"`` is missing, or ``"pdf_path"`` is missing
	            while ``pdf_bytes`` is ``None``.
	        FileNotFoundError: If the fallback path does not exist in the container.
	    """
	    # Imported here rather than at module top: the package is pip-installed
	    # into the Modal image, so it is only needed where this function runs.
	    import pymupdf4llm

	    forum_id = item["forum_id"]
	    if pdf_bytes is None:
	        pdf_bytes = Path(item["pdf_path"]).read_bytes()

	    # The converter is handed a filename, so spill the bytes to a temporary
	    # file. Conversion must finish before the ``with`` block exits, since the
	    # file is deleted on close.
	    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as handle:
	        handle.write(pdf_bytes)
	        handle.flush()
	        markdown = pymupdf4llm.to_markdown(handle.name)

	    return {"forum_id": forum_id, "paper_markdown": markdown}


	@app.local_entrypoint()
	def extract_batch(input_manifest: str, output: str = "artifacts/pdf_markdown.json"):
	    """Run markdown extraction for every PDF in a manifest and save the results.

	    Args:
	        input_manifest: Path to a JSON array of objects with ``"forum_id"`` and
	            ``"pdf_path"`` keys.
	        output: Destination JSON file; parent directories are created if needed.

	    Raises:
	        FileNotFoundError: If the manifest or any listed PDF is missing locally.
	    """
	    manifest = json.loads(Path(input_manifest).read_text(encoding="utf-8"))

	    # Read the PDFs here, on the machine running `modal run`: the manifest
	    # paths refer to the local cache, which the remote container cannot see.
	    # All PDFs are loaded before dispatch, so a missing file fails fast
	    # before any remote work is started.
	    pdf_blobs = [Path(item["pdf_path"]).read_bytes() for item in manifest]

	    # The two iterables are zipped by ``map`` into (item, pdf_bytes) calls;
	    # ``order_outputs=True`` keeps results aligned with the manifest order.
	    results = list(extract_pdf_markdown.map(manifest, pdf_blobs, order_outputs=True))

	    output_path = Path(output)
	    output_path.parent.mkdir(parents=True, exist_ok=True)
	    output_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
	    print(f"Wrote {output_path}")