Spaces:
Sleeping
Sleeping
"""Parallel PDF-to-markdown extraction with Modal.

Usage:
1. Authenticate Modal once:
    uv run modal token new
2. Run a batch extraction over cached PDFs:
    uv run modal run scripts/modal_pdf_markdown.py::extract_batch --input-manifest artifacts/pdf_manifest.json --output artifacts/pdf_markdown.json

The manifest format is a JSON array of objects like:
    [{"forum_id": "abc123", "pdf_path": "artifacts/openreview_pdfs/abc123.pdf"}]
"""
| from __future__ import annotations | |
| import json | |
| import tempfile | |
| from pathlib import Path | |
| import modal | |
# Modal application object; the name identifies this extraction job.
app = modal.App("peer-review-pdf-markdown")

# Container image, built up one layer at a time:
#   1. Debian slim base pinned to Python 3.11.
#   2. The tesseract-ocr system package (presumably for OCR of scanned
#      pages -- confirm against pymupdf4llm's requirements).
#   3. The PDF libraries used by the extraction function.
image = modal.Image.debian_slim(python_version="3.11")
image = image.apt_install("tesseract-ocr")
image = image.pip_install("pymupdf4llm", "pymupdf")
@app.function(image=image)
def extract_pdf_markdown(item: dict[str, str]) -> dict[str, str]:
    """Convert the PDF named by one manifest entry to markdown.

    Registered as a Modal function (running in ``image``) so that callers
    can fan it out over many entries with ``.map``.

    Args:
        item: Manifest entry holding a ``"pdf_path"`` key (path of the PDF
            to convert) and a ``"forum_id"`` key (identifier copied to the
            result).

    Returns:
        A dict with the entry's ``"forum_id"`` and the extracted text under
        ``"paper_markdown"``.

    Raises:
        KeyError: If ``item`` lacks ``"pdf_path"`` or ``"forum_id"``.
        OSError: If the file at ``pdf_path`` cannot be read.
    """
    # Imported inside the function: the package is pip-installed into the
    # container image, so it is presumably not required in the environment
    # that merely launches the job.
    import pymupdf4llm

    pdf_path = item["pdf_path"]
    forum_id = item["forum_id"]

    # NOTE(review): the path is opened wherever this function executes.
    # When it runs in a remote container, the manifest paths must exist
    # there (e.g. via a mount or volume) -- confirm how files are shipped.
    pdf_bytes = Path(pdf_path).read_bytes()

    # The converter is given a filesystem path, so the bytes are staged in
    # a temporary ".pdf" file that is removed when the block exits.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as handle:
        handle.write(pdf_bytes)
        # Flush so the data is on disk before the file is reopened by name.
        handle.flush()
        markdown = pymupdf4llm.to_markdown(handle.name)

    return {"forum_id": forum_id, "paper_markdown": markdown}
@app.local_entrypoint()
def extract_batch(input_manifest: str, output: str = "artifacts/pdf_markdown.json"):
    """Extract markdown for every PDF in a manifest and save the results.

    Declared as the app's local entrypoint so it can be launched with
    ``modal run <file>::extract_batch``.

    Args:
        input_manifest: Path to a UTF-8 JSON file containing an array of
            objects with ``"forum_id"`` and ``"pdf_path"`` keys.
        output: Path of the JSON file to write. Missing parent directories
            are created.

    Raises:
        OSError: If the manifest cannot be read or the output cannot be
            written.
        json.JSONDecodeError: If the manifest is not valid JSON.
    """
    manifest = json.loads(Path(input_manifest).read_text(encoding="utf-8"))

    # Fan the entries out through the Modal function; order_outputs=True
    # keeps the results in the same order as the manifest entries.
    # NOTE(review): each entry's "pdf_path" is read inside the mapped
    # function, not here -- confirm those paths are reachable where the
    # function executes.
    results = list(extract_pdf_markdown.map(manifest, order_outputs=True))

    output_path = Path(output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
    print(f"Wrote {output_path}")