peer-review-env / scripts /modal_pdf_markdown.py
Samarth0710's picture
Update 5-paper compliant benchmark and inference
f71c9be verified
"""Parallel PDF-to-markdown extraction with Modal.
Usage:
1. Authenticate Modal once:
uv run modal token new
2. Run a batch extraction over cached PDFs:
uv run modal run scripts/modal_pdf_markdown.py::extract_batch --input-manifest artifacts/pdf_manifest.json --output artifacts/pdf_markdown.json
The manifest format is a JSON array of objects like:
[{"forum_id": "abc123", "pdf_path": "artifacts/openreview_pdfs/abc123.pdf"}]
"""
from __future__ import annotations
import json
import tempfile
from pathlib import Path
import modal
# Modal application that the remote function and the local entrypoint below
# are registered on.
app = modal.App("peer-review-pdf-markdown")

# Container image for the remote function, built one layer at a time:
# Debian slim with Python 3.11, then the Tesseract OCR system package, then
# the PyMuPDF Python packages used for the markdown conversion.
image = modal.Image.debian_slim(python_version="3.11")
image = image.apt_install("tesseract-ocr")
image = image.pip_install("pymupdf4llm", "pymupdf")
@app.function(image=image, cpu=2, memory=4096, timeout=1800)
def extract_pdf_markdown(item: dict[str, object]) -> dict[str, str]:
    """Convert a single PDF to markdown inside the Modal container.

    Args:
        item: One manifest entry. ``forum_id`` is required. The PDF content
            is taken from ``pdf_bytes`` when that key is present; otherwise
            it is read from ``pdf_path``, which must then exist on the
            filesystem of the container running this function.

    Returns:
        A dict with the entry's ``forum_id`` and the extracted text under
        ``paper_markdown``.

    Raises:
        KeyError: If ``forum_id`` is missing, or if neither ``pdf_bytes``
            nor ``pdf_path`` is provided.
        FileNotFoundError: If the ``pdf_path`` fallback is used and the file
            is not present where this function runs.
    """
    # Imported inside the function: the package is installed by the image
    # definition, so it is only guaranteed to exist where this body runs.
    import pymupdf4llm

    forum_id = item["forum_id"]

    # Prefer bytes shipped with the call. A path taken from a local manifest
    # does not exist inside the remote container, so reading it there would
    # fail; the path fallback is kept only for callers whose files really are
    # available to the container.
    pdf_bytes = item.get("pdf_bytes")
    if pdf_bytes is None:
        pdf_bytes = Path(item["pdf_path"]).read_bytes()

    # The converter is given a file name, so spill the bytes to a temp file.
    with tempfile.NamedTemporaryFile(suffix=".pdf", delete=True) as handle:
        handle.write(pdf_bytes)
        # Flush so the converter sees the full content when it opens the
        # file by name while this handle is still open.
        handle.flush()
        markdown = pymupdf4llm.to_markdown(handle.name)

    return {"forum_id": forum_id, "paper_markdown": markdown}


def _with_pdf_bytes(manifest):
    """Yield each manifest entry with its PDF content added as ``pdf_bytes``."""
    # A generator keeps only the PDFs currently being submitted in memory
    # instead of loading the whole batch up front.
    for entry in manifest:
        yield {**entry, "pdf_bytes": Path(entry["pdf_path"]).read_bytes()}


@app.local_entrypoint()
def extract_batch(input_manifest: str, output: str = "artifacts/pdf_markdown.json"):
    """Extract markdown for every PDF in a manifest and write one JSON file.

    The PDFs are read on the local machine and sent to the remote function as
    bytes, because the manifest's paths refer to the local filesystem.

    Args:
        input_manifest: Path to a JSON array of objects with ``forum_id`` and
            ``pdf_path`` keys.
        output: Path of the JSON file to write. Parent directories are
            created if needed.

    Raises:
        FileNotFoundError: If the manifest, or any PDF it lists, does not
            exist locally. PDFs are checked before any remote work starts.
    """
    manifest = json.loads(Path(input_manifest).read_text(encoding="utf-8"))

    # Fail fast, and locally, rather than part-way through a remote batch.
    missing = [
        entry["pdf_path"]
        for entry in manifest
        if not Path(entry["pdf_path"]).is_file()
    ]
    if missing:
        raise FileNotFoundError(
            f"PDFs listed in {input_manifest} were not found locally: {missing}"
        )

    # order_outputs=True keeps results aligned with the manifest order.
    results = list(
        extract_pdf_markdown.map(_with_pdf_bytes(manifest), order_outputs=True)
    )

    output_path = Path(output)
    output_path.parent.mkdir(parents=True, exist_ok=True)
    output_path.write_text(json.dumps(results, indent=2), encoding="utf-8")
    print(f"Wrote {output_path}")