| """PyMuPDF-based text extraction for the mupdf (text-ok) backend. |
| |
| This is the simplest of the three parser backends. It assumes the PDF |
| already has a clean text layer and just needs unwrapping into Markdown — |
| which is why the router routes here only when the XGBoost classifier says |
| ``ocr_prob < threshold``. |
| |
| We use ``page.get_text("blocks")`` which returns paragraph-shaped blocks |
| with coordinates already in reading order (PyMuPDF's internal sorting). |
| Each block becomes one :class:`pdfsys_core.Segment` of type |
| :attr:`pdfsys_core.RegionType.TEXT`, with its bbox normalized to ``[0, 1]``. |
| Empty and image-only blocks are dropped. |
| |
| No layout-model dependency, no GPU, no OCR — this is the text-ok fast |
| path, and stays that way. |
| """ |
|
|
| from __future__ import annotations |
|
|
| import hashlib |
| import io |
| from pathlib import Path |
| from typing import Any |
|
|
| import pymupdf |
|
|
| from pdfsys_core import ( |
| Backend, |
| BBox, |
| ExtractedDoc, |
| RegionType, |
| Segment, |
| merge_segments_to_markdown, |
| ) |
|
|
|
|
| |
| |
# Value expected in field 6 (the block-type field) of a
# ``page.get_text("blocks")`` tuple for a text block. ``_extract`` skips any
# block whose field 6 differs from this, which is how image-only blocks
# (see the module docstring) get dropped.
_TEXT_BLOCK_TYPE = 0
|
|
|
|
| def _sha256_of_file(path: Path) -> str: |
| h = hashlib.sha256() |
| with path.open("rb") as f: |
| for chunk in iter(lambda: f.read(1 << 20), b""): |
| h.update(chunk) |
| return h.hexdigest() |
|
|
|
|
| def _sha256_of_bytes(data: bytes) -> str: |
| return hashlib.sha256(data).hexdigest() |
|
|
|
|
| def _normalize_text(text: str) -> str: |
| """Trim trailing whitespace and collapse PyMuPDF's soft linebreaks. |
| |
| PyMuPDF returns block text with intra-paragraph newlines. For Markdown |
| emission we keep paragraphs on one line; actual paragraph breaks come |
| from the block boundaries themselves. |
| """ |
| if not text: |
| return "" |
| |
| |
| paragraphs = [p.strip() for p in text.split("\n\n")] |
| joined = "\n\n".join(" ".join(p.split()) for p in paragraphs if p.strip()) |
| return joined.strip() |
|
|
|
|
| def _block_bbox( |
| block: tuple[Any, ...], |
| page_width_pt: float, |
| page_height_pt: float, |
| ) -> BBox | None: |
| """Normalize a PyMuPDF block bbox to ``[0, 1]`` or return None on overflow.""" |
| x0, y0, x1, y1 = block[0], block[1], block[2], block[3] |
| if page_width_pt <= 0 or page_height_pt <= 0: |
| return None |
|
|
| def clamp(v: float) -> float: |
| if v < 0.0: |
| return 0.0 |
| if v > 1.0: |
| return 1.0 |
| return v |
|
|
| nx0 = clamp(x0 / page_width_pt) |
| ny0 = clamp(y0 / page_height_pt) |
| nx1 = clamp(x1 / page_width_pt) |
| ny1 = clamp(y1 / page_height_pt) |
| if nx1 <= nx0 or ny1 <= ny0: |
| return None |
| try: |
| return BBox(x0=nx0, y0=ny0, x1=nx1, y1=ny1) |
| except ValueError: |
| return None |
|
|
|
|
def extract_doc(pdf_path: str | Path) -> ExtractedDoc:
    """Run the mupdf backend on a single PDF file and return its ExtractedDoc.

    The file's SHA-256 is computed first, then the file is opened with
    PyMuPDF. The document is closed again when extraction returns or raises.
    """
    source = Path(pdf_path)
    digest = _sha256_of_file(source)
    with pymupdf.open(str(source)) as document:
        return _extract(document, digest)
|
|
|
|
def extract_doc_bytes(pdf_bytes: bytes, sha256: str | None = None) -> ExtractedDoc:
    """Run the mupdf backend on an in-memory PDF buffer.

    ``sha256`` may carry a digest the caller already knows. When it is
    ``None`` or empty, the digest is computed from ``pdf_bytes``. The
    PyMuPDF document is closed when extraction returns or raises.
    """
    digest = sha256 if sha256 else _sha256_of_bytes(pdf_bytes)
    buffer = io.BytesIO(pdf_bytes)
    with pymupdf.open(stream=buffer, filetype="pdf") as document:
        return _extract(document, digest)
|
|
|
|
def _extract(doc: pymupdf.Document, sha256: str) -> ExtractedDoc:
    """Turn an open PyMuPDF document into an ``ExtractedDoc``.

    Each page is asked for its ``"blocks"`` text. A page whose ``get_text``
    call raises is counted as skipped and contributes no segments. Within a
    page, a block is kept only if its tuple has at least 7 items, its field 6
    equals ``_TEXT_BLOCK_TYPE``, and its text (field 4) is non-empty after
    ``_normalize_text``. Every kept block becomes one ``RegionType.TEXT``
    segment; its ``bbox`` is whatever ``_block_bbox`` returns, which may be
    ``None``.

    The caller owns ``doc``; this function does not close it.
    """
    collected: list[Segment] = []
    ok_pages = 0
    failed_pages = 0

    for page_no, page in enumerate(doc):
        width_pt = float(page.rect.width)
        height_pt = float(page.rect.height)

        try:
            # The flag lookup sits inside the try together with the call, so
            # any failure here marks the page as skipped instead of aborting.
            raw_blocks = page.get_text(
                "blocks",
                flags=pymupdf.TEXT_PRESERVE_WHITESPACE | pymupdf.TEXT_MEDIABOX_CLIP,
                sort=True,
            )
        except Exception:
            failed_pages += 1
        else:
            ok_pages += 1
            for raw in raw_blocks:
                # Skip tuples too short to carry a block-type field, and
                # blocks that are not text.
                if len(raw) < 7 or raw[6] != _TEXT_BLOCK_TYPE:
                    continue
                content = _normalize_text(raw[4] or "")
                if content:
                    region = _block_bbox(raw, width_pt, height_pt)
                    collected.append(
                        Segment(
                            # Index is the segment's position in document order.
                            index=len(collected),
                            backend=Backend.MUPDF,
                            page_index=page_no,
                            type=RegionType.TEXT,
                            content=content,
                            bbox=region,
                            source_region_id=None,
                        )
                    )

    frozen = tuple(collected)
    markdown = merge_segments_to_markdown(frozen)

    return ExtractedDoc(
        sha256=sha256,
        backend=Backend.MUPDF,
        segments=frozen,
        markdown=markdown,
        stats={
            "page_count": len(doc),
            "pages_extracted": ok_pages,
            "pages_skipped": failed_pages,
            "segment_count": len(frozen),
            "char_count": len(markdown),
        },
    )
|
|