Spaces:

roger1024
/

DocPipe

Runtime error

File size: 5,494 Bytes

d423504

"""PyMuPDF-based text extraction for the mupdf (text-ok) backend.

This is the simplest of the three parser backends. It assumes the PDF
already has a clean text layer and just needs unwrapping into Markdown —
which is why the router routes here only when the XGBoost classifier says
``ocr_prob < threshold``.

We use ``page.get_text("blocks", sort=True)``, which returns paragraph-shaped
blocks with coordinates; ``sort=True`` asks PyMuPDF to order them
top-to-bottom, then left-to-right, as an approximation of reading order.
Each block becomes one :class:`pdfsys_core.Segment` of type
:attr:`pdfsys_core.RegionType.TEXT`, with its bbox normalized to ``[0, 1]``.
Empty and image-only blocks are dropped.

No layout-model dependency, no GPU, no OCR — this is the text-ok fast
path, and stays that way.
"""

from __future__ import annotations

import hashlib
import io
from pathlib import Path
from typing import Any

import pymupdf

from pdfsys_core import (
    Backend,
    BBox,
    ExtractedDoc,
    RegionType,
    Segment,
    merge_segments_to_markdown,
)


# Layout of each tuple returned by ``page.get_text("blocks")``:
#   (x0, y0, x1, y1, text, block_no, block_type)
# ``block_type`` is 0 for text blocks and 1 for image blocks. Only blocks
# whose type equals this constant are turned into segments.
_TEXT_BLOCK_TYPE = 0


def _sha256_of_file(path: Path) -> str:
    """Return the hexadecimal SHA-256 digest of the file at ``path``.

    The file is read in 1 MiB pieces and fed to the hasher incrementally,
    so the whole file is never held in memory at once.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # An empty read marks end-of-file and ends the loop.
        while piece := stream.read(1 << 20):
            digest.update(piece)
    return digest.hexdigest()


def _sha256_of_bytes(data: bytes) -> str:
    """Return the hexadecimal SHA-256 digest of an in-memory byte string."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()


def _normalize_text(text: str) -> str:
    """Collapse whitespace inside a block while keeping blank-line breaks.

    The input is cut at every ``"\\n\\n"``. Within each piece, every run of
    whitespace (single newlines included) becomes one space and the ends are
    trimmed. Pieces that end up empty are discarded; the rest are joined
    back together with ``"\\n\\n"``.

    Returns the empty string for falsy input or input that is only
    whitespace.
    """
    if not text:
        return ""
    kept: list[str] = []
    for chunk in text.split("\n\n"):
        # split() with no argument drops leading/trailing whitespace and
        # treats any whitespace run as a single separator.
        flattened = " ".join(chunk.split())
        if flattened:
            kept.append(flattened)
    return "\n\n".join(kept)


def _block_bbox(
    block: tuple[Any, ...],
    page_width_pt: float,
    page_height_pt: float,
) -> BBox | None:
    """Convert a block's corner coordinates into page-relative fractions.

    The first four items of ``block`` are read as ``x0, y0, x1, y1``. Each
    coordinate is divided by the matching page dimension and then pinned
    into ``[0, 1]``.

    Returns ``None`` when either page dimension is not positive, when the
    clamped box has no width or no height (``x1 <= x0`` or ``y1 <= y0``),
    or when ``BBox`` rejects the values with ``ValueError``.
    """
    left, top = block[0], block[1]
    right, bottom = block[2], block[3]
    if page_width_pt <= 0 or page_height_pt <= 0:
        return None

    def to_unit(value: float) -> float:
        # Pin a fraction into the closed interval [0, 1].
        return 0.0 if value < 0.0 else (1.0 if value > 1.0 else value)

    frac_left = to_unit(left / page_width_pt)
    frac_top = to_unit(top / page_height_pt)
    frac_right = to_unit(right / page_width_pt)
    frac_bottom = to_unit(bottom / page_height_pt)

    # After clamping, a block lying wholly outside the page collapses to a
    # zero-width or zero-height box and is rejected here.
    if frac_right <= frac_left or frac_bottom <= frac_top:
        return None

    try:
        box = BBox(x0=frac_left, y0=frac_top, x1=frac_right, y1=frac_bottom)
    except ValueError:
        return None
    return box


def extract_doc(pdf_path: str | Path) -> ExtractedDoc:
    """Extract a PDF on disk with the mupdf backend.

    The file is hashed with SHA-256 before PyMuPDF opens it. The document
    is closed again whether extraction succeeds or raises.
    """
    source = Path(pdf_path)
    digest = _sha256_of_file(source)
    # The document's context manager closes it on exit.
    with pymupdf.open(str(source)) as document:
        return _extract(document, digest)


def extract_doc_bytes(pdf_bytes: bytes, sha256: str | None = None) -> ExtractedDoc:
    """Extract a PDF held in memory with the mupdf backend.

    ``sha256`` is used as the document hash when it is truthy. Otherwise
    (``None`` or an empty string) the hash is computed from ``pdf_bytes``.
    The document is closed again whether extraction succeeds or raises.
    """
    digest = sha256 if sha256 else _sha256_of_bytes(pdf_bytes)
    buffer = io.BytesIO(pdf_bytes)
    # The document's context manager closes it on exit.
    with pymupdf.open(stream=buffer, filetype="pdf") as document:
        return _extract(document, digest)


def _extract(doc: pymupdf.Document, sha256: str) -> ExtractedDoc:
    """Turn every text block of an open document into a TEXT segment.

    Each page is queried with ``get_text("blocks", sort=True)``. A page
    whose query raises is counted as skipped and contributes no segments.
    Blocks with fewer than seven fields, non-text blocks, and blocks whose
    normalized text is empty are ignored. Every remaining block becomes one
    ``Segment``, numbered consecutively across the whole document, with the
    bbox returned by ``_block_bbox`` (which may be ``None``).

    The returned ``ExtractedDoc`` carries the segments, the Markdown built
    from them by ``merge_segments_to_markdown``, and a ``stats`` dict with
    page, segment and character counts.
    """
    collected: list[Segment] = []
    ok_pages = 0
    failed_pages = 0

    for page_no, page in enumerate(doc):
        width_pt = float(page.rect.width)
        height_pt = float(page.rect.height)

        try:
            raw_blocks = page.get_text(
                "blocks",
                flags=pymupdf.TEXT_PRESERVE_WHITESPACE | pymupdf.TEXT_MEDIABOX_CLIP,
                sort=True,
            )
        except Exception:
            # Best effort: a page that cannot be read is only counted, so
            # the rest of the document is still extracted.
            failed_pages += 1
            continue
        else:
            ok_pages += 1

        for raw in raw_blocks:
            # Expected shape: (x0, y0, x1, y1, text, block_no, block_type).
            # Short tuples and non-text (image) blocks are passed over; this
            # backend emits TEXT segments only.
            if len(raw) < 7 or raw[6] != _TEXT_BLOCK_TYPE:
                continue
            content = _normalize_text(raw[4] or "")
            if not content:
                continue
            region = _block_bbox(raw, width_pt, height_pt)
            segment = Segment(
                index=len(collected),
                backend=Backend.MUPDF,
                page_index=page_no,
                type=RegionType.TEXT,
                content=content,
                bbox=region,
                source_region_id=None,
            )
            collected.append(segment)

    frozen = tuple(collected)
    rendered = merge_segments_to_markdown(frozen)

    summary: dict[str, Any] = {
        "page_count": len(doc),
        "pages_extracted": ok_pages,
        "pages_skipped": failed_pages,
        "segment_count": len(frozen),
        "char_count": len(rendered),
    }

    return ExtractedDoc(
        sha256=sha256,
        backend=Backend.MUPDF,
        segments=frozen,
        markdown=rendered,
        stats=summary,
    )