File size: 5,494 Bytes
"""PyMuPDF-based text extraction for the mupdf (text-ok) backend.

This is the simplest of the three parser backends. It assumes the PDF
already has a clean text layer and just needs unwrapping into Markdown —
which is why the router routes here only when the XGBoost classifier says
``ocr_prob < threshold``.

We use ``page.get_text("blocks")`` which returns paragraph-shaped blocks
with coordinates already in reading order (PyMuPDF's internal sorting).
Each block becomes one :class:`pdfsys_core.Segment` of type
:attr:`pdfsys_core.RegionType.TEXT`, with its bbox normalized to ``[0, 1]``.
Empty and image-only blocks are dropped.

No layout-model dependency, no GPU, no OCR — this is the text-ok fast
path, and stays that way.
"""
from __future__ import annotations
import hashlib
import io
from pathlib import Path
from typing import Any
import pymupdf
from pdfsys_core import (
Backend,
BBox,
ExtractedDoc,
RegionType,
Segment,
merge_segments_to_markdown,
)
# PyMuPDF block tuple layout: (x0, y0, x1, y1, text, block_no, block_type).
# block_type 0 = text, 1 = image.
_TEXT_BLOCK_TYPE = 0
def _sha256_of_file(path: Path) -> str:
    """Return the hex SHA-256 digest of the file at *path*.

    The file is streamed in 1 MiB pieces so large PDFs are never loaded
    into memory in one go.
    """
    digest = hashlib.sha256()
    with path.open("rb") as stream:
        # ``read`` returns an empty bytes object at EOF, which ends the loop.
        while piece := stream.read(1 << 20):
            digest.update(piece)
    return digest.hexdigest()
def _sha256_of_bytes(data: bytes) -> str:
    """Return the hex SHA-256 digest of an in-memory byte buffer."""
    digest = hashlib.sha256()
    digest.update(data)
    return digest.hexdigest()
def _normalize_text(text: str) -> str:
    """Flatten a PyMuPDF block's text into Markdown-ready paragraphs.

    Within each paragraph every run of whitespace (including the single
    newlines PyMuPDF puts between wrapped lines) collapses to one space.
    Paragraphs are the pieces separated by a double newline; pieces that
    contain only whitespace are dropped and the survivors are re-joined
    with a double newline. Empty input yields an empty string.
    """
    if not text:
        return ""

    kept: list[str] = []
    for piece in text.split("\n\n"):
        # ``split()`` with no argument both trims the piece and breaks it on
        # any whitespace run, so an all-whitespace piece gives an empty list.
        words = piece.split()
        if words:
            kept.append(" ".join(words))
    return "\n\n".join(kept)
def _block_bbox(
    block: tuple[Any, ...],
    page_width_pt: float,
    page_height_pt: float,
) -> BBox | None:
    """Convert a PyMuPDF block's corner coordinates into a page-relative BBox.

    The first four items of *block* are read as ``x0, y0, x1, y1``. Each
    coordinate is divided by the matching page dimension and clamped into
    ``[0, 1]``.

    Returns ``None`` when either page dimension is not positive, when the
    clamped box has no positive width or height, or when ``BBox`` rejects
    the values with ``ValueError``.
    """
    left, top, right, bottom = (block[i] for i in range(4))

    if page_width_pt <= 0 or page_height_pt <= 0:
        return None

    def unit(value: float) -> float:
        # Comparison-based clamp: values already inside [0, 1] pass through
        # unchanged.
        return 0.0 if value < 0.0 else (1.0 if value > 1.0 else value)

    rel_left = unit(left / page_width_pt)
    rel_top = unit(top / page_height_pt)
    rel_right = unit(right / page_width_pt)
    rel_bottom = unit(bottom / page_height_pt)

    # A box that collapses (or inverts) after clamping carries no area.
    if rel_right <= rel_left or rel_bottom <= rel_top:
        return None

    try:
        normalized = BBox(x0=rel_left, y0=rel_top, x1=rel_right, y1=rel_bottom)
    except ValueError:
        return None
    return normalized
def extract_doc(pdf_path: str | Path) -> ExtractedDoc:
    """Extract a PDF stored on disk with the mupdf backend.

    The file is hashed with SHA-256 first, then opened with PyMuPDF and
    passed to ``_extract``. The document is closed on the way out whether
    or not extraction succeeds.
    """
    source = Path(pdf_path)
    digest = _sha256_of_file(source)
    # The document's context manager closes it on exit, as the explicit
    # try/finally did.
    with pymupdf.open(str(source)) as document:
        return _extract(document, digest)
def extract_doc_bytes(pdf_bytes: bytes, sha256: str | None = None) -> ExtractedDoc:
    """Extract a PDF held in memory with the mupdf backend.

    If *sha256* is ``None`` or empty, the digest is computed from
    *pdf_bytes*; otherwise the caller-supplied value is used as-is. The
    buffer is opened as a PDF stream and the document is closed on the way
    out whether or not extraction succeeds.
    """
    digest = sha256 if sha256 else _sha256_of_bytes(pdf_bytes)
    with pymupdf.open(stream=io.BytesIO(pdf_bytes), filetype="pdf") as document:
        return _extract(document, digest)
def _extract(doc: pymupdf.Document, sha256: str) -> ExtractedDoc:
    """Turn an open PyMuPDF document into an ``ExtractedDoc``.

    Each page is asked for its ``"blocks"`` text. A page whose
    ``get_text`` call raises is counted as skipped and contributes
    nothing. For the remaining pages, every text block with non-empty
    normalized content becomes one TEXT ``Segment``; short tuples, non-text
    blocks and empty blocks are ignored. Segment indices run consecutively
    across the whole document. The segments are merged into Markdown and
    returned together with simple counters in ``stats``.

    The caller owns *doc* and is responsible for closing it.
    """
    collected: list[Segment] = []
    ok_pages = 0
    failed_pages = 0

    for page_no, page in enumerate(doc):
        width_pt = float(page.rect.width)
        height_pt = float(page.rect.height)

        try:
            raw_blocks = page.get_text(
                "blocks",
                flags=pymupdf.TEXT_PRESERVE_WHITESPACE | pymupdf.TEXT_MEDIABOX_CLIP,
                sort=True,
            )
        except Exception:
            # Best-effort: a page that cannot be read is recorded in the
            # stats and the rest of the document is still processed.
            failed_pages += 1
            continue
        ok_pages += 1

        for raw in raw_blocks:
            # Expected layout: (x0, y0, x1, y1, text, block_no, block_type).
            # Anything shorter, or any non-text block (images), is skipped;
            # this backend does not emit IMAGE segments by design.
            is_text_block = len(raw) >= 7 and raw[6] == _TEXT_BLOCK_TYPE
            if not is_text_block:
                continue

            content = _normalize_text(raw[4] or "")
            if not content:
                continue

            # May be None when the block has no usable area on the page.
            region = _block_bbox(raw, width_pt, height_pt)
            segment = Segment(
                index=len(collected),
                backend=Backend.MUPDF,
                page_index=page_no,
                type=RegionType.TEXT,
                content=content,
                bbox=region,
                source_region_id=None,
            )
            collected.append(segment)

    frozen = tuple(collected)
    markdown = merge_segments_to_markdown(frozen)
    stats: dict[str, Any] = {
        "page_count": len(doc),
        "pages_extracted": ok_pages,
        "pages_skipped": failed_pages,
        "segment_count": len(frozen),
        "char_count": len(markdown),
    }
    return ExtractedDoc(
        sha256=sha256,
        backend=Backend.MUPDF,
        segments=frozen,
        markdown=markdown,
        stats=stats,
    )
|