Spaces:

roger1024
/

DocPipe

Sleeping

DocPipe / packages /pdfsys-parser-mupdf /src /pdfsys_parser_mupdf /extract.py

yin

feat(mvp): wire router → mupdf parser → OCR quality scorer closed loop

d423504 about 1 month ago

5.49 kB

	"""PyMuPDF-based text extraction for the mupdf (text-ok) backend.

	This is the simplest of the three parser backends. It assumes the PDF
	already has a clean text layer and just needs unwrapping into Markdown —
	which is why the router routes here only when the XGBoost classifier says
	``ocr_prob < threshold``.

	We use ``page.get_text("blocks")`` which returns paragraph-shaped blocks
	with coordinates already in reading order (PyMuPDF's internal sorting).
	Each block becomes one :class:`pdfsys_core.Segment` of type
	:attr:`pdfsys_core.RegionType.TEXT`, with its bbox normalized to ``[0, 1]``.
	Empty and image-only blocks are dropped.

	No layout-model dependency, no GPU, no OCR — this is the text-ok fast
	path, and stays that way.
	"""

	from __future__ import annotations

	import hashlib
	import io
	from pathlib import Path
	from typing import Any

	import pymupdf

	from pdfsys_core import (
	Backend,
	BBox,
	ExtractedDoc,
	RegionType,
	Segment,
	merge_segments_to_markdown,
	)


	# PyMuPDF block tuple layout: (x0, y0, x1, y1, text, block_no, block_type).
	# block_type 0 = text, 1 = image. ``_extract`` compares ``block[6]`` against
	# this constant and skips every block that does not match.
	_TEXT_BLOCK_TYPE = 0


	def _sha256_of_file(path: Path) -> str:
	    """Return the hex-encoded SHA-256 digest of the file at ``path``.

	    The file is read in 1 MiB chunks so it is never held in memory in
	    one piece.
	    """
	    digest = hashlib.sha256()
	    with path.open("rb") as stream:
	        # read() yields b"" at end of file, which ends the loop.
	        while chunk := stream.read(1 << 20):
	            digest.update(chunk)
	    return digest.hexdigest()


	def _sha256_of_bytes(data: bytes) -> str:
	    """Return the hex-encoded SHA-256 digest of an in-memory buffer."""
	    digest = hashlib.sha256()
	    digest.update(data)
	    return digest.hexdigest()


	def _normalize_text(text: str) -> str:
	    """Flatten one PyMuPDF block's text into Markdown-ready paragraphs.

	    The input is split on double newlines. Within each piece every run
	    of whitespace (including single newlines) collapses to one space and
	    the ends are trimmed. Pieces that end up empty are dropped and the
	    survivors are re-joined with a double newline. Empty input yields
	    an empty string.
	    """
	    if not text:
	        return ""
	    kept = []
	    for piece in text.split("\n\n"):
	        # split() with no argument both trims the piece and collapses
	        # interior whitespace, so an all-whitespace piece yields no words.
	        words = piece.split()
	        if words:
	            kept.append(" ".join(words))
	    return "\n\n".join(kept)


	def _block_bbox(
	    block: tuple[Any, ...],
	    page_width_pt: float,
	    page_height_pt: float,
	) -> BBox | None:
	    """Convert a block's point coordinates into a page-relative ``BBox``.

	    The first four items of ``block`` are read as ``x0, y0, x1, y1``.
	    Each is divided by the matching page dimension and clamped into
	    ``[0, 1]``.

	    Returns None when either page dimension is not positive, when the
	    clamped box has no width or no height, or when ``BBox`` rejects the
	    values with ``ValueError``.
	    """
	    left_pt, top_pt, right_pt, bottom_pt = block[0], block[1], block[2], block[3]
	    if page_width_pt <= 0 or page_height_pt <= 0:
	        return None

	    def unit(fraction: float) -> float:
	        # Pin values that fall outside the page to the nearest edge.
	        return 0.0 if fraction < 0.0 else 1.0 if fraction > 1.0 else fraction

	    left = unit(left_pt / page_width_pt)
	    top = unit(top_pt / page_height_pt)
	    right = unit(right_pt / page_width_pt)
	    bottom = unit(bottom_pt / page_height_pt)

	    # After clamping, a box lying entirely off-page collapses to zero area.
	    if right <= left or bottom <= top:
	        return None
	    try:
	        return BBox(x0=left, y0=top, x1=right, y1=bottom)
	    except ValueError:
	        return None


	def extract_doc(pdf_path: str | Path) -> ExtractedDoc:
	    """Extract a PDF file on disk with the mupdf backend.

	    The file's SHA-256 is computed before the document is opened and is
	    passed on to the extraction step. The document handle is closed
	    before returning, whether or not extraction raises.
	    """
	    source = Path(pdf_path)
	    digest = _sha256_of_file(source)
	    # The context manager closes the document on exit.
	    with pymupdf.open(str(source)) as doc:
	        return _extract(doc, digest)


	def extract_doc_bytes(pdf_bytes: bytes, sha256: str | None = None) -> ExtractedDoc:
	    """Extract an in-memory PDF buffer with the mupdf backend.

	    If ``sha256`` is None or empty, the digest is computed from
	    ``pdf_bytes``; otherwise the supplied value is used as-is. The
	    document handle is closed before returning, whether or not
	    extraction raises.
	    """
	    digest = sha256 if sha256 else _sha256_of_bytes(pdf_bytes)
	    buffer = io.BytesIO(pdf_bytes)
	    # The context manager closes the document on exit.
	    with pymupdf.open(stream=buffer, filetype="pdf") as doc:
	        return _extract(doc, digest)


	def _extract(doc: pymupdf.Document, sha256: str) -> ExtractedDoc:
	    """Collect the text blocks of every page in ``doc`` into an ExtractedDoc.

	    A page whose block extraction raises is counted as skipped and
	    contributes no segments. Within a page, blocks with fewer than seven
	    fields, non-text blocks, and blocks whose normalized text is empty
	    are dropped. Each remaining block becomes one TEXT segment; its bbox
	    is whatever ``_block_bbox`` returns, which may be None.

	    The segments are merged into Markdown and returned together with
	    page, segment and character counts in ``stats``.
	    """
	    collected: list[Segment] = []
	    extracted_pages = 0
	    skipped_pages = 0

	    for page_no, page in enumerate(doc):
	        width_pt = float(page.rect.width)
	        height_pt = float(page.rect.height)

	        try:
	            page_blocks = page.get_text(
	                "blocks",
	                flags=pymupdf.TEXT_PRESERVE_WHITESPACE | pymupdf.TEXT_MEDIABOX_CLIP,
	                sort=True,
	            )
	        except Exception:
	            # Best effort: record the failed page and carry on with the rest.
	            skipped_pages += 1
	            continue
	        extracted_pages += 1

	        for entry in page_blocks:
	            # Expected shape: (x0, y0, x1, y1, text, block_no, block_type).
	            # Image blocks are skipped on purpose; this backend emits no
	            # IMAGE segments.
	            if len(entry) < 7 or entry[6] != _TEXT_BLOCK_TYPE:
	                continue
	            body = _normalize_text(entry[4] or "")
	            if not body:
	                continue
	            collected.append(
	                Segment(
	                    index=len(collected),
	                    backend=Backend.MUPDF,
	                    page_index=page_no,
	                    type=RegionType.TEXT,
	                    content=body,
	                    bbox=_block_bbox(entry, width_pt, height_pt),
	                    source_region_id=None,
	                )
	            )

	    frozen = tuple(collected)
	    markdown = merge_segments_to_markdown(frozen)

	    return ExtractedDoc(
	        sha256=sha256,
	        backend=Backend.MUPDF,
	        segments=frozen,
	        markdown=markdown,
	        stats={
	            "page_count": len(doc),
	            "pages_extracted": extracted_pages,
	            "pages_skipped": skipped_pages,
	            "segment_count": len(frozen),
	            "char_count": len(markdown),
	        },
	    )