Spaces:

roger1024
/

DocPipe

Runtime error

DocPipe / packages /pdfsys-bench /src /pdfsys_bench /quality.py

yin

feat(mvp): wire router → mupdf parser → OCR quality scorer closed loop

d423504 about 1 month ago

5.24 kB

	"""OCR quality scorer backed by the FinePDFs ModernBERT classifier.

	Wraps ``HuggingFaceFW/finepdfs_ocr_quality_classifier_eng_Latn`` — a
	single-head regression fine-tune of ModernBERT-large (~0.4 B params)
	that emits a float in ``[0, 3]`` where:

	* 0 → garbage / unreadable OCR
	* 1 → formatting issues but mostly readable
	* 2 → minor problems
	* 3 → clean text

	The scorer takes raw extracted text (Markdown or plain), truncates to at
	most ``max_chars`` characters before tokenization, tokenizes with the
	model's own tokenizer, runs one forward pass, and returns the scalar.

	Heavy dependencies (``torch`` + ``transformers``) are imported lazily so
	that merely importing :mod:`pdfsys_bench` does not pull them in.
	"""

	from __future__ import annotations

	import math
	from dataclasses import dataclass
	from typing import Any

	# Hugging Face model id the scorer loads when no ``model_name`` is given.
	DEFAULT_MODEL = "HuggingFaceFW/finepdfs_ocr_quality_classifier_eng_Latn"
	# Default character budget: input text is sliced to at most this many
	# characters before it is handed to the tokenizer.
	DEFAULT_MAX_CHARS = 10_000
	# Upstream FinePDFs uses max_tokens=2048, but ModernBERT-large activations
	# at that length need ≈ 3 GB of RAM — too much for a 4 GB dev box. 512
	# tokens is enough to give a stable quality signal in practice and keeps
	# peak memory well under a gig.
	DEFAULT_MAX_TOKENS = 512


	@dataclass(slots=True)
	class QualityScore:
	"""Result of scoring one document."""

	score: float
	num_chars: int
	num_tokens: int
	model: str

	def as_record(self) -> dict[str, Any]:
	return {
	"quality_score": self.score,
	"quality_num_chars": self.num_chars,
	"quality_num_tokens": self.num_tokens,
	"quality_model": self.model,
	}


	class OcrQualityScorer:
	    """Lazy ModernBERT regression scorer. Re-uses model/tokenizer across calls.

	    Constructing the scorer only stores configuration; ``torch`` and
	    ``transformers`` are imported and the weights loaded on the first
	    non-empty :meth:`score` call.

	    Args:
	        model_name: Hugging Face model id passed to ``from_pretrained``.
	        max_chars: Input text is sliced to this many characters before
	            tokenization.
	        max_tokens: Tokenizer ``max_length`` (truncation is enabled).
	        device: Torch device string. When ``None``, CUDA is used if it is
	            available, otherwise CPU.
	        dtype: Name of a ``torch`` dtype attribute (e.g. ``"bfloat16"``).
	            A name that does not resolve to a ``torch.dtype`` falls back
	            to ``float32``.
	    """

	    def __init__(
	        self,
	        model_name: str = DEFAULT_MODEL,
	        max_chars: int = DEFAULT_MAX_CHARS,
	        max_tokens: int = DEFAULT_MAX_TOKENS,
	        device: str | None = None,
	        dtype: str = "bfloat16",
	    ) -> None:
	        self.model_name = model_name
	        self.max_chars = max_chars
	        self.max_tokens = max_tokens
	        self._device_name = device
	        self.dtype_name = dtype
	        # Populated by ``_ensure_loaded`` on first use.
	        self._tokenizer: Any = None
	        self._model: Any = None
	        self._torch: Any = None
	        self._device: Any = None

	    @staticmethod
	    def _clamp_score(raw: float) -> float:
	        """Clamp a raw model output to the documented ``[0, 3]`` range.

	        NaN is mapped to ``0.0``: a plain ``max(0.0, min(3.0, nan))``
	        evaluates to ``3.0`` because every comparison against NaN is
	        false, which would report a failed forward pass as clean text.
	        """
	        if math.isnan(raw):
	            return 0.0
	        return max(0.0, min(3.0, raw))

	    def _ensure_loaded(self) -> None:
	        """Import the heavy dependencies and load tokenizer + model once."""
	        if self._model is not None:
	            return
	        import torch  # noqa: PLC0415 — lazy import is intentional
	        from transformers import (  # noqa: PLC0415
	            AutoModelForSequenceClassification,
	            AutoTokenizer,
	        )

	        self._torch = torch
	        self._device = torch.device(
	            self._device_name
	            or ("cuda" if torch.cuda.is_available() else "cpu")
	        )
	        # Use bfloat16 on CPU to halve the model's memory footprint —
	        # ModernBERT-large is ~0.4 B params, so fp32 weights alone take
	        # ~1.6 GB and OOM a 4 GB-RAM dev box. bf16 inference is
	        # numerically stable enough for a regression head like this.
	        torch_dtype = getattr(torch, self.dtype_name, None)
	        # ``getattr`` can resolve to a non-dtype attribute (e.g. "nn" is a
	        # module); only accept real dtypes and otherwise fall back to fp32.
	        if not isinstance(torch_dtype, torch.dtype):
	            torch_dtype = torch.float32

	        self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
	        # ``dtype`` is the transformers≥5 name; ``torch_dtype`` was the
	        # transformers<5 name. Pass ``dtype`` and fall back for older releases.
	        try:
	            model = AutoModelForSequenceClassification.from_pretrained(
	                self.model_name,
	                dtype=torch_dtype,
	            )
	        except TypeError:
	            model = AutoModelForSequenceClassification.from_pretrained(
	                self.model_name,
	                torch_dtype=torch_dtype,
	            )
	        model.eval()
	        model.to(self._device)
	        # Assigned last: ``_model`` is the "already loaded" flag checked above.
	        self._model = model

	    def score(self, text: str) -> QualityScore:
	        """Score a single document. Empty input returns 0.0.

	        Empty or whitespace-only ``text`` short-circuits without loading
	        the model. Otherwise the text is sliced to ``max_chars``,
	        tokenized with truncation at ``max_tokens``, and run through one
	        forward pass.

	        Returns:
	            A ``QualityScore`` whose ``score`` is clamped to ``[0, 3]``
	            (NaN model output becomes ``0.0``), with ``num_chars`` the
	            length of the sliced text and ``num_tokens`` the tokenized
	            sequence length.
	        """
	        if not text or not text.strip():
	            return QualityScore(
	                score=0.0, num_chars=0, num_tokens=0, model=self.model_name
	            )

	        self._ensure_loaded()
	        assert self._tokenizer is not None and self._model is not None
	        torch = self._torch

	        clipped = text[: self.max_chars]
	        enc = self._tokenizer(
	            clipped,
	            return_tensors="pt",
	            truncation=True,
	            max_length=self.max_tokens,
	        )
	        num_tokens = int(enc["input_ids"].shape[1])
	        enc = {k: v.to(self._device) for k, v in enc.items()}

	        with torch.inference_mode():
	            out = self._model(**enc)
	            logits = out.logits  # shape [1, 1] for regression
	            raw = float(logits.squeeze().item())
	            # Drop the forward-pass tensors eagerly so large-seq runs on CPU
	            # don't hold onto activations between calls.
	            del enc, out, logits

	        return QualityScore(
	            score=self._clamp_score(raw),
	            num_chars=len(clipped),
	            num_tokens=num_tokens,
	            model=self.model_name,
	        )

	    def score_many(self, texts: list[str]) -> list[QualityScore]:
	        """Serial scoring — tiny MVP harness, not a batched hot path."""
	        return [self.score(t) for t in texts]