"""OCR quality scorer backed by the FinePDFs ModernBERT classifier.

Wraps ``HuggingFaceFW/finepdfs_ocr_quality_classifier_eng_Latn`` — a
single-head regression fine-tune of ModernBERT-large (~0.4 B params)
that emits a float in ``[0, 3]`` where:

* 0 → garbage / unreadable OCR
* 1 → formatting issues but mostly readable
* 2 → minor problems
* 3 → clean text

The scorer takes raw extracted text (Markdown or plain), truncates it to
at most ``max_chars`` characters before tokenization, tokenizes with the
model's own tokenizer, runs one forward pass, and returns the scalar.

Heavy dependencies (``torch`` + ``transformers``) are imported lazily so
that merely importing :mod:`pdfsys_bench` does not pull them in.
"""
from __future__ import annotations

import math
from dataclasses import dataclass
from typing import Any
# Model identifier used when the caller does not name one; this is the
# FinePDFs OCR-quality classifier described in the module docstring.
DEFAULT_MODEL = "HuggingFaceFW/finepdfs_ocr_quality_classifier_eng_Latn"
# Default cap on the number of characters kept from the input text before
# it is handed to the tokenizer.
DEFAULT_MAX_CHARS = 10_000
# Upstream FinePDFs uses max_tokens=2048, but ModernBERT-large activations
# at that length need ≈ 3 GB of RAM — too much for a 4 GB dev box. 512
# tokens is enough to give a stable quality signal in practice and keeps
# peak memory well under a gig.
DEFAULT_MAX_TOKENS = 512
@dataclass(slots=True)
class QualityScore:
"""Result of scoring one document."""
score: float
num_chars: int
num_tokens: int
model: str
def as_record(self) -> dict[str, Any]:
return {
"quality_score": self.score,
"quality_num_chars": self.num_chars,
"quality_num_tokens": self.num_tokens,
"quality_model": self.model,
}
class OcrQualityScorer:
    """Lazy ModernBERT regression scorer. Re-uses model/tokenizer across calls.

    Construction is cheap: ``torch`` and ``transformers`` are imported, and
    the weights loaded, only on the first :meth:`score` call that actually
    needs the model.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        max_chars: Character cap applied to the input before tokenization.
        max_tokens: ``max_length`` handed to the tokenizer (truncation on).
        device: Torch device string. ``None`` selects ``"cuda"`` when
            available and ``"cpu"`` otherwise.
        dtype: Name of a ``torch`` dtype used for the weights. A name that
            does not resolve to a ``torch.dtype`` falls back to ``float32``.
    """

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        max_chars: int = DEFAULT_MAX_CHARS,
        max_tokens: int = DEFAULT_MAX_TOKENS,
        device: str | None = None,
        dtype: str = "bfloat16",
    ) -> None:
        self.model_name = model_name
        self.max_chars = max_chars
        self.max_tokens = max_tokens
        self._device_name = device
        self.dtype_name = dtype
        # Populated by _ensure_loaded() on first use.
        self._tokenizer: Any = None
        self._model: Any = None
        self._torch: Any = None
        self._device: Any = None

    def _ensure_loaded(self) -> None:
        """Import the heavy dependencies and load tokenizer + model once."""
        if self._model is not None:
            return

        import torch  # noqa: PLC0415 — lazy import is intentional
        from transformers import AutoModelForSequenceClassification, AutoTokenizer  # noqa: PLC0415

        self._torch = torch
        self._device = torch.device(
            self._device_name
            or ("cuda" if torch.cuda.is_available() else "cpu")
        )

        # Use bfloat16 on CPU to halve the model's memory footprint —
        # ModernBERT-large is ~0.4 B params, so fp32 weights alone take
        # ~1.6 GB and OOM a 4 GB-RAM dev box. bf16 inference is
        # numerically stable enough for a regression head like this.
        #
        # ``getattr`` alone would happily return any torch attribute
        # (``"cuda"``, ``"nn"``, ...), so make sure we really got a dtype
        # before handing it to ``from_pretrained``; otherwise keep the
        # float32 fallback used for unknown names.
        torch_dtype = getattr(torch, self.dtype_name, None)
        if not isinstance(torch_dtype, torch.dtype):
            torch_dtype = torch.float32

        self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)

        # ``dtype`` is the transformers≥5 name; ``torch_dtype`` was the
        # transformers<5 name. Pass ``dtype`` and fall back for older releases.
        try:
            model = AutoModelForSequenceClassification.from_pretrained(
                self.model_name,
                dtype=torch_dtype,
            )
        except TypeError:
            model = AutoModelForSequenceClassification.from_pretrained(
                self.model_name,
                torch_dtype=torch_dtype,
            )
        model.eval()
        model.to(self._device)
        # Assigned last so a failed load is retried on the next call.
        self._model = model

    @staticmethod
    def _clamp_score(raw: float) -> float:
        """Clamp ``raw`` to the documented ``[0, 3]`` range, keeping NaN as NaN.

        ``max(0.0, min(3.0, nan))`` evaluates to ``3.0``, which would report
        a numerically broken forward pass as perfectly clean text, so NaN is
        returned unchanged for the caller to detect.
        """
        if math.isnan(raw):
            return raw
        return max(0.0, min(3.0, raw))

    def score(self, text: str) -> QualityScore:
        """Score a single document.

        Args:
            text: Raw extracted text (Markdown or plain).

        Returns:
            A :class:`QualityScore`. Input that is empty or whitespace-only
            after clipping to ``max_chars`` yields a score of 0.0 with zero
            characters and tokens, without loading the model. Otherwise the
            score is the model output clamped to ``[0, 3]`` (NaN is passed
            through unclamped).
        """
        # Clip before the emptiness test: the model only ever sees the
        # clipped text, so a document whose first ``max_chars`` characters
        # are all whitespace must be treated as empty too.
        clipped = text[: self.max_chars] if text else ""
        if not clipped.strip():
            return QualityScore(
                score=0.0, num_chars=0, num_tokens=0, model=self.model_name
            )

        self._ensure_loaded()
        assert self._tokenizer is not None and self._model is not None
        torch = self._torch

        enc = self._tokenizer(
            clipped,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_tokens,
        )
        num_tokens = int(enc["input_ids"].shape[1])
        enc = {k: v.to(self._device) for k, v in enc.items()}

        with torch.inference_mode():
            out = self._model(**enc)
            logits = out.logits  # shape [1, 1] for regression
            raw = float(logits.squeeze().item())

        # Drop the forward-pass tensors eagerly so large-seq runs on CPU
        # don't hold onto activations between calls.
        del enc, out, logits

        return QualityScore(
            score=self._clamp_score(raw),
            num_chars=len(clipped),
            num_tokens=num_tokens,
            model=self.model_name,
        )

    def score_many(self, texts: list[str]) -> list[QualityScore]:
        """Serial scoring — tiny MVP harness, not a batched hot path."""
        return [self.score(t) for t in texts]
|