File size: 5,241 Bytes
d423504
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
"""OCR quality scorer backed by the FinePDFs ModernBERT classifier.

Wraps ``HuggingFaceFW/finepdfs_ocr_quality_classifier_eng_Latn`` — a
single-head regression fine-tune of ModernBERT-large (~0.4 B params)
that emits a float in ``[0, 3]`` where:

* 0 → garbage / unreadable OCR
* 1 → formatting issues but mostly readable
* 2 → minor problems
* 3 → clean text

The scorer takes raw extracted text (Markdown or plain), truncates to at
most ``max_chars`` characters before tokenization, tokenizes with the
model's own tokenizer, runs one forward pass, and returns the scalar.

Heavy dependencies (``torch`` + ``transformers``) are imported lazily so
that merely importing :mod:`pdfsys_bench` does not pull them in.
"""

from __future__ import annotations

import math
from dataclasses import dataclass
from typing import Any

# Checkpoint identifier used when the scorer is built without an explicit
# ``model_name``.
DEFAULT_MODEL = "HuggingFaceFW/finepdfs_ocr_quality_classifier_eng_Latn"
# Character budget: input text is sliced to this length before tokenization.
DEFAULT_MAX_CHARS = 10_000
# Upstream FinePDFs uses max_tokens=2048, but ModernBERT-large activations
# at that length need ≈ 3 GB of RAM — too much for a 4 GB dev box. 512
# tokens is enough to give a stable quality signal in practice and keeps
# peak memory well under 1 GB.
DEFAULT_MAX_TOKENS = 512


@dataclass(slots=True)
class QualityScore:
    """Result of scoring one document."""

    score: float
    num_chars: int
    num_tokens: int
    model: str

    def as_record(self) -> dict[str, Any]:
        return {
            "quality_score": self.score,
            "quality_num_chars": self.num_chars,
            "quality_num_tokens": self.num_tokens,
            "quality_model": self.model,
        }


class OcrQualityScorer:
    """Lazy ModernBERT regression scorer. Re-uses model/tokenizer across calls.

    Construction only records configuration. ``torch`` and ``transformers``
    are imported, and the checkpoint is loaded, the first time
    :meth:`score` receives non-blank text.
    """

    def __init__(
        self,
        model_name: str = DEFAULT_MODEL,
        max_chars: int = DEFAULT_MAX_CHARS,
        max_tokens: int = DEFAULT_MAX_TOKENS,
        device: str | None = None,
        dtype: str = "bfloat16",
    ) -> None:
        """Store the configuration; nothing is loaded here.

        Args:
            model_name: Identifier handed to ``from_pretrained`` for both the
                tokenizer and the model.
            max_chars: Input text is sliced to this many characters before
                tokenization.
            max_tokens: ``max_length`` given to the tokenizer (truncation is
                enabled).
            device: Torch device string. ``None`` picks ``"cuda"`` when
                available and ``"cpu"`` otherwise.
            dtype: Name of a ``torch`` dtype attribute such as
                ``"bfloat16"``. Names that do not resolve to a
                ``torch.dtype`` fall back to ``float32``.
        """
        self.model_name = model_name
        self.max_chars = max_chars
        self.max_tokens = max_tokens
        self._device_name = device
        self.dtype_name = dtype
        # Populated by ``_ensure_loaded`` on first use.
        self._tokenizer: Any = None
        self._model: Any = None
        self._torch: Any = None
        self._device: Any = None

    def _ensure_loaded(self) -> None:
        """Import the heavy dependencies and load tokenizer + model once.

        Subsequent calls return immediately once ``self._model`` is set.
        """
        if self._model is not None:
            return
        import torch  # noqa: PLC0415 — lazy import is intentional
        from transformers import AutoModelForSequenceClassification, AutoTokenizer  # noqa: PLC0415

        self._torch = torch
        self._device = torch.device(
            self._device_name
            or ("cuda" if torch.cuda.is_available() else "cpu")
        )
        # The default dtype is bfloat16 to halve the model's memory
        # footprint — ModernBERT-large is ~0.4 B params, so fp32 weights
        # alone take ~1.6 GB and OOM a 4 GB-RAM dev box. bf16 inference is
        # numerically stable enough for a regression head like this.
        #
        # Only accept the looked-up attribute when it really is a dtype:
        # names such as "nn" or "cuda" exist on the torch module but are
        # not dtypes, and must not be forwarded to ``from_pretrained``.
        requested = getattr(torch, self.dtype_name, None)
        torch_dtype = (
            requested if isinstance(requested, torch.dtype) else torch.float32
        )

        self._tokenizer = AutoTokenizer.from_pretrained(self.model_name)
        # ``dtype`` is the transformers≥5 name; ``torch_dtype`` was the
        # transformers<5 name. Pass ``dtype`` and fall back for older releases.
        try:
            model = AutoModelForSequenceClassification.from_pretrained(
                self.model_name,
                dtype=torch_dtype,
            )
        except TypeError:
            model = AutoModelForSequenceClassification.from_pretrained(
                self.model_name,
                torch_dtype=torch_dtype,
            )
        model.eval()
        model.to(self._device)
        # Assigned last so a failed load leaves the scorer in the
        # "not loaded" state and the next call retries.
        self._model = model

    def score(self, text: str) -> QualityScore:
        """Score a single document.

        Empty or whitespace-only input returns a score of 0.0 with zero
        character/token counts and does not trigger model loading.

        Args:
            text: Raw extracted text; only the first ``max_chars`` characters
                are tokenized.

        Returns:
            A :class:`QualityScore` whose ``score`` lies in ``[0, 3]``. A NaN
            model output is reported as 0.0 instead of being clamped.
        """
        if not text or not text.strip():
            return QualityScore(
                score=0.0, num_chars=0, num_tokens=0, model=self.model_name
            )

        self._ensure_loaded()
        assert self._tokenizer is not None and self._model is not None
        torch = self._torch

        clipped = text[: self.max_chars]
        enc = self._tokenizer(
            clipped,
            return_tensors="pt",
            truncation=True,
            max_length=self.max_tokens,
        )
        num_tokens = int(enc["input_ids"].shape[1])
        enc = {k: v.to(self._device) for k, v in enc.items()}

        with torch.inference_mode():
            out = self._model(**enc)
            logits = out.logits  # shape [1, 1] for regression
            raw = float(logits.squeeze().item())
        # Drop the forward-pass tensors eagerly so large-seq runs on CPU
        # don't hold onto activations between calls.
        del enc, out, logits

        # Clamp to the documented [0, 3] range. NaN needs its own branch:
        # every comparison with NaN is false, so ``min(3.0, nan)`` yields 3.0
        # and a numerically broken forward pass would be reported as
        # perfectly clean text. Treat it as the lowest score instead.
        if math.isnan(raw):
            clamped = 0.0
        else:
            clamped = max(0.0, min(3.0, raw))

        return QualityScore(
            score=clamped,
            num_chars=len(clipped),
            num_tokens=num_tokens,
            model=self.model_name,
        )

    def score_many(self, texts: list[str]) -> list[QualityScore]:
        """Score each text in order, one at a time.

        Serial scoring — tiny MVP harness, not a batched hot path.
        """
        return [self.score(t) for t in texts]