File size: 7,232 Bytes
"""Stage-A classifier: decides text-ok (MUPDF) vs needs-ocr (PIPELINE/VLM).
This is the single public entry point of the router for the MVP. Stage-B
(layout-cache driven pipeline-vs-vlm decision) will be added later; for
now, anything that needs OCR is routed to ``Backend.PIPELINE`` unless the
configured policy says otherwise.
The classifier is deliberately stateless. It loads the XGBoost model once
(lazily) and then exposes ``classify(pdf_path) -> RouterDecision``. No
caching, no I/O side effects — pure in, pure out.
"""
from __future__ import annotations
import random
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any
import numpy as np
import pymupdf
from pdfsys_core import Backend, RouterConfig
from .feature_extractor import PDFFeatureExtractor, flatten_per_page_features
from .xgb_model import XgbRouterModel, default_weights_path
@dataclass(slots=True)
class RouterDecision:
"""Result of running the Stage-A classifier on a single PDF."""
backend: Backend
ocr_prob: float
num_pages: int
is_form: bool
garbled_text_ratio: float
is_encrypted: bool
needs_password: bool
features: dict[str, Any] = field(default_factory=dict)
error: str | None = None
def as_record(self) -> dict[str, Any]:
"""Flat dict for JSONL emission."""
return {
"backend": self.backend.value,
"ocr_prob": self.ocr_prob,
"num_pages": self.num_pages,
"is_form": bool(self.is_form),
"garbled_text_ratio": float(self.garbled_text_ratio),
"is_encrypted": bool(self.is_encrypted),
"needs_password": bool(self.needs_password),
"error": self.error,
}
class Router:
    """Stage-A router: PyMuPDF features → XGBoost → Backend.

    ``classify`` / ``classify_bytes`` open a document, extract features,
    score them with the XGBoost model and map the score to a ``Backend``.
    Failures are reported through ``RouterDecision.error`` rather than raised.

    Feature sampling is made reproducible by seeding the global ``random``
    and ``numpy.random`` generators with ``seed``; their previous state is
    restored afterwards so callers' own random streams are not disturbed.
    """

    def __init__(
        self,
        config: RouterConfig | None = None,
        model_path: str | Path | None = None,
        num_pages_to_sample: int = 8,
        ocr_threshold: float = 0.5,
        seed: int = 42,
    ) -> None:
        """Build a router.

        Args:
            config: Fleet policy; a default ``RouterConfig()`` is used when
                omitted.
            model_path: Weights file for the model; falls back to
                ``default_weights_path()``.
            num_pages_to_sample: Passed to the feature extractor and to the
                per-page feature flattening step.
            ocr_threshold: Probabilities strictly below this route to
                ``Backend.MUPDF``.
            seed: Seed applied to the global RNGs for each classification.
        """
        self.config = config or RouterConfig()
        self.num_pages_to_sample = num_pages_to_sample
        self.ocr_threshold = ocr_threshold
        self.seed = seed
        self._extractor = PDFFeatureExtractor(
            num_chunks=1, num_pages_to_sample=num_pages_to_sample
        )
        self._model = XgbRouterModel(model_path or default_weights_path())

    # ------------------------------------------------------------------ api
    def classify(self, pdf_path: str | Path) -> RouterDecision:
        """Classify a PDF file. Never raises — errors are in ``decision.error``."""
        try:
            # Path() is inside the try so that a bad argument type is reported
            # as ``open_failed`` instead of escaping as an exception.
            doc = pymupdf.open(str(Path(pdf_path)))
        except Exception as e:  # noqa: BLE001 — we want to capture anything
            return self._failed(f"open_failed: {e}")
        return self._classify_and_close(doc)

    def classify_bytes(self, pdf_bytes: bytes) -> RouterDecision:
        """Same as :meth:`classify`, but from an in-memory buffer."""
        import io

        try:
            doc = pymupdf.open(stream=io.BytesIO(pdf_bytes), filetype="pdf")
        except Exception as e:  # noqa: BLE001
            return self._failed(f"open_failed: {e}")
        return self._classify_and_close(doc)

    # --------------------------------------------------------------- internal
    @staticmethod
    def _failed(
        error: str,
        *,
        num_pages: int = 0,
        is_encrypted: bool = False,
        needs_password: bool = False,
    ) -> RouterDecision:
        """Build the ``Backend.DEFERRED`` decision used for every failure path."""
        return RouterDecision(
            backend=Backend.DEFERRED,
            ocr_prob=float("nan"),
            num_pages=num_pages,
            is_form=False,
            garbled_text_ratio=0.0,
            is_encrypted=is_encrypted,
            needs_password=needs_password,
            error=error,
        )

    @staticmethod
    def _page_count(doc: Any) -> int:
        """Return ``len(doc)``, or 0 when the document cannot report it."""
        try:
            return len(doc)
        except Exception:  # noqa: BLE001 — broken/closed/None document
            return 0

    def _run_seeded(self, func: Any, *args: Any) -> Any:
        """Call ``func(*args)`` with the global RNGs seeded from ``self.seed``.

        The prior state of ``random`` and ``numpy.random`` is captured first
        and restored in ``finally``, so the seeding is invisible to callers.
        """
        py_state = random.getstate()
        np_state = np.random.get_state()
        try:
            random.seed(self.seed)
            np.random.seed(self.seed)
            return func(*args)
        finally:
            random.setstate(py_state)
            np.random.set_state(np_state)

    def _classify_and_close(self, doc: pymupdf.Document) -> RouterDecision:
        """Classify an already-opened document and always close it."""
        try:
            return self._classify_doc(doc)
        finally:
            try:
                doc.close()
            except Exception:  # noqa: BLE001
                # Best effort: a failing close must not mask the decision.
                pass

    def _classify_doc(self, doc: pymupdf.Document) -> RouterDecision:
        """Classify an open document under a fixed RNG seed.

        Seeding makes the same PDF always produce the same feature vector,
        which is critical for reproducibility and debugging.
        """
        return self._run_seeded(self._classify_seeded, doc)

    def _classify_seeded(self, doc: pymupdf.Document) -> RouterDecision:
        """Extract features, score them and route; errors become decisions."""
        # Read once, defensively: the count is also needed by the error
        # handler below, which must not raise itself.
        num_pages = self._page_count(doc)
        try:
            is_encrypted = bool(doc.is_encrypted)
            needs_password = bool(doc.needs_pass)
            if is_encrypted or needs_password:
                return self._failed(
                    "encrypted_or_password_protected",
                    num_pages=num_pages,
                    is_encrypted=is_encrypted,
                    needs_password=needs_password,
                )

            raw_chunks = self._extractor.extract_all_features(doc)
            if not raw_chunks:
                return self._failed("no_pages_sampled", num_pages=num_pages)

            # The extractor is configured with num_chunks=1, so only the
            # first chunk is used.
            flat = flatten_per_page_features(
                raw_chunks[0], sample_to_k_page_features=self.num_pages_to_sample
            )
            ocr_prob = self._model.predict_proba(flat)
            return RouterDecision(
                backend=self._route(ocr_prob),
                ocr_prob=ocr_prob,
                num_pages=num_pages,
                is_form=bool(flat.get("is_form", False)),
                garbled_text_ratio=float(flat.get("garbled_text_ratio", 0.0)),
                is_encrypted=is_encrypted,
                needs_password=needs_password,
                features=flat,
            )
        except Exception as e:  # noqa: BLE001
            return self._failed(f"classify_failed: {e}", num_pages=num_pages)

    def _route(self, ocr_prob: float) -> Backend:
        """Map XGBoost probability + fleet policy → concrete Backend."""
        if ocr_prob < self.ocr_threshold:
            return Backend.MUPDF
        # OCR needed. Stage-B would check LayoutCache for complex content
        # here. For the MVP we have no layout cache yet, so honour the
        # fleet VLM gate: if VLM is enabled we'd need Stage-B to decide,
        # otherwise pipeline handles everything flagged as scanned.
        if self.config.vlm_enabled:
            return Backend.DEFERRED  # Stage-B will run once layout is cached
        return Backend.PIPELINE
|