Spaces:

roger1024
/

DocPipe

Runtime error

File size: 7,232 Bytes

"""Stage-A classifier: decides text-ok (MUPDF) vs needs-ocr (PIPELINE/VLM).

This is the single public entry point of the router for the MVP. Stage-B
(layout-cache driven pipeline-vs-vlm decision) will be added later; for
now, anything that needs OCR is routed to ``Backend.PIPELINE`` unless the
configured policy says otherwise.

The classifier is deliberately stateless across documents. It loads the
XGBoost model once (lazily) and then exposes
``Router.classify(pdf_path) -> RouterDecision``. No caching, no output I/O;
the sampling RNGs are seeded on every call so the same PDF always produces
the same decision.
"""

from __future__ import annotations

import random
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any

import numpy as np
import pymupdf

from pdfsys_core import Backend, RouterConfig

from .feature_extractor import PDFFeatureExtractor, flatten_per_page_features
from .xgb_model import XgbRouterModel, default_weights_path


@dataclass(slots=True)
class RouterDecision:
    """Result of running the Stage-A classifier on a single PDF."""

    backend: Backend
    ocr_prob: float
    num_pages: int
    is_form: bool
    garbled_text_ratio: float
    is_encrypted: bool
    needs_password: bool
    features: dict[str, Any] = field(default_factory=dict)
    error: str | None = None

    def as_record(self) -> dict[str, Any]:
        """Flat dict for JSONL emission."""
        return {
            "backend": self.backend.value,
            "ocr_prob": self.ocr_prob,
            "num_pages": self.num_pages,
            "is_form": bool(self.is_form),
            "garbled_text_ratio": float(self.garbled_text_ratio),
            "is_encrypted": bool(self.is_encrypted),
            "needs_password": bool(self.needs_password),
            "error": self.error,
        }


class Router:
    """Stage-A router: PyMuPDF features → XGBoost → Backend.

    The public entry points (:meth:`classify`, :meth:`classify_bytes`) open
    the document, run feature extraction and the model, and always hand back
    a ``RouterDecision``; failures are reported via ``decision.error``
    instead of being raised.
    """

    def __init__(
        self,
        config: RouterConfig | None = None,
        model_path: str | Path | None = None,
        num_pages_to_sample: int = 8,
        ocr_threshold: float = 0.5,
        seed: int = 42,
    ) -> None:
        """Build the feature extractor and model wrapper.

        Args:
            config: Router policy; a default ``RouterConfig()`` is used when
                omitted.
            model_path: Weights location; falls back to
                ``default_weights_path()``.
            num_pages_to_sample: Passed to the feature extractor and to the
                feature flattening step.
            ocr_threshold: Probabilities strictly below this value route to
                ``Backend.MUPDF``.
            seed: Seed applied to the ``random`` and ``numpy.random`` global
                generators for the duration of each classification.
        """
        self.config = config or RouterConfig()
        self.num_pages_to_sample = num_pages_to_sample
        self.ocr_threshold = ocr_threshold
        self.seed = seed
        self._extractor = PDFFeatureExtractor(
            num_chunks=1, num_pages_to_sample=num_pages_to_sample
        )
        self._model = XgbRouterModel(model_path or default_weights_path())

    # ------------------------------------------------------------------ api

    def classify(self, pdf_path: str | Path) -> RouterDecision:
        """Classify a PDF file. Never raises — errors are in ``decision.error``."""
        try:
            # Path() is inside the try so a bad argument type is reported as
            # an open failure rather than escaping as an exception.
            doc = pymupdf.open(str(Path(pdf_path)))
        except Exception as e:  # noqa: BLE001 — we want to capture anything
            return self._failure(f"open_failed: {e}")
        return self._classify_and_close(doc)

    def classify_bytes(self, pdf_bytes: bytes) -> RouterDecision:
        """Same as :meth:`classify`, but from an in-memory buffer."""
        import io

        try:
            doc = pymupdf.open(stream=io.BytesIO(pdf_bytes), filetype="pdf")
        except Exception as e:  # noqa: BLE001
            return self._failure(f"open_failed: {e}")
        return self._classify_and_close(doc)

    # --------------------------------------------------------------- internal

    @staticmethod
    def _failure(
        error: str,
        *,
        num_pages: int = 0,
        is_encrypted: bool = False,
        needs_password: bool = False,
    ) -> RouterDecision:
        """Build the ``Backend.DEFERRED`` decision used for every failure path."""
        return RouterDecision(
            backend=Backend.DEFERRED,
            ocr_prob=float("nan"),
            num_pages=num_pages,
            is_form=False,
            garbled_text_ratio=0.0,
            is_encrypted=is_encrypted,
            needs_password=needs_password,
            error=error,
        )

    @staticmethod
    def _safe_page_count(doc: Any) -> int:
        """Return ``len(doc)``, or 0 when the page count cannot be read."""
        try:
            return len(doc)
        except Exception:  # noqa: BLE001 — used on error paths, must not raise
            return 0

    @staticmethod
    def _snapshot_rngs() -> tuple[Any, Any]:
        """Capture the global ``random`` and ``numpy.random`` states."""
        return random.getstate(), np.random.get_state()

    @staticmethod
    def _restore_rngs(saved: tuple[Any, Any]) -> None:
        """Put back the states captured by :meth:`_snapshot_rngs`."""
        py_state, np_state = saved
        random.setstate(py_state)
        np.random.set_state(np_state)

    def _classify_and_close(self, doc: pymupdf.Document) -> RouterDecision:
        """Classify an opened document and close it afterwards."""
        try:
            return self._classify_doc(doc)
        finally:
            # Closing is best-effort: a failure here must not mask the
            # decision that was already computed.
            try:
                doc.close()
            except Exception:  # noqa: BLE001
                pass

    def _classify_doc(self, doc: pymupdf.Document) -> RouterDecision:
        """Run the seeded classification on an open document.

        Any exception from the pipeline is converted into a
        ``classify_failed`` decision. The global RNG states are restored on
        the way out so classification does not disturb the caller's random
        streams.
        """
        saved_rngs = self._snapshot_rngs()
        try:
            # Seed the sampling RNGs so the same PDF always produces the same
            # feature vector — critical for reproducibility and debugging.
            random.seed(self.seed)
            np.random.seed(self.seed)
            return self._classify_seeded(doc)
        except Exception as e:  # noqa: BLE001
            return self._failure(
                f"classify_failed: {e}",
                num_pages=self._safe_page_count(doc),
            )
        finally:
            self._restore_rngs(saved_rngs)

    def _classify_seeded(self, doc: pymupdf.Document) -> RouterDecision:
        """Extract features, score them and build the decision (may raise)."""
        if doc.is_encrypted or doc.needs_pass:
            return self._failure(
                "encrypted_or_password_protected",
                num_pages=self._safe_page_count(doc),
                is_encrypted=bool(doc.is_encrypted),
                needs_password=bool(doc.needs_pass),
            )

        raw_chunks = self._extractor.extract_all_features(doc)
        if not raw_chunks:
            return self._failure("no_pages_sampled", num_pages=len(doc))

        # The extractor is configured with num_chunks=1, so only the first
        # chunk is scored.
        flat = flatten_per_page_features(
            raw_chunks[0], sample_to_k_page_features=self.num_pages_to_sample
        )
        ocr_prob = self._model.predict_proba(flat)

        return RouterDecision(
            backend=self._route(ocr_prob),
            ocr_prob=ocr_prob,
            num_pages=len(doc),
            is_form=bool(flat.get("is_form", False)),
            garbled_text_ratio=float(flat.get("garbled_text_ratio", 0.0)),
            is_encrypted=bool(doc.is_encrypted),
            needs_password=bool(doc.needs_pass),
            features=flat,
        )

    def _route(self, ocr_prob: float) -> Backend:
        """Map XGBoost probability + fleet policy → concrete Backend.

        A NaN probability compares false against the threshold and therefore
        takes the OCR branch.
        """
        if ocr_prob < self.ocr_threshold:
            return Backend.MUPDF
        # OCR needed. Stage-B would check LayoutCache for complex content
        # here. For the MVP we have no layout cache yet, so honour the
        # fleet VLM gate: if VLM is enabled we'd need Stage-B to decide,
        # otherwise pipeline handles everything flagged as scanned.
        if self.config.vlm_enabled:
            return Backend.DEFERRED  # Stage-B will run once layout is cached
        return Backend.PIPELINE