yin
feat(mvp): wire router → mupdf parser → OCR quality scorer closed loop
d423504
raw
history blame contribute delete
867 Bytes
"""pdfsys-router — two-stage routing for the pdfsys extraction pipeline.
Stage A (cheap): classify text-ok vs needs-ocr from PyMuPDF features, using
a ported FinePDFs XGBoost classifier over 124 hand-crafted features.
Stage B (uses layout cache): for needs-ocr, read the LayoutDocument written
by pdfsys-layout-analyser and decide pipeline vs vlm based on whether
complex regions (tables / formulas) exist. Stage B is not in the MVP.
"""
from __future__ import annotations
from .classifier import Router, RouterDecision
from .feature_extractor import PDFFeatureExtractor, flatten_per_page_features
from .xgb_model import XgbRouterModel, default_weights_path
# Package version string.
__version__ = "0.0.1"

# Explicit public API: the names exported by a star-import of this package.
# Apart from ``__version__``, every entry is bound by one of the relative
# imports at the top of this file; entries are grouped by source submodule.
__all__ = [
    "__version__",
    # .classifier
    "Router",
    "RouterDecision",
    # .feature_extractor
    "PDFFeatureExtractor",
    "flatten_per_page_features",
    # .xgb_model
    "XgbRouterModel",
    "default_weights_path",
]