#!/usr/bin/env python3
"""Quick OCR backend tester.

Runs one document through the OCR registry and prints the available
backends, the attempts made, the result metadata, and the beginning of the
extracted text.

Usage:
    python scripts/test_ocr.py <doc> [--backend auto|minicpm|cohere|llamaparse|tesseract|easyocr|sidecar]

Examples:
    python scripts/test_ocr.py invoice_scanned_basic
    python scripts/test_ocr.py invoice_scanned_basic --backend minicpm
    python scripts/test_ocr.py /path/to/receipt.png --backend cohere
"""
from __future__ import annotations

import argparse
import sys
from pathlib import Path

# The script lives one directory below the repository root; the backend
# package directory must be importable before the `app.*` imports below.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT / "backend"))

from app.config import get_settings  # noqa: E402
from app.ocr.backends import build_ocr_registry  # noqa: E402

# Extensions tried, in order, when the argument is a bare dataset document name.
_DOC_EXTENSIONS = (".pdf", ".png", ".jpg", ".jpeg")


def resolve(arg: str, settings) -> Path | None:
    """Turn a command-line document argument into a path to an existing file.

    Args:
        arg: Either a filesystem path, or the stem of a document located in
            ``settings.evals_dataset_dir``.
        settings: Settings object exposing ``evals_dataset_dir``. It is joined
            with ``/`` below, so it is presumably a ``Path`` -- confirm against
            ``app.config``.

    Returns:
        ``arg`` itself when it names an existing file; otherwise the first
        ``<evals_dataset_dir>/<arg><ext>`` that exists, trying the extensions
        in ``_DOC_EXTENSIONS`` order; ``None`` when nothing matches.
    """
    direct = Path(arg)
    # is_file() rather than exists(): a directory that happens to share the
    # document's name must not shadow the dataset lookup below.
    if direct.is_file():
        return direct
    for ext in _DOC_EXTENSIONS:
        candidate = settings.evals_dataset_dir / f"{arg}{ext}"
        if candidate.is_file():
            return candidate
    return None


def main() -> None:
    """Parse arguments, run OCR on the requested document, and print a report.

    Exits with status 1 (message on stderr) when the document cannot be found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "doc",
        help="path to a document, or the name of a document in the evals dataset directory",
    )
    parser.add_argument(
        "--backend",
        default="auto",
        help="OCR backend to use (default: %(default)s)",
    )
    args = parser.parse_args()

    settings = get_settings()
    path = resolve(args.doc, settings)
    if path is None:
        # sys.exit with a string writes it to stderr and exits with status 1,
        # keeping stdout clean for the actual report.
        sys.exit(f"not found: {args.doc}")

    registry = build_ocr_registry(settings)
    print(f"available backends: {registry.available_names()}")

    res, attempts = registry.extract(path, args.backend)
    print(f"\nattempts: {attempts}")
    print(f"\nengine={res.engine} tier={res.tier} pages={res.pages} "
          f"conf={res.confidence} chars={len(res.text)} simulated={res.simulated}")
    print("\n--- text (first 1200 chars) ---")
    print(res.text[:1200])


if __name__ == "__main__":
    main()