Spaces:
Sleeping
Sleeping
| #!/usr/bin/env python3 | |
| """Quick OCR backend tester. | |
| python scripts/test_ocr.py <sample_id_or_path> [--backend auto|minicpm|cohere|llamaparse|tesseract|easyocr|sidecar] | |
| Examples: | |
| python scripts/test_ocr.py invoice_scanned_basic | |
| python scripts/test_ocr.py invoice_scanned_basic --backend minicpm | |
| python scripts/test_ocr.py /path/to/receipt.png --backend cohere | |
| """ | |
| from __future__ import annotations | |
| import argparse | |
| import sys | |
| from pathlib import Path | |
| ROOT = Path(__file__).resolve().parent.parent | |
| sys.path.insert(0, str(ROOT / "backend")) | |
| from app.config import get_settings # noqa: E402 | |
| from app.ocr.backends import build_ocr_registry # noqa: E402 | |
def resolve(arg: str, settings) -> Path | None:
    """Locate the document file that ``arg`` refers to.

    ``arg`` is first tried as a filesystem path. If no regular file exists
    there, it is treated as a sample id and looked up inside
    ``settings.evals_dataset_dir`` with each supported extension, in order
    (``.pdf``, ``.png``, ``.jpg``, ``.jpeg``).

    Only regular files are accepted. A directory that happens to share its
    name with a sample id, or an empty ``arg`` (which ``Path`` maps to the
    current directory), must not shadow the dataset lookup or be handed to
    an OCR backend as if it were a document.

    Args:
        arg: A path to a document, or a sample id without its extension.
        settings: Object whose ``evals_dataset_dir`` attribute is the dataset
            directory (joined with ``/``, so presumably a ``Path``).

    Returns:
        The first matching file, or ``None`` when nothing matches.
    """
    direct = Path(arg)
    if direct.is_file():
        return direct
    for ext in (".pdf", ".png", ".jpg", ".jpeg"):
        cand = settings.evals_dataset_dir / f"{arg}{ext}"
        if cand.is_file():
            return cand
    return None
def main() -> None:
    """Run one document through the OCR registry and print what came back.

    Parses the positional ``doc`` argument and the ``--backend`` option
    (default ``"auto"``), resolves ``doc`` to a file via ``resolve``, and
    exits with status 1 after printing a message when nothing is found.
    Otherwise it prints the registry's available backend names, the attempts
    reported by ``extract``, a one-line summary of the result, and the first
    1200 characters of the extracted text.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("doc")
    parser.add_argument("--backend", default="auto")
    cli = parser.parse_args()

    # Resolve the document before building the registry so a bad argument
    # fails without constructing any backend.
    settings = get_settings()
    doc_path = resolve(cli.doc, settings)
    if doc_path is None:
        print(f"not found: {cli.doc}")
        sys.exit(1)

    registry = build_ocr_registry(settings)
    print(f"available backends: {registry.available_names()}")

    result, attempts = registry.extract(doc_path, cli.backend)
    print(f"\nattempts: {attempts}")

    summary = (
        f"\nengine={result.engine} tier={result.tier} pages={result.pages} "
        f"conf={result.confidence} chars={len(result.text)} simulated={result.simulated}"
    )
    print(summary)

    print("\n--- text (first 1200 chars) ---")
    preview = result.text[:1200]
    print(preview)
# Run the tester only when this file is executed as a script, so importing
# the module does not parse arguments or start an OCR run.
if __name__ == "__main__":
    main()