File size: 1,689 Bytes
082d661
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/env python3
"""Quick OCR backend tester.

    python scripts/test_ocr.py <sample_id_or_path> [--backend auto|minicpm|cohere|llamaparse|tesseract|easyocr|sidecar]

Examples:
    python scripts/test_ocr.py invoice_scanned_basic
    python scripts/test_ocr.py invoice_scanned_basic --backend minicpm
    python scripts/test_ocr.py /path/to/receipt.png --backend cohere
"""
from __future__ import annotations

import argparse
import sys
from pathlib import Path

# ROOT is the parent of the directory that holds this script (the usage in the
# module docstring implies that directory is `scripts/`). ROOT/backend is put
# at the front of sys.path so the `app.*` imports below can be resolved from
# there when the script is run directly.
ROOT = Path(__file__).resolve().parent.parent
sys.path.insert(0, str(ROOT / "backend"))

from app.config import get_settings  # noqa: E402
from app.ocr.backends import build_ocr_registry  # noqa: E402


def resolve(arg: str, settings) -> Path | None:
    p = Path(arg)
    if p.exists():
        return p
    for ext in (".pdf", ".png", ".jpg", ".jpeg"):
        cand = settings.evals_dataset_dir / f"{arg}{ext}"
        if cand.exists():
            return cand
    return None


def main() -> None:
    """Run one OCR extraction from the command line and print a summary.

    Parses a positional ``doc`` argument and an optional ``--backend``
    (default ``"auto"``), resolves ``doc`` to a file with ``resolve``, builds
    the OCR registry from the application settings and calls its ``extract``
    with the file and backend name.

    Printed to stdout: the registry's available backend names, the attempts
    value returned by ``extract``, a one-line summary of the result (engine,
    tier, pages, confidence, text length, simulated flag) and the first 1200
    characters of the extracted text.

    Raises:
        SystemExit: with status 1 and a ``not found: <doc>`` message on
            stderr when the document cannot be resolved.
    """
    ap = argparse.ArgumentParser(
        # Reuse the module docstring (usage + examples) as the --help text;
        # the raw formatter keeps its line breaks intact.
        description=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    ap.add_argument(
        "doc",
        help="sample id in the evals dataset, or a path to a document",
    )
    ap.add_argument(
        "--backend",
        default="auto",
        help="OCR backend to use (default: %(default)s)",
    )
    args = ap.parse_args()

    s = get_settings()
    path = resolve(args.doc, s)
    if path is None:
        # sys.exit with a string writes it to stderr and exits with status 1,
        # keeping the error out of stdout.
        sys.exit(f"not found: {args.doc}")

    reg = build_ocr_registry(s)
    print(f"available backends: {reg.available_names()}")
    res, attempts = reg.extract(path, args.backend)
    print(f"\nattempts: {attempts}")
    print(f"\nengine={res.engine} tier={res.tier} pages={res.pages} "
          f"conf={res.confidence} chars={len(res.text)} simulated={res.simulated}")
    print("\n--- text (first 1200 chars) ---")
    print(res.text[:1200])


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()