Spaces:
Running
Running
File size: 1,901 Bytes
#!/usr/bin/env python3
"""Run every available OCR backend against real scanned samples and write a
tracked report (backend/evals/ocr_backend_report.json).
python scripts/ocr_smoke.py
Reads backend/.env, so configured backends (e.g. MiniCPM) are exercised live.
Unavailable backends (missing deps/keys) are recorded with the reason.
"""
from __future__ import annotations
import json
import sys
from pathlib import Path
# Directory two levels above this file; the backend/ tree referenced below lives under it.
ROOT = Path(__file__).resolve().parent.parent
# Put <ROOT>/backend at the front of sys.path -- presumably so the `app.*`
# imports that follow resolve when this file is run directly as a script.
sys.path.insert(0, str(ROOT / "backend"))
from app.config import get_settings # noqa: E402
from app.ocr.backends import build_ocr_registry # noqa: E402
from app.ocr.backends.healthcheck import run_ocr_backend_tests # noqa: E402
# Destination file for the JSON report that main() writes.
REPORT_PATH = ROOT / "backend" / "evals" / "ocr_backend_report.json"
def main() -> None:
    """Run the OCR backend tests, save the JSON report, and print a summary.

    Loads the settings, builds the OCR backend registry, and passes both to
    ``run_ocr_backend_tests``. The returned report is written as indented
    JSON to ``REPORT_PATH``, then a fixed-width table with one row per entry
    in ``report["backends"]`` is printed to stdout.

    For an available backend the row shows the engine, character count and
    latency of its first case only (or defaults when it has no cases). For
    an unavailable backend the row shows its ``requires`` value instead.

    Raises:
        KeyError: if the report or a backend entry lacks one of the keys
            read below (``mode``, ``backends``, ``name``, ``tier``, ...).
    """
    settings = get_settings()
    registry = build_ocr_registry(settings)
    report = run_ocr_backend_tests(settings, registry)

    # Create the target folder first: the backend runs are already finished
    # at this point, and a missing directory would otherwise throw away
    # their results with a FileNotFoundError.
    REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
    # Explicit encoding so the file does not depend on the locale default.
    REPORT_PATH.write_text(json.dumps(report, indent=2), encoding="utf-8")

    heavy_rule = "=" * 78
    light_rule = "-" * 78

    print("\n" + heavy_rule)
    print(f" OCR BACKEND REAL-EXTRACTION REPORT (mode={report['mode']})")
    print(heavy_rule)
    print(f" {'backend':<12}{'tier':<8}{'available':<11}{'functional':<11}{'engine / reason'}")
    print(light_rule)

    for entry in report["backends"]:
        is_available = entry["available"]
        if is_available:
            cases = entry["cases"]
            # Only the first case is summarised in the table.
            first_case = cases[0] if cases else {}
            detail = (
                f"{first_case.get('engine','')} "
                f"({first_case.get('chars',0)} chars, {first_case.get('latency_ms',0)}ms)"
            )
            functional = "✓ yes" if entry["ok"] else "✗ no"
        else:
            detail = entry["requires"]
            functional = "—"

        available_text = "yes" if is_available else "no"
        # `requires` is not shown here to always be a string; coerce so the
        # 42-character cut applies to text and None cannot raise.
        detail_text = str(detail)[:42]
        print(f" {entry['name']:<12}{entry['tier']:<8}{available_text:<11}{functional:<11}{detail_text}")

    print(light_rule)
    print(f" available : {report['available_backends']}")
    print(f" functional: {report['functional_backends']}")
    print(f" report → {REPORT_PATH}")
    print(heavy_rule + "\n")
# Run the smoke test only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|