#!/usr/bin/env python3 """Run every available OCR backend against real scanned samples and write a tracked report (backend/evals/ocr_backend_report.json). python scripts/ocr_smoke.py Reads backend/.env, so configured backends (e.g. MiniCPM) are exercised live. Unavailable backends (missing deps/keys) are recorded with the reason. """ from __future__ import annotations import json import sys from pathlib import Path ROOT = Path(__file__).resolve().parent.parent sys.path.insert(0, str(ROOT / "backend")) from app.config import get_settings # noqa: E402 from app.ocr.backends import build_ocr_registry # noqa: E402 from app.ocr.backends.healthcheck import run_ocr_backend_tests # noqa: E402 REPORT_PATH = ROOT / "backend" / "evals" / "ocr_backend_report.json" def main() -> None: s = get_settings() reg = build_ocr_registry(s) report = run_ocr_backend_tests(s, reg) REPORT_PATH.write_text(json.dumps(report, indent=2)) print("\n" + "=" * 78) print(f" OCR BACKEND REAL-EXTRACTION REPORT (mode={report['mode']})") print("=" * 78) print(f" {'backend':<12}{'tier':<8}{'available':<11}{'functional':<11}{'engine / reason'}") print("-" * 78) for b in report["backends"]: if b["available"]: case = b["cases"][0] if b["cases"] else {} detail = f"{case.get('engine','')} ({case.get('chars',0)} chars, {case.get('latency_ms',0)}ms)" func = "✓ yes" if b["ok"] else "✗ no" else: detail = b["requires"] func = "—" print(f" {b['name']:<12}{b['tier']:<8}{('yes' if b['available'] else 'no'):<11}{func:<11}{detail[:42]}") print("-" * 78) print(f" available : {report['available_backends']}") print(f" functional: {report['functional_backends']}") print(f" report → {REPORT_PATH}") print("=" * 78 + "\n") if __name__ == "__main__": main()