ERP-DocIQ / scripts /ocr_smoke.py
kenmandal's picture
Deploy latest: ERP DocIQ NLQ chatbot + reasoning models (MiniCPM3-4B/Command R7B) + ERP fine-tuning + extreme OCR docs
082d661 verified
Raw
History Blame Contribute Delete
1.9 kB
#!/usr/bin/env python3
"""Run every available OCR backend against real scanned samples and write a
tracked report (backend/evals/ocr_backend_report.json).

Usage:
    python scripts/ocr_smoke.py

Reads backend/.env, so configured backends (e.g. MiniCPM) are exercised live.
Unavailable backends (missing deps/keys) are recorded with the reason.
"""
from __future__ import annotations
import json
import sys
from pathlib import Path
# Two directory levels above this file (the docstring runs it as
# scripts/ocr_smoke.py, so this is the directory containing scripts/).
ROOT = Path(__file__).resolve().parent.parent
# Put <ROOT>/backend at the front of sys.path so the `app` imports below
# resolve when this file is run directly as a script. Because the path must be
# patched first, those imports cannot sit at the top of the file (noqa: E402).
sys.path.insert(0, str(ROOT / "backend"))
from app.config import get_settings # noqa: E402
from app.ocr.backends import build_ocr_registry # noqa: E402
from app.ocr.backends.healthcheck import run_ocr_backend_tests # noqa: E402
# Destination of the JSON report written by main().
REPORT_PATH = ROOT / "backend" / "evals" / "ocr_backend_report.json"
def main() -> None:
    """Run the OCR backend checks, persist the report, and print a summary.

    Loads the settings, builds the OCR backend registry, and passes both to
    ``run_ocr_backend_tests``. The returned report is written as indented JSON
    to ``REPORT_PATH``; a fixed-width table with one row per entry in
    ``report["backends"]`` is then printed to stdout, followed by the
    ``available_backends`` and ``functional_backends`` values.
    """
    settings = get_settings()
    registry = build_ocr_registry(settings)
    report = run_ocr_backend_tests(settings, registry)

    # Create the target directory if it is missing and pin the encoding
    # instead of relying on the platform default.
    REPORT_PATH.parent.mkdir(parents=True, exist_ok=True)
    REPORT_PATH.write_text(json.dumps(report, indent=2), encoding="utf-8")

    heavy_rule = "=" * 78
    light_rule = "-" * 78

    print("\n" + heavy_rule)
    print(f" OCR BACKEND REAL-EXTRACTION REPORT (mode={report['mode']})")
    print(heavy_rule)
    print(f" {'backend':<12}{'tier':<8}{'available':<11}{'functional':<11}{'engine / reason'}")
    print(light_rule)

    for backend in report["backends"]:
        is_available = backend["available"]
        if is_available:
            # Only the first recorded case is summarised in the table; the
            # full per-case data stays in the JSON report.
            cases = backend["cases"]
            case = cases[0] if cases else {}
            detail = (
                f"{case.get('engine', '')} "
                f"({case.get('chars', 0)} chars, {case.get('latency_ms', 0)}ms)"
            )
            func = "✓ yes" if backend["ok"] else "✗ no"
        else:
            # NOTE(review): `requires` is presumably a human-readable reason
            # string; coerce to text so the truncation below is always a
            # character slice -- confirm against the healthcheck schema.
            detail = str(backend["requires"])
            func = "—"
        avail = "yes" if is_available else "no"
        # The detail column is capped at 42 characters to keep the row
        # within the 78-character rule width.
        print(f" {backend['name']:<12}{backend['tier']:<8}{avail:<11}{func:<11}{detail[:42]}")

    print(light_rule)
    print(f" available : {report['available_backends']}")
    print(f" functional: {report['functional_backends']}")
    print(f" report → {REPORT_PATH}")
    print(heavy_rule + "\n")
# Run the smoke test only when this file is executed as a script, not when it
# is imported as a module.
if __name__ == "__main__":
    main()