| |
| """Run the OCR output-quality + document-analysis benchmark across all available |
| backends (OpenBMB MiniCPM-V, Cohere Aya-Vision, Tesseract, sidecar) and PUBLISH the |
| results. |
| |
| python scripts/ocr_quality.py |
| |
| Writes: |
| backend/evals/ocr_quality_report.json (committed, tracked) |
| <writable>/metrics_snapshots/ocr_quality_<ts>.json (published snapshot) |
| """ |
| from __future__ import annotations |
|
|
| import json |
| import sys |
| import time |
| from pathlib import Path |
|
|
# Two directory levels above this file. The module docstring invokes the script
# as scripts/ocr_quality.py, so this is presumably the repository root.
ROOT = Path(__file__).resolve().parent.parent
# Put <ROOT>/backend first on the import path so the `app.*` imports below
# resolve; this line must therefore stay ahead of those imports.
sys.path.insert(0, str(ROOT / "backend"))
|
|
| from app.config import get_settings |
| from app.db import Database |
| from app.metrics import MetricsStore |
| from app.ocr.backends import build_ocr_registry |
| from app.ocr.quality import run_ocr_quality |
| from app.providers import build_registry |
| from app.rag_store import VectorStore |
| from app.router import ModelRouter |
|
|
# Fixed location of the report that main() overwrites on every run
# (described in the module docstring as the committed, tracked copy).
REPORT = ROOT / "backend" / "evals" / "ocr_quality_report.json"
|
|
|
|
def main() -> None:
    """Run the OCR quality benchmark, publish the report, and print a summary.

    Builds the metrics store, model router, OCR registry, database and vector
    store from the application settings, hands them to ``run_ocr_quality``,
    then writes the resulting report as JSON to two places:

    * ``REPORT`` (overwritten on every run), and
    * ``<writable_dir>/metrics_snapshots/ocr_quality_<timestamp>.json``.

    Finally prints a per-backend comparison table to stdout.
    """
    s = get_settings()
    metrics = MetricsStore(s.metrics_db_path)
    router = ModelRouter(build_registry(s), s, metrics)
    ocr = build_ocr_registry(s)
    db = Database(s.app_db_path)
    rag = VectorStore(s.rag_db_path)

    report = run_ocr_quality(s, ocr, router, metrics, db=db, rag_store=rag)

    # Serialize once; both output files receive the same text.
    payload = json.dumps(report, indent=2)

    # Create the report directory if it is missing so a finished benchmark run
    # is not lost to FileNotFoundError at the very last step.
    REPORT.parent.mkdir(parents=True, exist_ok=True)
    REPORT.write_text(payload, encoding="utf-8")

    snap_dir = s.writable_dir / "metrics_snapshots"
    snap_dir.mkdir(parents=True, exist_ok=True)
    # Timestamp uses local time at second resolution (time.strftime default).
    stamp = time.strftime("%Y%m%dT%H%M%S")
    (snap_dir / f"ocr_quality_{stamp}.json").write_text(payload, encoding="utf-8")

    _print_summary(report)


def _pct(v) -> str:
    """Format a fraction as a percentage with one decimal; ``None`` -> ``"n/a"``."""
    return "n/a" if v is None else f"{v*100:.1f}%"


def _print_summary(report) -> None:
    """Print the per-backend comparison table and the two "best" entries.

    Reads ``report["backends"]`` (one mapping per backend) plus the
    ``best_ocr_quality`` and ``best_document_analysis`` keys.
    """
    print("\n" + "=" * 90)
    print(" OCR OUTPUT QUALITY + DOCUMENT ANALYSIS (smaller CER/WER = better; higher field-acc = better)")
    print("=" * 90)
    print(f" {'backend':<11}{'model':<17}{'params':>7}{'CER':>8}{'WER':>8}{'field-exact':>13}{'F1':>8}{'lat(ms)':>9}{'$/doc':>9}")
    print("-" * 90)
    for r in report["backends"]:
        # A missing or falsy parameter count is shown as a dash.
        params = f"{r['params_b']}B" if r.get("params_b") else "—"
        # Model names are cut to 16 characters to fit the column; a None/zero
        # latency or cost is printed as 0 rather than failing the float format.
        print(f" {r['backend']:<11}{(r.get('model') or '')[:16]:<17}{params:>7}"
              f"{_pct(r['cer']):>8}{_pct(r['wer']):>8}{_pct(r['field_exact_match']):>13}"
              f"{_pct(r['field_f1']):>8}{(r['avg_latency_ms'] or 0):>9.0f}{(r['avg_cost_usd'] or 0):>9.5f}")
    print("-" * 90)
    print(f" best OCR text quality : {report['best_ocr_quality']}")
    print(f" best document analysis : {report['best_document_analysis']}")
    print(f" published → {REPORT}")
    print("=" * 90 + "\n")
|
|
|
|
# Run the benchmark only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
|
|