| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import re |
| import sys |
| import tempfile |
| import time |
| from pathlib import Path |
| from typing import Any |
|
|
| import fitz |
|
|
# ROOT_DIR is the parent of the directory that contains this file. It is put
# at the front of sys.path (once) so that the `from app import main` import
# that follows can be resolved when this file is executed directly.
ROOT_DIR = Path(__file__).resolve().parent.parent
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))
|
|
| from app import main |
|
|
|
|
# Grammar of an extraction label, matched against the whole string:
#   [<mode>:]<engine>[@<zoom>x][-psm<psm>]
# - mode:   optional, one of "best", "arabic", "arabic-max"
# - engine: lowercase letters and hyphens
# - zoom:   optional, digits and dots, written as "@<zoom>x"
# - psm:    optional, digits, written as "-psm<psm>"
EXTRACTION_RE = re.compile(r"^(?:(?P<mode>best|arabic|arabic-max):)?(?P<engine>[a-z-]+)(?:@(?P<zoom>[0-9.]+)x)?(?:-psm(?P<psm>\d+))?$")
|
|
|
|
def make_limited_pdf(pdf_path: Path, page_limit: int | None) -> Path:
    """Return a PDF holding at most the first ``page_limit`` pages of ``pdf_path``.

    Args:
        pdf_path: The source PDF.
        page_limit: Number of leading pages to keep. A falsy value (``None``
            or ``0``) means "no limit".

    Returns:
        ``pdf_path`` itself when no limit is requested; otherwise the path of
        a newly written file in the system temp directory. The caller owns
        that file and is responsible for deleting it.

    Raises:
        Whatever ``fitz`` raises while opening, copying or saving; the
        partially written temp file is removed before the error propagates.
    """
    if not page_limit:
        return pdf_path

    # Reserve a uniquely named file instead of a fixed, predictable name, so
    # parallel runs on PDFs with the same stem cannot overwrite each other.
    # The handle is closed right away so the path can be reopened for writing.
    with tempfile.NamedTemporaryFile(
        prefix=f"{pdf_path.stem}-first-{page_limit}-pages-",
        suffix=".pdf",
        delete=False,
    ) as placeholder:
        limited = Path(placeholder.name)

    try:
        # Both documents are context-managed so the output document is closed
        # as well as the input one.
        with fitz.open(pdf_path) as document, fitz.open() as output:
            last_page = min(page_limit, document.page_count) - 1
            output.insert_pdf(document, from_page=0, to_page=last_page)
            output.save(limited)
    except BaseException:
        # Do not leave an empty or partial file behind on failure.
        limited.unlink(missing_ok=True)
        raise
    return limited
|
|
|
|
def text_metrics(text: str) -> dict[str, Any]:
    """Build a flat report of size, Arabic-word and quality figures for ``text``.

    The text is passed through ``main.prepare_text_for_speech``; Arabic words
    are found in that result with ``main.ARABIC_RE``. Placeholder characters
    are occurrences of ``"?"`` and U+FFFD in the speech text. The quality
    fields are copied from ``main.assess_text_quality(text, speech_text)``.

    Returns:
        A dict with counts, ratios rounded to four decimals, the quality
        verdict/score/reasons, and 180-character previews of both texts.
    """

    def ratio(part: int, whole: int) -> float:
        # The denominator is clamped to 1 so empty text gives 0.0, not an error.
        return round(part / max(whole, 1), 4)

    spoken = main.prepare_text_for_speech(text)
    words = main.ARABIC_RE.findall(spoken)
    placeholders = sum(spoken.count(mark) for mark in ("?", "\ufffd"))
    common = len([word for word in words if word in main.COMMON_ARABIC_WORDS])
    assessment = main.assess_text_quality(text, spoken)
    detail = assessment["metrics"]

    report: dict[str, Any] = {
        "characters": len(text),
        "speechCharacters": len(spoken),
        "arabicWords": len(words),
        "placeholderCharacters": placeholders,
        "placeholderRatio": ratio(placeholders, len(spoken)),
        "commonArabicWords": common,
        "commonArabicWordRatio": ratio(common, len(words)),
        "singleArabicWords": int(detail["singleArabicWords"]),
        "singleArabicWordRatio": detail["singleArabicWordRatio"],
        "fragmentLines": int(detail["fragmentLines"]),
        "fragmentLineRatio": detail["fragmentLineRatio"],
        "quality": assessment["quality"],
        "qualityScore": assessment["score"],
        "qualityReasons": assessment["reasons"],
        "preview": text[:180],
        "speechPreview": spoken[:180],
    }
    return report
|
|
|
|
# Render-zoom environment variable for each engine that can be recommended
# directly. The keys double as the whitelist of such engines.
_ZOOM_ENV_BY_ENGINE: dict[str, str] = {
    "easyocr": "EASYOCR_RENDER_ZOOM",
    "qari-ocr": "QARI_OCR_RENDER_ZOOM",
    "tawkeed-ocr": "TAWKEED_OCR_RENDER_ZOOM",
    "katib-ocr": "KATIB_OCR_RENDER_ZOOM",
    "arabic-qwen-ocr": "ARABIC_QWEN_OCR_RENDER_ZOOM",
    "arabic-glm-ocr": "ARABIC_GLM_OCR_RENDER_ZOOM",
    "baseer-ocr": "BASEER_OCR_RENDER_ZOOM",
    "paddleocr": "PADDLEOCR_RENDER_ZOOM",
    "paddleocr-vl": "PADDLEOCR_VL_RENDER_ZOOM",
    "surya": "SURYA_RENDER_ZOOM",
    "tesseract": "OCR_RENDER_ZOOM",
}

# Operator note attached to the recommendation for each engine above.
_NOTE_BY_ENGINE: dict[str, str] = {
    "tesseract": "Confirm Tesseract Arabic data is installed before the full run.",
    "easyocr": "Use the EasyOCR/SILMA sidecar environment for the full run.",
    "qari-ocr": "Use the QARI-OCR Arabic VLM sidecar on a GPU or strong worker; expect much higher RAM/runtime.",
    "tawkeed-ocr": "Use the Tawkeed Arabic OCR sidecar when QARI 4B is too heavy; benchmark it on a short sample first.",
    "katib-ocr": "Use the KATIB Arabic OCR sidecar for a smaller Arabic-trained VLM; benchmark it on a short sample first.",
    "arabic-qwen-ocr": "Use the Arabic-Qwen3.5 OCR sidecar for a 0.9B Arabic-trained VLM; benchmark it on a short sample first.",
    "arabic-glm-ocr": "Use the Arabic-GLM OCR sidecar for a recent Arabic-trained OCR VLM; benchmark it on a short sample first.",
    "baseer-ocr": "Use the Baseer Arabic OCR sidecar for complex Arabic document layouts; benchmark it on a short sample first.",
    "paddleocr": "Use the PaddleOCR sidecar environment for the full run.",
    "paddleocr-vl": "Use the PaddleOCR-VL sidecar on a strong worker; expect much higher RAM/runtime than PaddleOCR.",
    "surya": "Use the Surya heavy-worker sidecar; expect higher RAM/runtime than PaddleOCR.",
}


def recommendation_for_extraction(extraction: str | None) -> dict[str, Any] | None:
    """Turn an extraction label into environment settings for a full run.

    Args:
        extraction: A label matching ``EXTRACTION_RE``
            (``[<mode>:]<engine>[@<zoom>x][-psm<psm>]``), or ``None``.

    Returns:
        A dict with ``summary`` (one sentence), ``env`` (variable name to
        value, in the order they appear in the summary) and ``notes`` (list
        of strings); or ``None`` when the label is empty, does not match the
        pattern, or names an engine this helper does not know.
    """
    if not extraction:
        return None
    match = EXTRACTION_RE.match(extraction)
    if not match:
        return None

    engine = match.group("engine")
    mode = match.group("mode")
    zoom = match.group("zoom")
    psm = match.group("psm")

    if engine == "embedded":
        return {
            "summary": "This PDF has readable embedded text; OCR settings are not needed.",
            "env": {},
            "notes": ["Use the normal upload flow."],
        }

    # The Arabic modes are recommended as a whole; the engine that happened to
    # win inside the mode is not pinned.
    if mode in {"arabic", "arabic-max"}:
        if mode == "arabic-max":
            note = "Use Maximum Arabic OCR for the full run only if the worker can handle the heavier OCR comparison."
        else:
            note = "Use Arabic specialist OCR for the full run; it compares Arabic-trained OCR outputs."
        return {
            "summary": f"For the full book, use OCR_ENGINE={mode}.",
            "env": {"OCR_ENGINE": mode},
            "notes": [note],
        }

    env: dict[str, str]
    notes: list[str]
    if engine == "tesseract-fast":
        # This case used to sit behind the engine whitelist and could never
        # run. Zoom and PSM fall back to "1.5" and "6" when the label omits them.
        env = {
            "OCR_ENGINE": engine,
            "OCR_RENDER_ZOOM": zoom or "1.5",
            "TESSERACT_PSM": psm or "6",
        }
        notes = ["Use this runner-up setting when speed matters and its sample text still sounds correct."]
    elif engine in _ZOOM_ENV_BY_ENGINE:
        env = {"OCR_ENGINE": engine}
        if zoom:
            env[_ZOOM_ENV_BY_ENGINE[engine]] = zoom
        # A page-segmentation mode is only forwarded for Tesseract.
        if psm and engine == "tesseract":
            env["TESSERACT_PSM"] = psm
        notes = [_NOTE_BY_ENGINE[engine]]
    else:
        return None

    # The summary is derived from env so the two can never disagree.
    env_text = " ".join(f"{key}={value}" for key, value in env.items())
    return {
        "summary": f"For the full book, use {env_text}.",
        "env": env,
        "notes": notes,
    }
|
|
|
|
def benchmark_engine(pdf_path: Path, engine: str) -> dict[str, Any]:
    """Run one OCR engine over ``pdf_path`` and report timing and text metrics.

    ``main.OCR_ENGINE`` is set to ``engine`` for the duration of the call and
    always put back afterwards, including when creating the job fails.

    Args:
        pdf_path: PDF to extract text from.
        engine: Engine name. ``"tesseract-fast"`` calls
            ``main.ocr_pdf_text_with_tesseract`` with ``render_zoom=1.5`` and
            ``psm=6``; every other name goes through ``main.extract_pdf_text``.

    Returns:
        On success: ``engine``, ``ok=True``, ``seconds``, ``pages``,
        ``extraction``, the fields of ``text_metrics`` and a
        ``recommendation``. When extraction or metric computation raises an
        ``Exception``: ``engine``, ``ok=False``, ``seconds``, ``pages`` and
        ``error`` (the exception message).
    """
    previous_engine = main.OCR_ENGINE
    main.OCR_ENGINE = engine
    try:
        # Job creation sits inside the try/finally so that a failure here
        # cannot leave the module-level engine override in place.
        job = main.Job(id="dry-run", filename=pdf_path.name, ocr_engine=engine)
        started = time.perf_counter()
        try:
            if engine == "tesseract-fast":
                text = main.ocr_pdf_text_with_tesseract(pdf_path, job, render_zoom=1.5, psm=6)
                job.ocr_engine = engine
            else:
                text = main.extract_pdf_text(pdf_path, job)
            elapsed = round(time.perf_counter() - started, 2)
            result = {
                "engine": engine,
                "ok": True,
                "seconds": elapsed,
                "pages": job.pages,
                "extraction": job.extraction,
                **text_metrics(text),
            }
            if engine == "tesseract-fast":
                result["recommendation"] = {
                    "summary": "For the full book, use OCR_ENGINE=tesseract-fast.",
                    "env": {
                        "OCR_ENGINE": "tesseract-fast",
                        "OCR_RENDER_ZOOM": "1.5",
                        "TESSERACT_PSM": "6",
                    },
                    "notes": ["Use this runner-up setting when speed matters and its sample text still sounds correct."],
                }
            else:
                result["recommendation"] = recommendation_for_extraction(job.extraction)
            return result
        except Exception as exc:
            # One failing engine must not abort the comparison: report the
            # failure as a result row instead of raising.
            elapsed = round(time.perf_counter() - started, 2)
            return {
                "engine": engine,
                "ok": False,
                "seconds": elapsed,
                "pages": job.pages,
                "error": str(exc),
            }
    finally:
        main.OCR_ENGINE = previous_engine
|
|
|
|
def print_table(results: list[dict[str, Any]]) -> None:
    """Print a fixed-width comparison table and the best setting found.

    Args:
        results: One dict per engine. ``engine``, ``ok`` and ``seconds`` are
            required; ``pages``, ``characters``, ``arabicWords``, ``quality``,
            ``qualityScore``, ``extraction`` and ``recommendation`` are
            optional and may be missing or ``None`` (as in failure rows).

    After the table, the recommendation summary of the successful result with
    the highest ``(qualityScore, arabicWords)`` is printed, if it has one.
    """
    # Header cells use the same widths and alignment as the row cells below,
    # so every title sits over its own column.
    print(
        f"{'engine':<13} {'ok':<4} {'sec':>5} {'pages':>5} {'chars':>6} "
        f"{'words':>6} {'quality':>7} {'score':>7} extraction"
    )
    print("------------- ---- ----- ----- ------ ------ ------- ------- ----------")
    for item in results:
        # `or` covers both a missing key and an explicit None, which would
        # otherwise break the numeric format specs.
        pages = item.get("pages") or 0
        characters = item.get("characters") or 0
        words = item.get("arabicWords") or 0
        quality = item.get("quality") or "-"
        score = item.get("qualityScore") or 0
        extraction = item.get("extraction") or "-"
        print(
            f"{item['engine']:<13} "
            f"{str(item['ok']):<4} "
            f"{item['seconds']:>5} "
            f"{pages:>5} "
            f"{characters:>6} "
            f"{words:>6} "
            f"{quality:>7} "
            f"{score:>7} "
            f"{extraction}"
        )

    successful = [item for item in results if item.get("ok")]
    if not successful:
        return
    best = max(
        successful,
        key=lambda item: (item.get("qualityScore") or 0, item.get("arabicWords") or 0),
    )
    recommendation = best.get("recommendation")
    if recommendation:
        print()
        print(f"Best full-book setting from this sample: {recommendation['summary']}")
|
|
|
|
def main_cli() -> None:
    """Command-line entry point: benchmark the chosen engines on one PDF.

    Parses ``pdf``, ``--engines``, ``--page-limit`` and ``--json`` from the
    command line, benchmarks every requested engine on the (possibly
    page-limited) PDF, then prints either the full JSON results or the
    compact table followed by a usage tip.

    Raises:
        FileNotFoundError: The given PDF path does not exist.
        ValueError: ``--page-limit`` is below 1.
    """
    # Switch the standard streams to UTF-8 where they support it, replacing
    # characters that cannot be encoded.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    engine_choices = [
        "arabic",
        "arabic-max",
        "qari-ocr",
        "tawkeed-ocr",
        "katib-ocr",
        "arabic-qwen-ocr",
        "arabic-glm-ocr",
        "baseer-ocr",
        "easyocr",
        "paddleocr",
        "paddleocr-vl",
        "surya",
        "tesseract",
        "tesseract-fast",
        "auto",
        "best",
    ]
    parser = argparse.ArgumentParser(description="Benchmark Arabic OCR engines on the same PDF.")
    parser.add_argument("pdf", type=Path, help="Arabic PDF to benchmark")
    parser.add_argument(
        "--engines",
        nargs="+",
        default=["easyocr", "paddleocr", "tesseract"],
        choices=engine_choices,
    )
    parser.add_argument("--page-limit", type=int, default=1, help="Benchmark only the first N pages by default.")
    parser.add_argument("--json", action="store_true", help="Print full JSON results instead of a compact table.")
    args = parser.parse_args()

    source_pdf = args.pdf
    limit = args.page_limit
    if not source_pdf.exists():
        raise FileNotFoundError(f"PDF not found: {source_pdf}")
    if limit is not None and limit < 1:
        raise ValueError("--page-limit must be 1 or greater.")

    sample_pdf = make_limited_pdf(source_pdf, limit)
    results = []
    try:
        for engine in args.engines:
            results.append(benchmark_engine(sample_pdf, engine))
    finally:
        # Only a page-limited copy is removed; the user's own PDF is never deleted.
        if sample_pdf != source_pdf:
            sample_pdf.unlink(missing_ok=True)

    if args.json:
        print(json.dumps(results, ensure_ascii=False, indent=2))
        return
    print_table(results)
    print()
    print("Tip: use --json to inspect text previews and errors.")
|
|
|
|
# Run the benchmark CLI only when this file is executed as a script, not when
# it is imported.
if __name__ == "__main__":
    main_cli()
|
|