"""Benchmark OCR engines on the same Arabic PDF and suggest full-run settings.

Each requested engine is run against the PDF (by default only its first
page), the extracted text is scored with the helpers in ``app.main``, and the
results are printed either as a compact table or as JSON.
"""

from __future__ import annotations

import argparse
import json
import re
import sys
import tempfile
import time
from pathlib import Path
from typing import Any

import fitz

# Make the repository root importable so ``app`` resolves when this file is
# executed directly as a script.
ROOT_DIR = Path(__file__).resolve().parent.parent
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from app import main  # noqa: E402  (must follow the sys.path tweak above)

# Parses extraction labels shaped like ``[mode:]engine[@<zoom>x][-psm<n>]``,
# e.g. ``tesseract@2.0x-psm6`` or ``arabic:qari-ocr``.
EXTRACTION_RE = re.compile(
    r"^(?:(?P<mode>best|arabic|arabic-max):)?"
    r"(?P<engine>[a-z-]+)"
    r"(?:@(?P<zoom>[0-9.]+)x)?"
    r"(?:-psm(?P<psm>\d+))?$"
)

# Fixed settings used by the ``tesseract-fast`` benchmark preset.
_TESSERACT_FAST_ZOOM = 1.5
_TESSERACT_FAST_PSM = 6
_TESSERACT_FAST_NOTE = (
    "Use this runner-up setting when speed matters and its sample text still sounds correct."
)

# Environment variable that carries the render zoom for each OCR engine.
# The keys double as the set of engines a recommendation can be built for.
_ZOOM_ENV_BY_ENGINE: dict[str, str] = {
    "easyocr": "EASYOCR_RENDER_ZOOM",
    "qari-ocr": "QARI_OCR_RENDER_ZOOM",
    "tawkeed-ocr": "TAWKEED_OCR_RENDER_ZOOM",
    "katib-ocr": "KATIB_OCR_RENDER_ZOOM",
    "arabic-qwen-ocr": "ARABIC_QWEN_OCR_RENDER_ZOOM",
    "arabic-glm-ocr": "ARABIC_GLM_OCR_RENDER_ZOOM",
    "baseer-ocr": "BASEER_OCR_RENDER_ZOOM",
    "paddleocr": "PADDLEOCR_RENDER_ZOOM",
    "paddleocr-vl": "PADDLEOCR_VL_RENDER_ZOOM",
    "surya": "SURYA_RENDER_ZOOM",
    "tesseract": "OCR_RENDER_ZOOM",
}

# Operational note attached to the recommendation for each OCR engine.
_NOTE_BY_ENGINE: dict[str, str] = {
    "tesseract": "Confirm Tesseract Arabic data is installed before the full run.",
    "easyocr": "Use the EasyOCR/SILMA sidecar environment for the full run.",
    "qari-ocr": "Use the QARI-OCR Arabic VLM sidecar on a GPU or strong worker; expect much higher RAM/runtime.",
    "tawkeed-ocr": "Use the Tawkeed Arabic OCR sidecar when QARI 4B is too heavy; benchmark it on a short sample first.",
    "katib-ocr": "Use the KATIB Arabic OCR sidecar for a smaller Arabic-trained VLM; benchmark it on a short sample first.",
    "arabic-qwen-ocr": "Use the Arabic-Qwen3.5 OCR sidecar for a 0.9B Arabic-trained VLM; benchmark it on a short sample first.",
    "arabic-glm-ocr": "Use the Arabic-GLM OCR sidecar for a recent Arabic-trained OCR VLM; benchmark it on a short sample first.",
    "baseer-ocr": "Use the Baseer Arabic OCR sidecar for complex Arabic document layouts; benchmark it on a short sample first.",
    "paddleocr": "Use the PaddleOCR sidecar environment for the full run.",
    "paddleocr-vl": "Use the PaddleOCR-VL sidecar on a strong worker; expect much higher RAM/runtime than PaddleOCR.",
    "surya": "Use the Surya heavy-worker sidecar; expect higher RAM/runtime than PaddleOCR.",
}


def make_limited_pdf(pdf_path: Path, page_limit: int | None) -> Path:
    """Return a PDF holding at most the first ``page_limit`` pages of ``pdf_path``.

    When ``page_limit`` is falsy (``None`` or ``0``) the original path is
    returned untouched. Otherwise a copy is written to the system temporary
    directory and its path is returned; removing that file is the caller's
    responsibility.
    """
    if not page_limit:
        return pdf_path

    limited = Path(tempfile.gettempdir()) / f"{pdf_path.stem}-first-{page_limit}-pages.pdf"
    # Both documents are context-managed so the output handle is released too.
    with fitz.open(pdf_path) as document, fitz.open() as output:
        last_page = min(page_limit, document.page_count) - 1
        output.insert_pdf(document, from_page=0, to_page=last_page)
        output.save(limited)
    return limited


def text_metrics(text: str) -> dict[str, Any]:
    """Summarise extracted ``text`` as a flat dict of size and quality metrics.

    Counts are taken on the speech-prepared form of the text produced by
    ``main.prepare_text_for_speech``; the quality fields are copied from
    ``main.assess_text_quality``.
    """
    speech_text = main.prepare_text_for_speech(text)
    arabic_words = main.ARABIC_RE.findall(speech_text)
    # "?" and U+FFFD are counted as placeholder characters.
    placeholder_count = speech_text.count("?") + speech_text.count("\ufffd")
    common_hits = sum(1 for word in arabic_words if word in main.COMMON_ARABIC_WORDS)
    quality = main.assess_text_quality(text, speech_text)
    quality_metrics = quality["metrics"]

    return {
        "characters": len(text),
        "speechCharacters": len(speech_text),
        "arabicWords": len(arabic_words),
        "placeholderCharacters": placeholder_count,
        # max(..., 1) avoids dividing by zero on empty output.
        "placeholderRatio": round(placeholder_count / max(len(speech_text), 1), 4),
        "commonArabicWords": common_hits,
        "commonArabicWordRatio": round(common_hits / max(len(arabic_words), 1), 4),
        "singleArabicWords": int(quality_metrics["singleArabicWords"]),
        "singleArabicWordRatio": quality_metrics["singleArabicWordRatio"],
        "fragmentLines": int(quality_metrics["fragmentLines"]),
        "fragmentLineRatio": quality_metrics["fragmentLineRatio"],
        "quality": quality["quality"],
        "qualityScore": quality["score"],
        "qualityReasons": quality["reasons"],
        "preview": text[:180],
        "speechPreview": speech_text[:180],
    }


def _env_summary(env: dict[str, str]) -> str:
    """Render ``env`` as the one-line "For the full book, use K=V ..." summary."""
    env_text = " ".join(f"{key}={value}" for key, value in env.items())
    return f"For the full book, use {env_text}."


def recommendation_for_extraction(extraction: str | None) -> dict[str, Any] | None:
    """Translate an extraction label into recommended full-run settings.

    Args:
        extraction: Label matching ``EXTRACTION_RE``, e.g. ``tesseract@2.0x-psm6``.

    Returns:
        A dict with ``summary``, ``env`` and ``notes`` keys, or ``None`` when
        the label is empty, does not match ``EXTRACTION_RE``, or names an
        engine this function has no settings for.
    """
    if not extraction:
        return None
    match = EXTRACTION_RE.match(extraction)
    if match is None:
        return None

    engine = match.group("engine")
    zoom = match.group("zoom")
    psm = match.group("psm")

    if engine == "embedded":
        return {
            "summary": "This PDF has readable embedded text; OCR settings are not needed.",
            "env": {},
            "notes": ["Use the normal upload flow."],
        }

    mode = match.group("mode")
    if mode in {"arabic", "arabic-max"}:
        if mode == "arabic-max":
            note = "Use Maximum Arabic OCR for the full run only if the worker can handle the heavier OCR comparison."
        else:
            note = "Use Arabic specialist OCR for the full run; it compares Arabic-trained OCR outputs."
        return {
            "summary": f"For the full book, use OCR_ENGINE={mode}.",
            "env": {"OCR_ENGINE": mode},
            "notes": [note],
        }

    # Must be handled before the supported-engine lookup below, which does
    # not list "tesseract-fast" and would otherwise reject it.
    if engine == "tesseract-fast":
        fast_env = {
            "OCR_ENGINE": engine,
            "OCR_RENDER_ZOOM": zoom or str(_TESSERACT_FAST_ZOOM),
            "TESSERACT_PSM": psm or str(_TESSERACT_FAST_PSM),
        }
        return {
            "summary": _env_summary(fast_env),
            "env": fast_env,
            "notes": [_TESSERACT_FAST_NOTE],
        }

    zoom_variable = _ZOOM_ENV_BY_ENGINE.get(engine)
    if zoom_variable is None:
        return None

    env: dict[str, str] = {"OCR_ENGINE": engine}
    if zoom:
        env[zoom_variable] = zoom
    # The page segmentation mode is only emitted for Tesseract.
    if psm and engine == "tesseract":
        env["TESSERACT_PSM"] = psm

    return {
        "summary": _env_summary(env),
        "env": env,
        "notes": [_NOTE_BY_ENGINE[engine]],
    }


def benchmark_engine(pdf_path: Path, engine: str) -> dict[str, Any]:
    """Run one OCR engine over ``pdf_path`` and report timing and text metrics.

    ``main.OCR_ENGINE`` is overridden for the duration of the run and always
    restored afterwards. Failures during extraction are captured and returned
    as a result with ``ok`` set to ``False`` and an ``error`` message rather
    than raised.
    """
    previous_engine = main.OCR_ENGINE
    main.OCR_ENGINE = engine
    try:
        job = main.Job(id="dry-run", filename=pdf_path.name, ocr_engine=engine)
        started = time.perf_counter()
        try:
            if engine == "tesseract-fast":
                text = main.ocr_pdf_text_with_tesseract(
                    pdf_path,
                    job,
                    render_zoom=_TESSERACT_FAST_ZOOM,
                    psm=_TESSERACT_FAST_PSM,
                )
                job.ocr_engine = engine
            else:
                text = main.extract_pdf_text(pdf_path, job)
            elapsed = round(time.perf_counter() - started, 2)

            result = {
                "engine": engine,
                "ok": True,
                "seconds": elapsed,
                "pages": job.pages,
                "extraction": job.extraction,
                **text_metrics(text),
            }
            if engine == "tesseract-fast":
                result["recommendation"] = {
                    "summary": "For the full book, use OCR_ENGINE=tesseract-fast.",
                    "env": {
                        "OCR_ENGINE": "tesseract-fast",
                        "OCR_RENDER_ZOOM": str(_TESSERACT_FAST_ZOOM),
                        "TESSERACT_PSM": str(_TESSERACT_FAST_PSM),
                    },
                    "notes": [_TESSERACT_FAST_NOTE],
                }
            else:
                result["recommendation"] = recommendation_for_extraction(job.extraction)
            return result
        except Exception as exc:
            # Deliberately broad: one failing engine must not abort the
            # benchmark of the remaining engines.
            elapsed = round(time.perf_counter() - started, 2)
            return {
                "engine": engine,
                "ok": False,
                "seconds": elapsed,
                "pages": job.pages,
                "error": str(exc),
            }
    finally:
        main.OCR_ENGINE = previous_engine


def print_table(results: list[dict[str, Any]]) -> None:
    """Print ``results`` as a fixed-width table plus the best recommendation.

    The best result is the successful one with the highest
    ``(qualityScore, arabicWords)`` pair; its recommendation summary is
    printed when it has one.
    """
    # The header uses the same column widths as the rows below.
    print(
        f"{'engine':<13} "
        f"{'ok':<4} "
        f"{'sec':>5} "
        f"{'pages':>5} "
        f"{'chars':>6} "
        f"{'words':>6} "
        f"{'quality':>7} "
        f"{'score':>7} "
        "extraction"
    )
    print("------------- ---- ----- ----- ------ ------ ------- ------- ----------")
    for item in results:
        print(
            f"{item['engine']:<13} "
            f"{str(item['ok']):<4} "
            f"{item['seconds']:>5} "
            f"{item.get('pages', 0):>5} "
            f"{item.get('characters', 0):>6} "
            f"{item.get('arabicWords', 0):>6} "
            f"{item.get('quality', '-'):>7} "
            f"{item.get('qualityScore', 0):>7} "
            f"{item.get('extraction', '-')}"
        )

    successful = [item for item in results if item.get("ok")]
    if not successful:
        return

    best = max(
        successful,
        key=lambda item: (item.get("qualityScore", 0), item.get("arabicWords", 0)),
    )
    recommendation = best.get("recommendation")
    if recommendation:
        print()
        print(f"Best full-book setting from this sample: {recommendation['summary']}")


def main_cli() -> None:
    """Parse command-line arguments, benchmark each engine and print results.

    Raises:
        FileNotFoundError: If the given PDF path does not exist.
        ValueError: If ``--page-limit`` is smaller than 1.
    """
    # Force UTF-8 output so Arabic previews print on any console encoding.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    parser = argparse.ArgumentParser(description="Benchmark Arabic OCR engines on the same PDF.")
    parser.add_argument("pdf", type=Path, help="Arabic PDF to benchmark")
    parser.add_argument(
        "--engines",
        nargs="+",
        default=["easyocr", "paddleocr", "tesseract"],
        choices=[
            "arabic",
            "arabic-max",
            "qari-ocr",
            "tawkeed-ocr",
            "katib-ocr",
            "arabic-qwen-ocr",
            "arabic-glm-ocr",
            "baseer-ocr",
            "easyocr",
            "paddleocr",
            "paddleocr-vl",
            "surya",
            "tesseract",
            "tesseract-fast",
            "auto",
            "best",
        ],
    )
    parser.add_argument("--page-limit", type=int, default=1, help="Benchmark only the first N pages by default.")
    parser.add_argument("--json", action="store_true", help="Print full JSON results instead of a compact table.")
    args = parser.parse_args()

    if not args.pdf.exists():
        raise FileNotFoundError(f"PDF not found: {args.pdf}")
    if args.page_limit is not None and args.page_limit < 1:
        raise ValueError("--page-limit must be 1 or greater.")

    benchmark_pdf = make_limited_pdf(args.pdf, args.page_limit)
    try:
        results = [benchmark_engine(benchmark_pdf, engine) for engine in args.engines]
    finally:
        # Only delete the temporary copy, never the user's own PDF.
        if benchmark_pdf != args.pdf:
            benchmark_pdf.unlink(missing_ok=True)

    if args.json:
        print(json.dumps(results, ensure_ascii=False, indent=2))
    else:
        print_table(results)
        print()
        print("Tip: use --json to inspect text previews and errors.")


if __name__ == "__main__":
    main_cli()