# arabic-audio-reader-worker / scripts/benchmark_ocr.py
# Repository page residue: uploaded by Syncre, commit 985cdbe (verified),
# "Deploy Arabic Audio Reader worker".
from __future__ import annotations
import argparse
import json
import re
import sys
import tempfile
import time
from pathlib import Path
from typing import Any
import fitz
# Project root is two directory levels above this script. It is put at the
# front of sys.path (once) so that the ``app`` package can be imported when
# the script is run directly.
ROOT_DIR = Path(__file__).resolve().parents[1]
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))
from app import main
# Parses an extraction label into its parts:
#   [<mode>:]<engine>[@<zoom>x][-psm<psm>]
# where <mode> is one of best / arabic / arabic-max, <engine> is made of
# lowercase letters and hyphens, <zoom> is digits and dots, and <psm> is
# digits. Example: "best:tesseract@2x-psm6".
EXTRACTION_RE = re.compile(r"^(?:(?P<mode>best|arabic|arabic-max):)?(?P<engine>[a-z-]+)(?:@(?P<zoom>[0-9.]+)x)?(?:-psm(?P<psm>\d+))?$")
def make_limited_pdf(pdf_path: Path, page_limit: int | None) -> Path:
    """Write the first ``page_limit`` pages of ``pdf_path`` to a temporary PDF.

    Args:
        pdf_path: Source PDF.
        page_limit: Number of leading pages to keep. A falsy value (``None``
            or ``0``) disables trimming.

    Returns:
        ``pdf_path`` itself when no limit is requested; otherwise the path of
        a file in the system temp directory named
        ``<stem>-first-<page_limit>-pages.pdf``. The caller is responsible
        for deleting that file.
    """
    if not page_limit:
        return pdf_path
    limited = Path(tempfile.gettempdir()) / f"{pdf_path.stem}-first-{page_limit}-pages.pdf"
    # Both documents are context-managed so the in-memory output document is
    # closed too, even when insert_pdf() or save() raises.
    with fitz.open(pdf_path) as document, fitz.open() as output:
        # Page indexes are zero-based, so the last page to copy is count - 1;
        # the count is clamped to what the source actually contains.
        last_page = min(page_limit, document.page_count) - 1
        output.insert_pdf(document, from_page=0, to_page=last_page)
        output.save(limited)
    return limited
def text_metrics(text: str) -> dict[str, Any]:
    """Summarise extracted text as a flat dictionary of counts and ratios.

    The text is first passed through ``main.prepare_text_for_speech``; word
    and placeholder statistics are taken from that speech version, while the
    quality verdict comes from ``main.assess_text_quality``.

    Args:
        text: Raw text produced by an extraction run.

    Returns:
        A dictionary with character counts, Arabic word counts, placeholder
        and common-word ratios (rounded to 4 places), the quality
        assessment fields, and 180-character previews of both text forms.
    """
    spoken = main.prepare_text_for_speech(text)
    words = main.ARABIC_RE.findall(spoken)
    # "?" and U+FFFD are counted together as placeholder characters.
    placeholders = sum(spoken.count(mark) for mark in ("?", "\ufffd"))
    common = len([word for word in words if word in main.COMMON_ARABIC_WORDS])
    assessment = main.assess_text_quality(text, spoken)
    detail = assessment["metrics"]

    # ``or 1`` keeps the divisions safe when nothing was extracted.
    spoken_length = len(spoken) or 1
    word_total = len(words) or 1

    summary: dict[str, Any] = {}
    summary["characters"] = len(text)
    summary["speechCharacters"] = len(spoken)
    summary["arabicWords"] = len(words)
    summary["placeholderCharacters"] = placeholders
    summary["placeholderRatio"] = round(placeholders / spoken_length, 4)
    summary["commonArabicWords"] = common
    summary["commonArabicWordRatio"] = round(common / word_total, 4)
    summary["singleArabicWords"] = int(detail["singleArabicWords"])
    summary["singleArabicWordRatio"] = detail["singleArabicWordRatio"]
    summary["fragmentLines"] = int(detail["fragmentLines"])
    summary["fragmentLineRatio"] = detail["fragmentLineRatio"]
    summary["quality"] = assessment["quality"]
    summary["qualityScore"] = assessment["score"]
    summary["qualityReasons"] = assessment["reasons"]
    summary["preview"] = text[:180]
    summary["speechPreview"] = spoken[:180]
    return summary
# Environment variable that carries the render zoom for each engine that can
# be recommended directly. Its keys double as the list of supported engines.
_ZOOM_ENV_BY_ENGINE = {
    "easyocr": "EASYOCR_RENDER_ZOOM",
    "qari-ocr": "QARI_OCR_RENDER_ZOOM",
    "tawkeed-ocr": "TAWKEED_OCR_RENDER_ZOOM",
    "katib-ocr": "KATIB_OCR_RENDER_ZOOM",
    "arabic-qwen-ocr": "ARABIC_QWEN_OCR_RENDER_ZOOM",
    "arabic-glm-ocr": "ARABIC_GLM_OCR_RENDER_ZOOM",
    "baseer-ocr": "BASEER_OCR_RENDER_ZOOM",
    "paddleocr": "PADDLEOCR_RENDER_ZOOM",
    "paddleocr-vl": "PADDLEOCR_VL_RENDER_ZOOM",
    "surya": "SURYA_RENDER_ZOOM",
    "tesseract": "OCR_RENDER_ZOOM",
}

# Operator note attached to the recommendation for each supported engine.
_NOTE_BY_ENGINE = {
    "tesseract": "Confirm Tesseract Arabic data is installed before the full run.",
    "easyocr": "Use the EasyOCR/SILMA sidecar environment for the full run.",
    "qari-ocr": "Use the QARI-OCR Arabic VLM sidecar on a GPU or strong worker; expect much higher RAM/runtime.",
    "tawkeed-ocr": "Use the Tawkeed Arabic OCR sidecar when QARI 4B is too heavy; benchmark it on a short sample first.",
    "katib-ocr": "Use the KATIB Arabic OCR sidecar for a smaller Arabic-trained VLM; benchmark it on a short sample first.",
    "arabic-qwen-ocr": "Use the Arabic-Qwen3.5 OCR sidecar for a 0.9B Arabic-trained VLM; benchmark it on a short sample first.",
    "arabic-glm-ocr": "Use the Arabic-GLM OCR sidecar for a recent Arabic-trained OCR VLM; benchmark it on a short sample first.",
    "baseer-ocr": "Use the Baseer Arabic OCR sidecar for complex Arabic document layouts; benchmark it on a short sample first.",
    "paddleocr": "Use the PaddleOCR sidecar environment for the full run.",
    "paddleocr-vl": "Use the PaddleOCR-VL sidecar on a strong worker; expect much higher RAM/runtime than PaddleOCR.",
    "surya": "Use the Surya heavy-worker sidecar; expect higher RAM/runtime than PaddleOCR.",
}

# Notes for the composite Arabic modes, which are recommended by mode name.
_NOTE_BY_ARABIC_MODE = {
    "arabic": "Use Arabic specialist OCR for the full run; it compares Arabic-trained OCR outputs.",
    "arabic-max": "Use Maximum Arabic OCR for the full run only if the worker can handle the heavier OCR comparison.",
}

_TESSERACT_FAST_NOTE = "Use this runner-up setting when speed matters and its sample text still sounds correct."


def recommendation_for_extraction(extraction: str | None) -> dict[str, Any] | None:
    """Turn an extraction label into full-run environment settings.

    Args:
        extraction: Label in the form matched by ``EXTRACTION_RE``
            (``[mode:]engine[@<zoom>x][-psm<n>]``), or ``None``.

    Returns:
        A dictionary with ``summary`` (one-line text), ``env`` (environment
        variable name -> value) and ``notes`` (list of operator hints), or
        ``None`` when the label is empty, does not match the pattern, or
        names an engine this function has no recommendation for.
    """
    if not extraction:
        return None
    match = EXTRACTION_RE.match(extraction)
    if not match:
        return None
    mode, engine, zoom, psm = match.group("mode", "engine", "zoom", "psm")

    if engine == "embedded":
        return {
            "summary": "This PDF has readable embedded text; OCR settings are not needed.",
            "env": {},
            "notes": ["Use the normal upload flow."],
        }

    if mode in _NOTE_BY_ARABIC_MODE:
        # The composite modes are selected by mode name; the engine that
        # happened to win inside the mode is not pinned.
        return {
            "summary": f"For the full book, use OCR_ENGINE={mode}.",
            "env": {"OCR_ENGINE": mode},
            "notes": [_NOTE_BY_ARABIC_MODE[mode]],
        }

    if engine == "tesseract-fast":
        # Handled before the supported-engine lookup: previously that lookup
        # rejected "tesseract-fast" first, so this branch could never run.
        env = {
            "OCR_ENGINE": engine,
            "OCR_RENDER_ZOOM": zoom or "1.5",
            "TESSERACT_PSM": psm or "6",
        }
        notes = [_TESSERACT_FAST_NOTE]
    elif engine in _ZOOM_ENV_BY_ENGINE:
        env = {"OCR_ENGINE": engine}
        if zoom:
            env[_ZOOM_ENV_BY_ENGINE[engine]] = zoom
        if psm and engine == "tesseract":
            env["TESSERACT_PSM"] = psm
        notes = [_NOTE_BY_ENGINE[engine]]
    else:
        return None

    # The summary is derived from env so it always states the values that are
    # actually recommended (dict order: engine, zoom, psm).
    env_text = " ".join(f"{key}={value}" for key, value in env.items())
    return {
        "summary": f"For the full book, use {env_text}.",
        "env": env,
        "notes": notes,
    }
def benchmark_engine(pdf_path: Path, engine: str) -> dict[str, Any]:
    """Run one OCR engine over ``pdf_path`` and report timing and metrics.

    ``main.OCR_ENGINE`` is switched to ``engine`` for the duration of the run
    and restored afterwards. Any exception raised by the extraction is
    captured and reported in the result instead of propagating, so one
    failing engine does not abort a multi-engine benchmark.

    Args:
        pdf_path: PDF to process.
        engine: Engine name; ``"tesseract-fast"`` runs Tesseract directly
            with render zoom 1.5 and PSM 6.

    Returns:
        On success: engine, ``ok=True``, elapsed seconds, page count,
        extraction label, the ``text_metrics`` fields and a recommendation.
        On failure: engine, ``ok=False``, elapsed seconds, page count and the
        error message.
    """
    use_fast_tesseract = engine == "tesseract-fast"
    saved_engine = main.OCR_ENGINE
    main.OCR_ENGINE = engine
    job = main.Job(id="dry-run", filename=pdf_path.name, ocr_engine=engine)
    t0 = time.perf_counter()
    try:
        if use_fast_tesseract:
            text = main.ocr_pdf_text_with_tesseract(pdf_path, job, render_zoom=1.5, psm=6)
            job.ocr_engine = engine
        else:
            text = main.extract_pdf_text(pdf_path, job)

        # Stop the clock before computing metrics so they are not timed.
        report: dict[str, Any] = {
            "engine": engine,
            "ok": True,
            "seconds": round(time.perf_counter() - t0, 2),
            "pages": job.pages,
            "extraction": job.extraction,
        }
        report.update(text_metrics(text))

        if use_fast_tesseract:
            recommendation = {
                "summary": "For the full book, use OCR_ENGINE=tesseract-fast.",
                "env": {
                    "OCR_ENGINE": "tesseract-fast",
                    "OCR_RENDER_ZOOM": "1.5",
                    "TESSERACT_PSM": "6",
                },
                "notes": ["Use this runner-up setting when speed matters and its sample text still sounds correct."],
            }
        else:
            recommendation = recommendation_for_extraction(job.extraction)
        report["recommendation"] = recommendation
        return report
    except Exception as exc:
        # Benchmark boundary: report the failure as a result row.
        return {
            "engine": engine,
            "ok": False,
            "seconds": round(time.perf_counter() - t0, 2),
            "pages": job.pages,
            "error": str(exc),
        }
    finally:
        main.OCR_ENGINE = saved_engine
# (header label, alignment, width) for every fixed-width column. The header,
# the rule and the data rows are all rendered from this one table so they
# cannot drift apart. The free-width "extraction" column is appended last.
_TABLE_COLUMNS = (
    ("engine", "<", 13),
    ("ok", "<", 5),  # 5 wide so that "False" fits without shifting the row
    ("sec", ">", 5),
    ("pages", ">", 5),
    ("chars", ">", 6),
    ("words", ">", 6),
    ("quality", ">", 7),
    ("score", ">", 7),
)


def _table_line(cells, trailer) -> str:
    """Render one table line: padded fixed-width cells, then the trailer."""
    padded = [
        f"{str(cell):{align}{width}}"
        for cell, (_, align, width) in zip(cells, _TABLE_COLUMNS)
    ]
    return " ".join(padded) + f" {trailer}"


def _cell(item, key, default):
    """Return ``item[key]``, or ``default`` when the key is missing or None."""
    value = item.get(key)
    return default if value is None else value


def print_table(results: list[dict[str, Any]]) -> None:
    """Print a compact comparison table and the best recommendation.

    One row is printed per result. Failed runs lack the metric keys and are
    shown with ``0`` / ``-`` placeholders. After the table, the successful
    result with the highest ``(qualityScore, arabicWords)`` is selected and
    its recommendation summary, if any, is printed.

    Args:
        results: Result dictionaries as returned by ``benchmark_engine``;
            each must contain ``engine``, ``ok`` and ``seconds``.
    """
    print(_table_line([label for label, _, _ in _TABLE_COLUMNS], "extraction"))
    print(_table_line(["-" * width for _, _, width in _TABLE_COLUMNS], "-" * 10))
    for item in results:
        cells = (
            item["engine"],
            item["ok"],
            item["seconds"],
            _cell(item, "pages", 0),
            _cell(item, "characters", 0),
            _cell(item, "arabicWords", 0),
            _cell(item, "quality", "-"),
            _cell(item, "qualityScore", 0),
        )
        print(_table_line(cells, _cell(item, "extraction", "-")))

    successful = [item for item in results if item.get("ok")]
    if not successful:
        return
    best = max(successful, key=lambda item: (item.get("qualityScore", 0), item.get("arabicWords", 0)))
    recommendation = best.get("recommendation")
    if recommendation:
        print()
        print(f"Best full-book setting from this sample: {recommendation['summary']}")
def main_cli() -> None:
    """Command-line entry point: benchmark the chosen engines on one PDF.

    Parses the arguments, optionally trims the PDF to its first pages, runs
    every requested engine on the same file, removes the trimmed copy, and
    prints either the full JSON results or a compact table.

    Raises:
        FileNotFoundError: The PDF path does not exist.
        ValueError: ``--page-limit`` is lower than 1.
    """
    # Force UTF-8 output where the stream supports it, so Arabic previews
    # can be printed regardless of the console's default encoding.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    engine_choices = [
        "arabic",
        "arabic-max",
        "qari-ocr",
        "tawkeed-ocr",
        "katib-ocr",
        "arabic-qwen-ocr",
        "arabic-glm-ocr",
        "baseer-ocr",
        "easyocr",
        "paddleocr",
        "paddleocr-vl",
        "surya",
        "tesseract",
        "tesseract-fast",
        "auto",
        "best",
    ]
    parser = argparse.ArgumentParser(description="Benchmark Arabic OCR engines on the same PDF.")
    parser.add_argument("pdf", type=Path, help="Arabic PDF to benchmark")
    parser.add_argument(
        "--engines",
        nargs="+",
        default=["easyocr", "paddleocr", "tesseract"],
        choices=engine_choices,
    )
    parser.add_argument("--page-limit", type=int, default=1, help="Benchmark only the first N pages by default.")
    parser.add_argument("--json", action="store_true", help="Print full JSON results instead of a compact table.")
    args = parser.parse_args()

    source_pdf = args.pdf
    limit = args.page_limit
    if not source_pdf.exists():
        raise FileNotFoundError(f"PDF not found: {source_pdf}")
    if limit is not None and limit < 1:
        raise ValueError("--page-limit must be 1 or greater.")

    sample_pdf = make_limited_pdf(source_pdf, limit)
    results = []
    try:
        for engine in args.engines:
            results.append(benchmark_engine(sample_pdf, engine))
    finally:
        # Only delete the file when it is a trimmed copy, never the input.
        if sample_pdf != source_pdf:
            sample_pdf.unlink(missing_ok=True)

    if args.json:
        print(json.dumps(results, ensure_ascii=False, indent=2))
        return
    print_table(results)
    print()
    print("Tip: use --json to inspect text previews and errors.")


if __name__ == "__main__":
    main_cli()