Spaces:

Syncre
/

arabic-audio-reader-worker

Running

App Files Files Community

arabic-audio-reader-worker / scripts /benchmark_ocr.py

Syncre

Deploy Arabic Audio Reader worker

985cdbe verified about 9 hours ago

raw

history blame contribute delete

11.6 kB

	from __future__ import annotations

	import argparse
	import json
	import re
	import sys
	import tempfile
	import time
	from pathlib import Path
	from typing import Any

	import fitz

	# Parent of the directory that contains this script.
	ROOT_DIR = Path(__file__).resolve().parent.parent
	# Make ROOT_DIR importable (added once, at the front of sys.path) before the
	# `app` import below; presumably the `app` package lives there - TODO confirm.
	if str(ROOT_DIR) not in sys.path:
	    sys.path.insert(0, str(ROOT_DIR))

	# Must stay after the sys.path adjustment above.
	from app import main


	# Parses extraction labels shaped like "[mode:]engine[@<zoom>x][-psm<N>]",
	# for example "tesseract@2.0x-psm6" or "arabic:qari-ocr".
	# The mode alternatives are separated by real "|" operators: an escaped "\|"
	# would only match a literal pipe character and the prefix could never match.
	EXTRACTION_RE = re.compile(
	    r"^(?:(?P<mode>best|arabic|arabic-max):)?"  # optional "mode:" prefix
	    r"(?P<engine>[a-z-]+)"                      # engine name
	    r"(?:@(?P<zoom>[0-9.]+)x)?"                 # optional "@<zoom>x" render zoom
	    r"(?:-psm(?P<psm>\d+))?$"                   # optional "-psm<N>" suffix
	)


	def make_limited_pdf(pdf_path: Path, page_limit: int | None) -> Path:
	    """Return a PDF containing only the first ``page_limit`` pages of ``pdf_path``.

	    Args:
	        pdf_path: Source PDF.
	        page_limit: Number of leading pages to keep. ``None`` or ``0`` means
	            "no limit" and returns ``pdf_path`` itself without touching the file.

	    Returns:
	        ``pdf_path`` when no limit applies, otherwise the path of a new file
	        named ``<stem>-first-<page_limit>-pages.pdf`` in the system temp
	        directory. This function never deletes that file; cleanup is left to
	        the caller.
	    """
	    if not page_limit:
	        return pdf_path
	    limited = Path(tempfile.gettempdir()) / f"{pdf_path.stem}-first-{page_limit}-pages.pdf"
	    # Both documents are context-managed so their handles are released even
	    # when insert_pdf() or save() raises.
	    with fitz.open(pdf_path) as document, fitz.open() as output:
	        # to_page is inclusive, hence the -1; never ask for more pages than exist.
	        last_page = min(page_limit, document.page_count) - 1
	        output.insert_pdf(document, from_page=0, to_page=last_page)
	        output.save(limited)
	    return limited


	def text_metrics(text: str) -> dict[str, Any]:
	    """Summarise extracted ``text`` as a flat dictionary of benchmark metrics.

	    The text is passed through ``main.prepare_text_for_speech`` and both the
	    raw and the prepared version are measured: lengths, Arabic word counts
	    (via ``main.ARABIC_RE``), placeholder characters ("?" and U+FFFD), hits in
	    ``main.COMMON_ARABIC_WORDS``, and the verdict returned by
	    ``main.assess_text_quality``. The first 180 characters of each version are
	    included as previews.
	    """
	    spoken = main.prepare_text_for_speech(text)
	    words = main.ARABIC_RE.findall(spoken)
	    placeholders = sum(spoken.count(mark) for mark in ("?", "\ufffd"))
	    familiar = len([word for word in words if word in main.COMMON_ARABIC_WORDS])
	    assessment = main.assess_text_quality(text, spoken)
	    detail = assessment["metrics"]

	    # max(..., 1) keeps the ratios defined when the text or word list is empty.
	    placeholder_ratio = round(placeholders / max(len(spoken), 1), 4)
	    familiar_ratio = round(familiar / max(len(words), 1), 4)

	    return {
	        "characters": len(text),
	        "speechCharacters": len(spoken),
	        "arabicWords": len(words),
	        "placeholderCharacters": placeholders,
	        "placeholderRatio": placeholder_ratio,
	        "commonArabicWords": familiar,
	        "commonArabicWordRatio": familiar_ratio,
	        "singleArabicWords": int(detail["singleArabicWords"]),
	        "singleArabicWordRatio": detail["singleArabicWordRatio"],
	        "fragmentLines": int(detail["fragmentLines"]),
	        "fragmentLineRatio": detail["fragmentLineRatio"],
	        "quality": assessment["quality"],
	        "qualityScore": assessment["score"],
	        "qualityReasons": assessment["reasons"],
	        "preview": text[:180],
	        "speechPreview": spoken[:180],
	    }


	# Engines that can be recommended directly, mapped to the environment
	# variable that carries their render zoom.
	_ZOOM_ENV_BY_ENGINE = {
	    "easyocr": "EASYOCR_RENDER_ZOOM",
	    "qari-ocr": "QARI_OCR_RENDER_ZOOM",
	    "tawkeed-ocr": "TAWKEED_OCR_RENDER_ZOOM",
	    "katib-ocr": "KATIB_OCR_RENDER_ZOOM",
	    "arabic-qwen-ocr": "ARABIC_QWEN_OCR_RENDER_ZOOM",
	    "arabic-glm-ocr": "ARABIC_GLM_OCR_RENDER_ZOOM",
	    "baseer-ocr": "BASEER_OCR_RENDER_ZOOM",
	    "paddleocr": "PADDLEOCR_RENDER_ZOOM",
	    "paddleocr-vl": "PADDLEOCR_VL_RENDER_ZOOM",
	    "surya": "SURYA_RENDER_ZOOM",
	    "tesseract": "OCR_RENDER_ZOOM",
	    "tesseract-fast": "OCR_RENDER_ZOOM",
	}

	# One operator-facing note per recommendable engine.
	_NOTE_BY_ENGINE = {
	    "tesseract": "Confirm Tesseract Arabic data is installed before the full run.",
	    "tesseract-fast": "Use this runner-up setting when speed matters and its sample text still sounds correct.",
	    "easyocr": "Use the EasyOCR/SILMA sidecar environment for the full run.",
	    "qari-ocr": "Use the QARI-OCR Arabic VLM sidecar on a GPU or strong worker; expect much higher RAM/runtime.",
	    "tawkeed-ocr": "Use the Tawkeed Arabic OCR sidecar when QARI 4B is too heavy; benchmark it on a short sample first.",
	    "katib-ocr": "Use the KATIB Arabic OCR sidecar for a smaller Arabic-trained VLM; benchmark it on a short sample first.",
	    "arabic-qwen-ocr": "Use the Arabic-Qwen3.5 OCR sidecar for a 0.9B Arabic-trained VLM; benchmark it on a short sample first.",
	    "arabic-glm-ocr": "Use the Arabic-GLM OCR sidecar for a recent Arabic-trained OCR VLM; benchmark it on a short sample first.",
	    "baseer-ocr": "Use the Baseer Arabic OCR sidecar for complex Arabic document layouts; benchmark it on a short sample first.",
	    "paddleocr": "Use the PaddleOCR sidecar environment for the full run.",
	    "paddleocr-vl": "Use the PaddleOCR-VL sidecar on a strong worker; expect much higher RAM/runtime than PaddleOCR.",
	    "surya": "Use the Surya heavy-worker sidecar; expect higher RAM/runtime than PaddleOCR.",
	}


	def recommendation_for_extraction(extraction: str | None) -> dict[str, Any] | None:
	    """Translate an extraction label into full-run environment settings.

	    Args:
	        extraction: Label in the form accepted by ``EXTRACTION_RE``
	            (``[mode:]engine[@<zoom>x][-psm<N>]``), or ``None``.

	    Returns:
	        ``None`` when the label is empty, does not match ``EXTRACTION_RE``, or
	        names an unknown engine. Otherwise a dict with ``summary`` (one-line
	        advice), ``env`` (environment variables to set) and ``notes``
	        (operator hints).
	    """
	    if not extraction:
	        return None
	    match = EXTRACTION_RE.match(extraction)
	    if not match:
	        return None

	    engine = match.group("engine")
	    zoom = match.group("zoom")
	    psm = match.group("psm")

	    if engine == "embedded":
	        return {
	            "summary": "This PDF has readable embedded text; OCR settings are not needed.",
	            "env": {},
	            "notes": ["Use the normal upload flow."],
	        }

	    # The Arabic comparison modes are recommended as a whole, whichever
	    # concrete engine produced the winning text.
	    mode = match.group("mode")
	    if mode in {"arabic", "arabic-max"}:
	        if mode == "arabic-max":
	            note = "Use Maximum Arabic OCR for the full run only if the worker can handle the heavier OCR comparison."
	        else:
	            note = "Use Arabic specialist OCR for the full run; it compares Arabic-trained OCR outputs."
	        return {
	            "summary": f"For the full book, use OCR_ENGINE={mode}.",
	            "env": {"OCR_ENGINE": mode},
	            "notes": [note],
	        }

	    zoom_env = _ZOOM_ENV_BY_ENGINE.get(engine)
	    if zoom_env is None:
	        return None

	    # "tesseract-fast" used to be rejected by the engine whitelist before its
	    # dedicated branch could run; it is now a regular entry whose zoom and
	    # page-segmentation mode fall back to 1.5 and 6 when the label omits them.
	    if engine == "tesseract-fast":
	        zoom = zoom or "1.5"
	        psm = psm or "6"

	    env: dict[str, str] = {"OCR_ENGINE": engine}
	    if zoom:
	        env[zoom_env] = zoom
	    if psm and engine in {"tesseract", "tesseract-fast"}:
	        env["TESSERACT_PSM"] = psm

	    # The summary is built from env so it always reflects the values that
	    # are actually recommended.
	    env_text = " ".join(f"{key}={value}" for key, value in env.items())
	    return {
	        "summary": f"For the full book, use {env_text}.",
	        "env": env,
	        "notes": [_NOTE_BY_ENGINE[engine]],
	    }


	def benchmark_engine(pdf_path: Path, engine: str) -> dict[str, Any]:
	    """Run one OCR engine over ``pdf_path`` and report timing and text metrics.

	    ``main.OCR_ENGINE`` is switched to ``engine`` for the duration of the run
	    and restored afterwards. Any exception raised during extraction or
	    scoring is captured in the result instead of propagating, so one failing
	    engine does not abort the caller's loop over engines.

	    Returns:
	        On success: ``engine``, ``ok=True``, ``seconds``, ``pages``,
	        ``extraction``, every key from ``text_metrics`` and a
	        ``recommendation``. On failure: ``engine``, ``ok=False``, ``seconds``,
	        ``pages`` and ``error``.
	    """
	    saved_engine = main.OCR_ENGINE
	    main.OCR_ENGINE = engine
	    job = main.Job(id="dry-run", filename=pdf_path.name, ocr_engine=engine)
	    fast_tesseract = engine == "tesseract-fast"
	    t0 = time.perf_counter()

	    def elapsed() -> float:
	        return round(time.perf_counter() - t0, 2)

	    try:
	        if fast_tesseract:
	            text = main.ocr_pdf_text_with_tesseract(pdf_path, job, render_zoom=1.5, psm=6)
	            # Re-assert the engine label after the Tesseract call; presumably
	            # the helper may overwrite it - TODO confirm.
	            job.ocr_engine = engine
	        else:
	            text = main.extract_pdf_text(pdf_path, job)

	        outcome = {
	            "engine": engine,
	            "ok": True,
	            "seconds": elapsed(),
	            "pages": job.pages,
	            "extraction": job.extraction,
	            **text_metrics(text),
	        }
	        if not fast_tesseract:
	            outcome["recommendation"] = recommendation_for_extraction(job.extraction)
	            return outcome

	        # The fast preset always runs with fixed settings, so its
	        # recommendation is fixed as well.
	        outcome["recommendation"] = {
	            "summary": "For the full book, use OCR_ENGINE=tesseract-fast.",
	            "env": {
	                "OCR_ENGINE": "tesseract-fast",
	                "OCR_RENDER_ZOOM": "1.5",
	                "TESSERACT_PSM": "6",
	            },
	            "notes": ["Use this runner-up setting when speed matters and its sample text still sounds correct."],
	        }
	        return outcome
	    except Exception as failure:
	        return {
	            "engine": engine,
	            "ok": False,
	            "seconds": elapsed(),
	            "pages": job.pages,
	            "error": str(failure),
	        }
	    finally:
	        main.OCR_ENGINE = saved_engine


	# (header, result key, fallback for a missing/None value, left-aligned?, minimum width)
	_TABLE_COLUMNS = (
	    ("engine", "engine", "-", True, 13),
	    ("ok", "ok", False, True, 5),
	    ("sec", "seconds", 0, False, 5),
	    ("pages", "pages", 0, False, 5),
	    ("chars", "characters", 0, False, 6),
	    ("words", "arabicWords", 0, False, 6),
	    ("quality", "quality", "-", False, 7),
	    ("score", "qualityScore", 0, False, 7),
	)


	def _value_or(item: dict[str, Any], key: str, fallback: Any) -> Any:
	    """Return ``item[key]``, or ``fallback`` when the key is missing or ``None``."""
	    value = item.get(key)
	    return fallback if value is None else value


	def print_table(results: list[dict[str, Any]]) -> None:
	    """Print a compact, column-aligned comparison of benchmark results.

	    One row is printed per result, followed by the recommendation summary of
	    the best successful run (highest ``qualityScore``, ties broken by
	    ``arabicWords``) when that run carries a recommendation.

	    Header, separator and rows share the same column widths. A column grows
	    when a value is wider than its minimum width, so long engine names or
	    ``False`` do not shift the remaining cells. Missing or ``None`` values
	    are shown as ``0`` / ``-`` instead of raising or printing ``None``.
	    """
	    rows = [
	        [str(_value_or(item, key, fallback)) for _, key, fallback, _, _ in _TABLE_COLUMNS]
	        for item in results
	    ]
	    widths = [
	        max([minimum, len(header), *(len(row[index]) for row in rows)])
	        for index, (header, _, _, _, minimum) in enumerate(_TABLE_COLUMNS)
	    ]

	    def render(cells: list[str], last: str) -> str:
	        padded = [
	            cell.ljust(width) if column[3] else cell.rjust(width)
	            for cell, width, column in zip(cells, widths, _TABLE_COLUMNS)
	        ]
	        # The extraction label is the last column and is left unpadded.
	        return " ".join([*padded, last])

	    print(render([column[0] for column in _TABLE_COLUMNS], "extraction"))
	    print(render(["-" * width for width in widths], "-" * len("extraction")))
	    for item, row in zip(results, rows):
	        print(render(row, str(_value_or(item, "extraction", "-"))))

	    successful = [item for item in results if item.get("ok")]
	    if not successful:
	        return
	    best = max(
	        successful,
	        key=lambda item: (_value_or(item, "qualityScore", 0), _value_or(item, "arabicWords", 0)),
	    )
	    recommendation = best.get("recommendation")
	    if recommendation:
	        print()
	        print(f"Best full-book setting from this sample: {recommendation['summary']}")


	def main_cli() -> None:
	    """Parse the command line, benchmark the requested engines and print results.

	    Raises:
	        FileNotFoundError: If the given PDF path does not exist.
	        ValueError: If ``--page-limit`` is smaller than 1.
	    """
	    # Switch both output streams to UTF-8 with replacement so characters the
	    # console cannot encode are substituted instead of raising.
	    for stream in (sys.stdout, sys.stderr):
	        if hasattr(stream, "reconfigure"):
	            stream.reconfigure(encoding="utf-8", errors="replace")

	    engine_choices = [
	        "arabic",
	        "arabic-max",
	        "qari-ocr",
	        "tawkeed-ocr",
	        "katib-ocr",
	        "arabic-qwen-ocr",
	        "arabic-glm-ocr",
	        "baseer-ocr",
	        "easyocr",
	        "paddleocr",
	        "paddleocr-vl",
	        "surya",
	        "tesseract",
	        "tesseract-fast",
	        "auto",
	        "best",
	    ]
	    parser = argparse.ArgumentParser(description="Benchmark Arabic OCR engines on the same PDF.")
	    parser.add_argument("pdf", type=Path, help="Arabic PDF to benchmark")
	    parser.add_argument(
	        "--engines",
	        nargs="+",
	        default=["easyocr", "paddleocr", "tesseract"],
	        choices=engine_choices,
	    )
	    parser.add_argument("--page-limit", type=int, default=1, help="Benchmark only the first N pages by default.")
	    parser.add_argument("--json", action="store_true", help="Print full JSON results instead of a compact table.")
	    args = parser.parse_args()

	    source_pdf = args.pdf
	    limit = args.page_limit
	    if not source_pdf.exists():
	        raise FileNotFoundError(f"PDF not found: {source_pdf}")
	    if limit is not None and limit < 1:
	        raise ValueError("--page-limit must be 1 or greater.")

	    benchmark_pdf = make_limited_pdf(source_pdf, limit)
	    results = []
	    try:
	        for engine in args.engines:
	            results.append(benchmark_engine(benchmark_pdf, engine))
	    finally:
	        # Only remove the file when a separate, trimmed copy was created.
	        if benchmark_pdf != source_pdf:
	            benchmark_pdf.unlink(missing_ok=True)

	    if args.json:
	        print(json.dumps(results, ensure_ascii=False, indent=2))
	        return

	    print_table(results)
	    print()
	    print("Tip: use --json to inspect text previews and errors.")


	# Run the benchmark CLI only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main_cli()