Spaces:

Syncre
/

arabic-audio-reader-worker

Running

File size: 11,610 Bytes

from __future__ import annotations

import argparse
import json
import re
import sys
import tempfile
import time
from pathlib import Path
from typing import Any

import fitz

# Parent of the directory that contains this file. It is pushed to the front
# of sys.path so the `from app import main` below can be resolved when this
# file is run directly (presumably the `app` package lives there — confirm).
ROOT_DIR = Path(__file__).resolve().parent.parent
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from app import main


# Grammar of an extraction label, matched against the whole string:
#   [<mode>:]<engine>[@<zoom>x][-psm<digits>]
# where <mode> is one of best / arabic / arabic-max, <engine> is lowercase
# letters and hyphens, and <zoom> is made of digits and dots.
EXTRACTION_RE = re.compile(r"^(?:(?P<mode>best|arabic|arabic-max):)?(?P<engine>[a-z-]+)(?:@(?P<zoom>[0-9.]+)x)?(?:-psm(?P<psm>\d+))?$")


def make_limited_pdf(pdf_path: Path, page_limit: int | None) -> Path:
    """Copy the first ``page_limit`` pages of ``pdf_path`` into a temporary PDF.

    Args:
        pdf_path: Source PDF.
        page_limit: Number of leading pages to keep. A falsy value (``None``
            or ``0``) disables limiting.

    Returns:
        ``pdf_path`` itself when no limit applies; otherwise the path of a new
        file in the system temp directory named
        ``<stem>-first-<page_limit>-pages.pdf``. The caller owns that file.
    """
    if not page_limit:
        return pdf_path
    limited = Path(tempfile.gettempdir()) / f"{pdf_path.stem}-first-{page_limit}-pages.pdf"
    # Both documents are context-managed so the in-memory output document is
    # closed too, including when insert_pdf() or save() raises.
    with fitz.open(pdf_path) as document, fitz.open() as output:
        # Clamp to the real page count; to_page is an inclusive 0-based index.
        last_page = min(page_limit, document.page_count) - 1
        output.insert_pdf(document, from_page=0, to_page=last_page)
        output.save(limited)
    return limited


def text_metrics(text: str) -> dict[str, Any]:
    """Build the per-engine metrics record for a piece of extracted text.

    The text is first passed through ``main.prepare_text_for_speech``; Arabic
    words are then collected with ``main.ARABIC_RE`` and the quality verdict
    comes from ``main.assess_text_quality``.

    Args:
        text: Raw text produced by an extraction run.

    Returns:
        A dict of counts, ratios (rounded to 4 decimals), the quality
        verdict, and 180-character previews of the raw and speech text.
    """

    def ratio(part: int, whole: int) -> float:
        # Guard the denominator so empty input yields 0.0 instead of raising.
        return round(part / max(whole, 1), 4)

    spoken = main.prepare_text_for_speech(text)
    words = main.ARABIC_RE.findall(spoken)
    # "?" and U+FFFD are counted as placeholder characters.
    placeholders = sum(spoken.count(mark) for mark in ("?", "\ufffd"))
    known_words = len([word for word in words if word in main.COMMON_ARABIC_WORDS])
    assessment = main.assess_text_quality(text, spoken)
    detail = assessment["metrics"]

    report: dict[str, Any] = {}
    report["characters"] = len(text)
    report["speechCharacters"] = len(spoken)
    report["arabicWords"] = len(words)
    report["placeholderCharacters"] = placeholders
    report["placeholderRatio"] = ratio(placeholders, len(spoken))
    report["commonArabicWords"] = known_words
    report["commonArabicWordRatio"] = ratio(known_words, len(words))
    report["singleArabicWords"] = int(detail["singleArabicWords"])
    report["singleArabicWordRatio"] = detail["singleArabicWordRatio"]
    report["fragmentLines"] = int(detail["fragmentLines"])
    report["fragmentLineRatio"] = detail["fragmentLineRatio"]
    report["quality"] = assessment["quality"]
    report["qualityScore"] = assessment["score"]
    report["qualityReasons"] = assessment["reasons"]
    report["preview"] = text[:180]
    report["speechPreview"] = spoken[:180]
    return report


# Per-engine settings: engine name -> (env var that carries the render zoom,
# note shown with the recommendation). Only engines listed here (plus the
# special cases handled in the function) produce a recommendation.
_ENGINE_PROFILES: dict[str, tuple[str, str]] = {
    "easyocr": (
        "EASYOCR_RENDER_ZOOM",
        "Use the EasyOCR/SILMA sidecar environment for the full run.",
    ),
    "qari-ocr": (
        "QARI_OCR_RENDER_ZOOM",
        "Use the QARI-OCR Arabic VLM sidecar on a GPU or strong worker; expect much higher RAM/runtime.",
    ),
    "tawkeed-ocr": (
        "TAWKEED_OCR_RENDER_ZOOM",
        "Use the Tawkeed Arabic OCR sidecar when QARI 4B is too heavy; benchmark it on a short sample first.",
    ),
    "katib-ocr": (
        "KATIB_OCR_RENDER_ZOOM",
        "Use the KATIB Arabic OCR sidecar for a smaller Arabic-trained VLM; benchmark it on a short sample first.",
    ),
    "arabic-qwen-ocr": (
        "ARABIC_QWEN_OCR_RENDER_ZOOM",
        "Use the Arabic-Qwen3.5 OCR sidecar for a 0.9B Arabic-trained VLM; benchmark it on a short sample first.",
    ),
    "arabic-glm-ocr": (
        "ARABIC_GLM_OCR_RENDER_ZOOM",
        "Use the Arabic-GLM OCR sidecar for a recent Arabic-trained OCR VLM; benchmark it on a short sample first.",
    ),
    "baseer-ocr": (
        "BASEER_OCR_RENDER_ZOOM",
        "Use the Baseer Arabic OCR sidecar for complex Arabic document layouts; benchmark it on a short sample first.",
    ),
    "paddleocr": (
        "PADDLEOCR_RENDER_ZOOM",
        "Use the PaddleOCR sidecar environment for the full run.",
    ),
    "paddleocr-vl": (
        "PADDLEOCR_VL_RENDER_ZOOM",
        "Use the PaddleOCR-VL sidecar on a strong worker; expect much higher RAM/runtime than PaddleOCR.",
    ),
    "surya": (
        "SURYA_RENDER_ZOOM",
        "Use the Surya heavy-worker sidecar; expect higher RAM/runtime than PaddleOCR.",
    ),
    "tesseract": (
        "OCR_RENDER_ZOOM",
        "Confirm Tesseract Arabic data is installed before the full run.",
    ),
}


def recommendation_for_extraction(extraction: str | None) -> dict[str, Any] | None:
    """Turn an extraction label into recommended settings for a full run.

    The label is parsed with ``EXTRACTION_RE``: an optional ``mode:`` prefix,
    an engine name, an optional ``@<zoom>x`` and an optional ``-psm<n>``.

    Args:
        extraction: Label such as ``"tesseract@2.0x-psm6"``; may be ``None``.

    Returns:
        A dict with ``summary`` (str), ``env`` (env var name -> value) and
        ``notes`` (list of str), or ``None`` when the label is empty, does
        not match the grammar, or names an engine with no known settings.
    """
    if not extraction:
        return None
    match = EXTRACTION_RE.match(extraction)
    if not match:
        return None

    engine = match.group("engine")
    zoom = match.group("zoom")
    psm = match.group("psm")

    if engine == "embedded":
        return {
            "summary": "This PDF has readable embedded text; OCR settings are not needed.",
            "env": {},
            "notes": ["Use the normal upload flow."],
        }

    # An arabic / arabic-max prefix wins over whichever engine produced the text.
    mode = match.group("mode")
    if mode in {"arabic", "arabic-max"}:
        if mode == "arabic-max":
            mode_note = "Use Maximum Arabic OCR for the full run only if the worker can handle the heavier OCR comparison."
        else:
            mode_note = "Use Arabic specialist OCR for the full run; it compares Arabic-trained OCR outputs."
        return {
            "summary": f"For the full book, use OCR_ENGINE={mode}.",
            "env": {"OCR_ENGINE": mode},
            "notes": [mode_note],
        }

    # Must be checked before the table lookup: "tesseract-fast" is not in the
    # table, so testing it afterwards (as before) could never succeed.
    if engine == "tesseract-fast":
        env = {
            "OCR_ENGINE": engine,
            "OCR_RENDER_ZOOM": zoom or "1.5",
            "TESSERACT_PSM": psm or "6",
        }
        # Built from env so the summary can never disagree with the settings.
        env_text = " ".join(f"{key}={value}" for key, value in env.items())
        return {
            "summary": f"For the full book, use {env_text}.",
            "env": env,
            "notes": ["Use this runner-up setting when speed matters and its sample text still sounds correct."],
        }

    profile = _ENGINE_PROFILES.get(engine)
    if profile is None:
        return None
    zoom_variable, engine_note = profile

    env = {"OCR_ENGINE": engine}
    if zoom:
        env[zoom_variable] = zoom
    if psm and engine == "tesseract":
        env["TESSERACT_PSM"] = psm

    env_text = " ".join(f"{key}={value}" for key, value in env.items())
    return {
        "summary": f"For the full book, use {env_text}.",
        "env": env,
        "notes": [engine_note],
    }


def benchmark_engine(pdf_path: Path, engine: str) -> dict[str, Any]:
    """Run one OCR engine over ``pdf_path`` and report timing and text metrics.

    ``main.OCR_ENGINE`` is set to ``engine`` for the duration of the run and
    is restored on every exit path, including a failure while building the
    job object.

    Args:
        pdf_path: PDF to extract text from.
        engine: Engine name; ``"tesseract-fast"`` is run through
            ``main.ocr_pdf_text_with_tesseract`` with zoom 1.5 and PSM 6,
            anything else through ``main.extract_pdf_text``.

    Returns:
        On success, a dict with ``engine``, ``ok=True``, ``seconds``,
        ``pages``, ``extraction``, the ``text_metrics`` fields and a
        ``recommendation``. If extraction or metrics raise, a dict with
        ``ok=False`` and the exception text under ``error``.
    """
    previous_engine = main.OCR_ENGINE
    main.OCR_ENGINE = engine
    try:
        # Built inside the guarded region so the global override is undone
        # even when Job construction itself raises.
        job = main.Job(id="dry-run", filename=pdf_path.name, ocr_engine=engine)
        started = time.perf_counter()
        try:
            if engine == "tesseract-fast":
                text = main.ocr_pdf_text_with_tesseract(pdf_path, job, render_zoom=1.5, psm=6)
                # Presumably the helper may relabel the job; pin it back — confirm.
                job.ocr_engine = engine
            else:
                text = main.extract_pdf_text(pdf_path, job)
            # Timing covers extraction only, not the metrics computed below.
            elapsed = round(time.perf_counter() - started, 2)
            result = {
                "engine": engine,
                "ok": True,
                "seconds": elapsed,
                "pages": job.pages,
                "extraction": job.extraction,
                **text_metrics(text),
            }
            if engine == "tesseract-fast":
                result["recommendation"] = {
                    "summary": "For the full book, use OCR_ENGINE=tesseract-fast.",
                    "env": {
                        "OCR_ENGINE": "tesseract-fast",
                        "OCR_RENDER_ZOOM": "1.5",
                        "TESSERACT_PSM": "6",
                    },
                    "notes": ["Use this runner-up setting when speed matters and its sample text still sounds correct."],
                }
            else:
                result["recommendation"] = recommendation_for_extraction(job.extraction)
            return result
        except Exception as exc:
            # Deliberate best-effort: one failing engine must not abort the
            # comparison, so the failure is reported as a result row.
            elapsed = round(time.perf_counter() - started, 2)
            return {
                "engine": engine,
                "ok": False,
                "seconds": elapsed,
                "pages": job.pages,
                "error": str(exc),
            }
    finally:
        main.OCR_ENGINE = previous_engine


def print_table(results: list[dict[str, Any]]) -> None:
    """Print a fixed-width comparison table followed by the best recommendation.

    The engine column is at least 13 characters wide and grows to fit the
    longest engine name, so the columns stay aligned for long names. A value
    of ``None`` is rendered like a missing key (``0`` or ``-``) instead of
    raising on the width format spec.

    Args:
        results: Result dicts carrying at least ``engine``, ``ok`` and
            ``seconds``; every other column is optional.
    """

    def field(item: dict[str, Any], key: str, fallback: Any) -> Any:
        """Return ``item[key]``, treating a missing key and ``None`` alike."""
        value = item.get(key)
        return fallback if value is None else value

    # 13 keeps the layout unchanged for short names; longer names widen it.
    engine_width = max([13, *(len(str(item["engine"])) for item in results)])

    print(f"{'engine':<{engine_width}}  ok    sec    pages  chars   words   quality  score    extraction")
    print(f"{'-' * engine_width}  ----  -----  -----  ------  ------  -------  -------  ----------")
    for item in results:
        print(
            f"{item['engine']:<{engine_width}}  "
            f"{str(item['ok']):<4}  "
            f"{item['seconds']:>5}  "
            f"{field(item, 'pages', 0):>5}  "
            f"{field(item, 'characters', 0):>6}  "
            f"{field(item, 'arabicWords', 0):>6}  "
            f"{field(item, 'quality', '-'):>7}  "
            f"{field(item, 'qualityScore', 0):>7}  "
            f"{field(item, 'extraction', '-')}"
        )

    successful = [item for item in results if item.get("ok")]
    if not successful:
        return
    # Highest quality score wins; the Arabic word count breaks ties.
    best = max(
        successful,
        key=lambda item: (field(item, "qualityScore", 0), field(item, "arabicWords", 0)),
    )
    recommendation = best.get("recommendation")
    if recommendation:
        print()
        print(f"Best full-book setting from this sample: {recommendation['summary']}")


def main_cli() -> None:
    """Command-line entry point: benchmark the selected OCR engines on one PDF.

    Parses the arguments, optionally trims the PDF to its first pages, runs
    every requested engine over it, and prints either JSON or a compact
    table. The trimmed temporary PDF is removed afterwards.

    Raises:
        FileNotFoundError: If the given PDF path does not exist.
        ValueError: If ``--page-limit`` is smaller than 1.
    """
    # Force UTF-8 output where the stream supports it, replacing characters
    # that cannot be encoded.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    engine_choices = [
        "arabic",
        "arabic-max",
        "qari-ocr",
        "tawkeed-ocr",
        "katib-ocr",
        "arabic-qwen-ocr",
        "arabic-glm-ocr",
        "baseer-ocr",
        "easyocr",
        "paddleocr",
        "paddleocr-vl",
        "surya",
        "tesseract",
        "tesseract-fast",
        "auto",
        "best",
    ]
    default_engines = ["easyocr", "paddleocr", "tesseract"]

    cli = argparse.ArgumentParser(description="Benchmark Arabic OCR engines on the same PDF.")
    cli.add_argument("pdf", type=Path, help="Arabic PDF to benchmark")
    cli.add_argument("--engines", nargs="+", default=default_engines, choices=engine_choices)
    cli.add_argument("--page-limit", type=int, default=1, help="Benchmark only the first N pages by default.")
    cli.add_argument("--json", action="store_true", help="Print full JSON results instead of a compact table.")
    options = cli.parse_args()

    source_pdf = options.pdf
    limit = options.page_limit
    if not source_pdf.exists():
        raise FileNotFoundError(f"PDF not found: {source_pdf}")
    if limit is not None and limit < 1:
        raise ValueError("--page-limit must be 1 or greater.")

    sample_pdf = make_limited_pdf(source_pdf, limit)
    results = []
    try:
        for engine in options.engines:
            results.append(benchmark_engine(sample_pdf, engine))
    finally:
        # Only delete the file when it is a trimmed copy, never the input PDF.
        if sample_pdf != source_pdf:
            sample_pdf.unlink(missing_ok=True)

    if options.json:
        print(json.dumps(results, ensure_ascii=False, indent=2))
        return

    print_table(results)
    print()
    print("Tip: use --json to inspect text previews and errors.")


# Run the benchmark CLI only when this file is executed as a script.
if __name__ == "__main__":
    main_cli()