Spaces:

Syncre
/

arabic-audio-reader-worker

Running

File size: 19,579 Bytes

2e1a095

from __future__ import annotations

import argparse
import json
import sys
from dataclasses import asdict
from pathlib import Path
from typing import Any

import fitz

# Parent of the directory that contains this file (resolved to an absolute path).
ROOT_DIR = Path(__file__).resolve().parent.parent
# Put ROOT_DIR at the front of sys.path (only once) before the `scripts.*`
# import that follows — presumably so the import also resolves when this file
# is executed directly as a script; confirm against how the worker launches it.
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from scripts.select_test_pages import PageScore, select_pages


# Output directory used when the caller (or the --out-dir option) does not supply one.
DEFAULT_OUT_DIR = ROOT_DIR / "outputs" / "external-ocr-sample"


def quote_path(path: Path) -> str:
    """Return ``path`` as text, wrapped in double quotes if it contains whitespace.

    A path whose string form has no whitespace character is returned unchanged.
    """
    rendered = str(path)
    has_whitespace = any(map(str.isspace, rendered))
    return f'"{rendered}"' if has_whitespace else rendered


def render_page_images(pdf_path: Path, selected: list[PageScore], out_dir: Path, zoom: float = 2.0) -> list[dict[str, Any]]:
    """Render every selected page of ``pdf_path`` to a PNG file inside ``out_dir``.

    Each entry of ``selected`` supplies a ``page`` number; the page at document
    index ``page - 1`` is rendered with a ``zoom`` x ``zoom`` matrix and no alpha
    channel, then saved as ``page-NNNN.png`` (zero-padded to four digits).
    ``out_dir`` is created, including parents, when it does not exist.

    Returns one dict per rendered page, in the order of ``selected``, holding the
    page number, the PNG path as a string, the pixmap width and height, and the
    score fields copied from the selection entry.

    Raises:
        ValueError: if ``zoom`` is zero or negative.
    """
    if zoom <= 0:
        raise ValueError("zoom must be greater than 0")
    out_dir.mkdir(parents=True, exist_ok=True)

    rendered: list[dict[str, Any]] = []
    with fitz.open(pdf_path) as pdf:
        for entry in selected:
            # Selection entries carry a page number one higher than the document index.
            bitmap = pdf[entry.page - 1].get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
            target = out_dir / f"page-{entry.page:04d}.png"
            bitmap.save(target)

            record: dict[str, Any] = {
                "page": entry.page,
                "path": str(target),
                "width": bitmap.width,
                "height": bitmap.height,
                "score": entry.score,
                "characters": entry.characters,
                "arabicWords": entry.arabic_words,
                "inkRatio": entry.ink_ratio,
            }
            rendered.append(record)
    return rendered


def build_external_ocr_commands(image_dir: Path) -> dict[str, str]:
    """Build the command lines and instructions that go with an exported sample.

    The returned mapping is keyed by a camelCase identifier. Three entries are
    command lines (``wiredOcrBenchmark``, ``scoreExternalText`` and
    ``promotionGate``); they use fixed Windows-style paths and do not depend on
    ``image_dir``. Every other entry is a free-text instruction for one external
    OCR model that refers to the ``page-*.png`` glob inside ``image_dir``
    (double-quoted by ``quote_path`` when the path contains whitespace).

    Entries are inserted in a fixed order, which is the order they appear in
    when the mapping is serialised.
    """
    image_glob = quote_path(image_dir / "page-*.png")
    commands: dict[str, str] = {}

    # Benchmark of the OCR engines that are already wired in.
    commands["wiredOcrBenchmark"] = (
        "python scripts\\benchmark_ocr.py C:\\path\\to\\book-best-5-pages.pdf "
        "--page-limit 5 --engines arabic-max arabic tawkeed-ocr baseer-ocr arabic-glm-ocr arabic-qwen-ocr katib-ocr qari-ocr paddleocr tesseract"
    )

    # Free-text instructions, one per external model, all pointing at image_glob.
    commands["arabicGlmExternal"] = (
        f"If the Arabic-GLM sidecar is not installed, run Arabic-GLM-OCR-v2 externally against {image_glob}, "
        "then compare its cleaned Arabic text against QARI/KATIB and the wired OCR benchmark."
    )
    commands["arabicQwen35External"] = (
        f"If the Arabic-Qwen sidecar is not installed, run Arabic-Qwen3.5-OCR-v4 externally against {image_glob}, "
        "then compare printed, handwritten, and diacritic-heavy Arabic output against the wired OCR benchmark."
    )
    commands["loayQwen25External"] = (
        f"Run loay/Arabic-OCR-Qwen2.5-VL-7B-Vision externally against {image_glob} only on a strong worker, "
        "then compare its Arabic OCR output against QARI, KATIB, Arabic-Qwen3.5, Baseer, and the wired OCR benchmark."
    )
    commands["dimiArabicOcrExternal"] = (
        f"Run DIMI Arabic OCR v2 externally against {image_glob} only on a strong worker, then compare printed Arabic, "
        "diacritics-heavy text, and formatting preservation against the wired OCR benchmark."
    )
    commands["baseerExternal"] = (
        f"If the Baseer sidecar is not installed, run Baseer OCR externally against {image_glob}, then compare "
        "complex-layout Arabic output against the wired OCR benchmark."
    )
    commands["atlasOcrExternal"] = (
        f"Run AtlasOCR externally against {image_glob} only for Darija/Moroccan Arabic PDFs, then compare "
        "against the wired OCR benchmark and confirm licensing before production wiring."
    )
    commands["ketabaExternal"] = (
        f"Run Ketaba-OCR LoRA externally against {image_glob}, then compare its cleaned Arabic text "
        "against the wired OCR benchmark before adding a sidecar."
    )
    commands["oiOcrExternal"] = (
        f"Run oi-OCR externally against {image_glob}, then compare structured Markdown/text extraction, "
        "Arabic reading order, and speech-readiness against the wired OCR benchmark."
    )
    commands["nuExtract3External"] = (
        f"Run numind/NuExtract3 externally in document-to-Markdown or content mode against {image_glob}, "
        "then compare Arabic text preservation, layout cleanup, tables/forms, and speech-readiness against the wired OCR benchmark."
    )
    commands["chandraExternal"] = (
        f"Run Chandra OCR 2 externally against {image_glob} for complex layouts, tables, forms, or mixed-language pages, "
        "then compare Arabic reading order and speech-readiness against the wired Arabic OCR benchmark before considering any hosted use."
    )
    commands["dotsOcrExternal"] = (
        f"Run rednote-hilab/dots.ocr externally against {image_glob} for document layout, reading order, tables, formulas, "
        "or mixed-language pages, then compare Arabic word preservation and speech-readiness against the wired Arabic OCR benchmark."
    )
    commands["olmocrArabicLoraExternal"] = (
        f"Run hastyle/olmOCR-arabic-lora-v2 externally against {image_glob} only for full-page Arabic manuscript scans "
        "on a large worker; compare it against Ketaba, QARI, line-cropped HAFITH/Glimpse, and the wired OCR benchmark."
    )
    commands["arabicLargeNougatExternal"] = (
        f"Run MohamedRashad/arabic-large-nougat externally against {image_glob} for Arabic book-page OCR-to-Markdown, "
        "then compare text preservation, reading order, hallucination risk, and speech-readiness against the wired OCR benchmark."
    )
    commands["doctrArabicExternal"] = (
        f"Run the DocTR Arabic FAST detector plus Arabic PARSEQ recognizer externally against {image_glob}, "
        "then compare classic OCR text ordering, Arabic word preservation, and recognizer license fit before any wiring."
    )
    commands["krakenExternal"] = (
        f"Run Kraken/eScriptorium externally against {image_glob} with an Arabic-script recognition model or "
        "line-cropped workflow when pages look like historical print/manuscripts; then compare Arabic word "
        "preservation and reading order against the wired OCR benchmark before any sidecar work."
    )
    commands["glmDocsExternal"] = (
        f"Run maloukafer/GLM-OCR-finetuned-documents externally against {image_glob} only for form-like, "
        "administrative, newspaper, or official-document PDFs; compare it against Arabic-GLM-OCR-v2 and the wired benchmark."
    )
    commands["mimohaOcrExternal"] = (
        f"Run mimoha/ocr externally against {image_glob} only as a low-priority sparse-card check, then compare "
        "the resulting Arabic text with the same speech-readiness score."
    )
    commands["handwritten4bitExternal"] = (
        f"Run sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v3 externally against {image_glob} when the "
        "PDF has handwriting or manuscript pages, then compare the smaller 4-bit output against handwritten-v3 and the wired OCR benchmark."
    )
    commands["nakbaManuscriptLineExternal"] = (
        f"Run U4RASD/ar-ms-baseline externally only on line-cropped manuscript images from {image_glob}; keep it "
        "as a NAKBA 2026 manuscript-line benchmark unless a separate layout step crops pages into text lines."
    )
    commands["hafithExternal"] = (
        f"Run mdnaseif/hafith externally only after cropping {image_glob} into text-line images; use it for "
        "historical Arabic manuscript or archival-print pages, then merge line outputs before scoring speech-readiness."
    )
    commands["glimpseRtlExternal"] = (
        f"Run surfiniaburger/unsloth_finetune_ocr_arabic externally only after cropping {image_glob} into "
        "Arabic/Persian text-line images; compare the merged RTL line text against HAFITH, NAKBA line OCR, and the wired benchmark."
    )
    commands["qwen25GgufExternal"] = (
        f"Run mo1998/arabic-ocr-qwen2.5-vl externally against {image_glob} as a QariOCR-trained GGUF/Unsloth "
        "benchmark, then compare scanned-book, religious-text, handwriting, and mixed Arabic-English output against QARI 0.4 and the wired OCR benchmark."
    )
    commands["tawkeedExternal"] = (
        f"If the Tawkeed sidecar is not installed, run Tawkeed OCR externally against {image_glob}, then compare "
        "Arabic document, handwriting, and scene-text output against QARI 0.4, KATIB, Arabic-Qwen, Baseer, and the wired OCR benchmark."
    )
    commands["falconExternal"] = (
        f"Run Falcon-OCR externally against {image_glob}, then compare Arabic word count, reading order, "
        "and speech-readiness against KATIB/QARI/PaddleOCR."
    )

    # Scoring command: one --candidate flag per external model text output.
    commands["scoreExternalText"] = (
        "python scripts\\score_external_ocr.py "
        "--candidate arabic-glm=outputs\\external-ocr-sample\\arabic-glm.txt "
        "--candidate arabic-qwen35=outputs\\external-ocr-sample\\arabic-qwen35.txt "
        "--candidate loay-qwen25=outputs\\external-ocr-sample\\loay-qwen25.txt "
        "--candidate dimi-v2=outputs\\external-ocr-sample\\dimi-v2.txt "
        "--candidate atlasocr=outputs\\external-ocr-sample\\atlasocr.txt "
        "--candidate ketaba=outputs\\external-ocr-sample\\ketaba.txt "
        "--candidate oi-ocr=outputs\\external-ocr-sample\\oi-ocr.txt "
        "--candidate nuextract3=outputs\\external-ocr-sample\\nuextract3.txt "
        "--candidate chandra=outputs\\external-ocr-sample\\chandra.txt "
        "--candidate dots-ocr=outputs\\external-ocr-sample\\dots-ocr.txt "
        "--candidate olmocr-arabic-lora=outputs\\external-ocr-sample\\olmocr-arabic-lora.txt "
        "--candidate arabic-large-nougat=outputs\\external-ocr-sample\\arabic-large-nougat.txt "
        "--candidate doctr-arabic=outputs\\external-ocr-sample\\doctr-arabic.txt "
        "--candidate kraken=outputs\\external-ocr-sample\\kraken.txt "
        "--candidate glm-docs=outputs\\external-ocr-sample\\glm-docs.txt "
        "--candidate mimoha-ocr=outputs\\external-ocr-sample\\mimoha-ocr.txt "
        "--candidate handwritten-4bit=outputs\\external-ocr-sample\\handwritten-4bit.txt "
        "--candidate nakba-ms-line=outputs\\external-ocr-sample\\nakba-ms-line.txt "
        "--candidate hafith=outputs\\external-ocr-sample\\hafith.txt "
        "--candidate glimpse-rtl=outputs\\external-ocr-sample\\glimpse-rtl.txt "
        "--candidate qwen25-gguf=outputs\\external-ocr-sample\\qwen25-gguf.txt "
        "--candidate tawkeed=outputs\\external-ocr-sample\\tawkeed.txt "
        "--candidate falcon=outputs\\external-ocr-sample\\falcon.txt "
        "--candidate baseer=outputs\\external-ocr-sample\\baseer.txt "
        "--baseline-json outputs\\external-ocr-sample\\wired-ocr-baseline.json "
        "--write-report outputs\\external-ocr-sample\\external-ocr-score.md "
        "--write-json outputs\\external-ocr-sample\\external-ocr-score.json"
    )

    # Promotion gate: the candidate name and license here are placeholders.
    commands["promotionGate"] = (
        "python scripts\\model_promotion_gate.py "
        "--candidate-name \"External OCR winner\" --kind ocr --license Apache-2.0 "
        "--score-json outputs\\external-ocr-sample\\external-ocr-score.json "
        "--same-sample --runtime-ok --privacy-ok --human-reviewed "
        "--write-report outputs\\external-ocr-sample\\model-promotion-gate.md"
    )
    return commands


def write_ocr_sample_report(path: Path, result: dict[str, Any]) -> None:
    """Write the Markdown sample report described by ``result`` to ``path``.

    ``result`` must provide ``pdf``, ``imageDir``, ``zoom``, ``images`` (dicts with
    ``page``, ``path``, ``width``, ``height``, ``score``, ``arabicWords`` and
    ``inkRatio``) and ``commands`` (a mapping containing every key listed in the
    section table below). The report holds a header, one table row per image,
    one fenced block per command, and a closing promotion rule.

    Parent directories of ``path`` are created as needed; the file is written as
    UTF-8 and ends with exactly one trailing newline.
    """
    commands = result["commands"]
    shell_fence = "```powershell"
    text_fence = "```text"
    # (heading, opening fence, key in ``commands``), in the order they are emitted.
    sections = (
        ("Wired OCR benchmark:", shell_fence, "wiredOcrBenchmark"),
        ("Arabic-GLM-OCR-v2:", text_fence, "arabicGlmExternal"),
        ("Arabic-Qwen3.5-OCR-v4:", text_fence, "arabicQwen35External"),
        ("Loay Arabic-OCR-Qwen2.5-VL-7B:", text_fence, "loayQwen25External"),
        ("DIMI Arabic OCR v2:", text_fence, "dimiArabicOcrExternal"),
        ("AtlasOCR:", text_fence, "atlasOcrExternal"),
        ("Ketaba-OCR LoRA:", text_fence, "ketabaExternal"),
        ("oi-OCR:", text_fence, "oiOcrExternal"),
        ("NuExtract3:", text_fence, "nuExtract3External"),
        ("Chandra OCR 2:", text_fence, "chandraExternal"),
        ("dots.ocr:", text_fence, "dotsOcrExternal"),
        ("olmOCR Arabic LoRA v2:", text_fence, "olmocrArabicLoraExternal"),
        ("Arabic Large Nougat:", text_fence, "arabicLargeNougatExternal"),
        ("DocTR Arabic FAST/PARSEQ:", text_fence, "doctrArabicExternal"),
        ("Kraken/eScriptorium Arabic script:", text_fence, "krakenExternal"),
        ("GLM-OCR Arabic/French documents:", text_fence, "glmDocsExternal"),
        ("mimoha Arabic OCR:", text_fence, "mimohaOcrExternal"),
        ("Arabic handwritten OCR 4-bit Qwen2.5-VL:", text_fence, "handwritten4bitExternal"),
        ("NAKBA Arabic manuscript line OCR baseline:", text_fence, "nakbaManuscriptLineExternal"),
        ("HAFITH:", text_fence, "hafithExternal"),
        ("Glimpse RTL OCR:", text_fence, "glimpseRtlExternal"),
        ("Arabic OCR Qwen2.5-VL GGUF:", text_fence, "qwen25GgufExternal"),
        ("Tawkeed OCR:", text_fence, "tawkeedExternal"),
        ("Falcon-OCR:", text_fence, "falconExternal"),
        ("Baseer OCR:", text_fence, "baseerExternal"),
        ("Score external OCR text outputs:", shell_fence, "scoreExternalText"),
        ("Promotion gate for the winning OCR candidate:", shell_fence, "promotionGate"),
    )

    # Header and the table heading.
    report_lines = [
        "# External Arabic OCR Sample",
        "",
        f"PDF: {result['pdf']}",
        f"Image directory: {result['imageDir']}",
        f"Pages: {', '.join(str(image['page']) for image in result['images'])}",
        f"Render zoom: {result['zoom']}",
        "",
        "Use these exact page images for every external OCR model. Do not compare models on different pages or different render scales.",
        "",
        "## Images",
        "",
        "| Page | PNG | Size | Score | Arabic Words | Ink Ratio |",
        "| --- | --- | --- | --- | --- | --- |",
    ]

    # One table row per rendered image.
    report_lines += [
        f"| {image['page']} | {image['path']} | {image['width']}x{image['height']} | "
        f"{image['score']} | {image['arabicWords']} | {image['inkRatio']} |"
        for image in result["images"]
    ]

    # One heading plus fenced block per command, each followed by a blank line.
    report_lines += ["", "## Comparison Commands", ""]
    for heading, opening_fence, key in sections:
        report_lines += [heading, "", opening_fence, commands[key], "```", ""]

    report_lines += [
        "## Promotion Rule",
        "",
        "Replace the candidate name and license in the promotion-gate command with the real winning model. Promote an external OCR model only if it beats the wired Arabic OCR stack on these same pages, has an acceptable license, and the worker can handle its memory, cold start, and runtime.",
    ]

    path.parent.mkdir(parents=True, exist_ok=True)
    body = "\n".join(report_lines).rstrip()
    path.write_text(body + "\n", encoding="utf-8")


def export_ocr_sample_images(
    pdf_path: Path,
    out_dir: Path = DEFAULT_OUT_DIR,
    count: int = 5,
    skip_first: int = 0,
    zoom: float = 2.0,
) -> dict[str, Any]:
    """Export sample page images from a PDF and write the accompanying report.

    Pages are chosen by ``select_pages`` (``count`` and ``skip_first`` are passed
    through to it), rendered to ``out_dir / "images"`` at the given ``zoom``, and
    described in ``out_dir / "external-ocr-sample.md"``.

    Returns a dict with the keys ``pdf``, ``imageDir``, ``reportPath``, ``zoom``,
    ``pages``, ``scores``, ``images`` and ``commands``.

    Raises:
        FileNotFoundError: if ``pdf_path`` does not exist.
        ValueError: if ``pdf_path`` does not have a ``.pdf`` suffix, if ``count``
            is below 1, or if ``zoom`` is zero or negative.
    """
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")
    if pdf_path.suffix.lower() != ".pdf":
        raise ValueError("Input must be a PDF file.")
    if count < 1:
        raise ValueError("count must be at least 1")
    # Fail fast: rendering rejects a non-positive zoom with this same error, but
    # only after page selection has already run over the PDF.
    if zoom <= 0:
        raise ValueError("zoom must be greater than 0")

    selected = select_pages(pdf_path, count=count, skip_first=skip_first)
    image_dir = out_dir / "images"
    report_path = out_dir / "external-ocr-sample.md"
    images = render_page_images(pdf_path, selected, image_dir, zoom=zoom)
    commands = build_external_ocr_commands(image_dir)
    result: dict[str, Any] = {
        "pdf": str(pdf_path),
        "imageDir": str(image_dir),
        "reportPath": str(report_path),
        "zoom": zoom,
        "pages": [item.page for item in selected],
        "scores": [asdict(item) for item in selected],
        "images": images,
        "commands": commands,
    }
    write_ocr_sample_report(report_path, result)
    return result


def main_cli() -> None:
    """Command-line entry point: export the OCR sample and print a summary.

    Parses the PDF path plus the ``--out-dir``, ``--count``, ``--skip-first``,
    ``--zoom`` and ``--json`` options, runs ``export_ocr_sample_images``, and then
    prints either the full result as JSON (``--json``) or two summary lines.
    """
    stdout = sys.stdout
    # Switch stdout to UTF-8 when the stream supports reconfiguring; characters
    # that still cannot be encoded are replaced instead of raising.
    if hasattr(stdout, "reconfigure"):
        stdout.reconfigure(encoding="utf-8", errors="replace")

    cli = argparse.ArgumentParser(description="Export selected Arabic PDF page images for external OCR benchmarking.")
    cli.add_argument("pdf", type=Path, help="Source Arabic PDF")
    cli.add_argument("--out-dir", type=Path, default=DEFAULT_OUT_DIR, help="Output directory")
    cli.add_argument("--count", type=int, default=5, help="Number of pages to export")
    cli.add_argument("--skip-first", type=int, default=0, help="Ignore the first N pages before scoring")
    cli.add_argument("--zoom", type=float, default=2.0, help="Render zoom for PNG images")
    cli.add_argument("--json", action="store_true", help="Print JSON details")
    options = cli.parse_args()

    result = export_ocr_sample_images(
        options.pdf,
        out_dir=options.out_dir,
        count=options.count,
        skip_first=options.skip_first,
        zoom=options.zoom,
    )

    if options.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return

    print(f"Wrote OCR image sample report: {result['reportPath']}")
    print(f"Rendered pages: {', '.join(str(page) for page in result['pages'])}")


# Run the command-line interface only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main_cli()