File size: 19,579 Bytes
2e1a095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
from __future__ import annotations

import argparse
import json
import sys
from dataclasses import asdict
from pathlib import Path
from typing import Any

import fitz

# ROOT_DIR is the directory two levels above this file. It is placed at the
# front of sys.path (once) so the ``scripts.`` import below resolves even when
# this file is executed directly as a script.
ROOT_DIR = Path(__file__).resolve().parent.parent
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from scripts.select_test_pages import PageScore, select_pages


# Default output directory; the exporter writes an "images" subdirectory and
# the "external-ocr-sample.md" report underneath it.
DEFAULT_OUT_DIR = ROOT_DIR / "outputs" / "external-ocr-sample"


def quote_path(path: Path) -> str:
    """Return *path* as text, wrapped in double quotes if it has whitespace.

    Paths without any whitespace character are returned unchanged. Embedded
    double quotes are not escaped.
    """
    rendered = str(path)
    has_whitespace = any(map(str.isspace, rendered))
    return f'"{rendered}"' if has_whitespace else rendered


def render_page_images(pdf_path: Path, selected: list[PageScore], out_dir: Path, zoom: float = 2.0) -> list[dict[str, Any]]:
    """Render every selected page of a PDF to a PNG file and describe each one.

    Args:
        pdf_path: PDF document to open.
        selected: Page scores to render; ``page`` is used as a 1-based page
            number (the document is indexed with ``page - 1``).
        out_dir: Directory that receives ``page-NNNN.png`` files. It is created
            (with parents) if missing.
        zoom: Scale factor applied on both axes when rasterizing.

    Returns:
        One dict per rendered page, in the order of ``selected``, holding the
        page number, the PNG path as a string, the pixmap width/height and the
        score fields copied from the corresponding ``selected`` entry.

    Raises:
        ValueError: If ``zoom`` is zero or negative.
    """
    if zoom <= 0:
        raise ValueError("zoom must be greater than 0")

    out_dir.mkdir(parents=True, exist_ok=True)

    rendered: list[dict[str, Any]] = []
    with fitz.open(pdf_path) as pdf:
        for entry in selected:
            page_number = entry.page
            # Rasterize without an alpha channel at the requested scale.
            pixmap = pdf[page_number - 1].get_pixmap(matrix=fitz.Matrix(zoom, zoom), alpha=False)
            target = out_dir / f"page-{page_number:04d}.png"
            pixmap.save(target)

            record: dict[str, Any] = {
                "page": page_number,
                "path": str(target),
                "width": pixmap.width,
                "height": pixmap.height,
                "score": entry.score,
                "characters": entry.characters,
                "arabicWords": entry.arabic_words,
                "inkRatio": entry.ink_ratio,
            }
            rendered.append(record)
    return rendered


def build_external_ocr_commands(image_dir: Path) -> dict[str, str]:
    """Return the benchmark commands and per-model instructions for a sample.

    Args:
        image_dir: Directory holding the rendered page images. It is turned
            into a ``page-*.png`` glob (double-quoted by ``quote_path`` when it
            contains whitespace) and embedded in every external-model
            instruction.

    Returns:
        A mapping from command key to text. ``wiredOcrBenchmark``,
        ``scoreExternalText`` and ``promotionGate`` are fixed command lines
        that do not depend on ``image_dir``; every other entry is a free-text
        instruction that mentions the image glob.
    """
    pages_glob = quote_path(image_dir / "page-*.png")

    # Engines passed to the wired benchmark, in command-line order.
    wired_engines = (
        "arabic-max",
        "arabic",
        "tawkeed-ocr",
        "baseer-ocr",
        "arabic-glm-ocr",
        "arabic-qwen-ocr",
        "katib-ocr",
        "qari-ocr",
        "paddleocr",
        "tesseract",
    )
    wired_benchmark = (
        "python scripts\\benchmark_ocr.py C:\\path\\to\\book-best-5-pages.pdf "
        "--page-limit 5 --engines " + " ".join(wired_engines)
    )

    # Each candidate is scored from "<name>.txt" inside the sample directory.
    sample_prefix = "outputs\\external-ocr-sample\\"
    score_candidates = (
        "arabic-glm",
        "arabic-qwen35",
        "loay-qwen25",
        "dimi-v2",
        "atlasocr",
        "ketaba",
        "oi-ocr",
        "nuextract3",
        "chandra",
        "dots-ocr",
        "olmocr-arabic-lora",
        "arabic-large-nougat",
        "doctr-arabic",
        "kraken",
        "glm-docs",
        "mimoha-ocr",
        "handwritten-4bit",
        "nakba-ms-line",
        "hafith",
        "glimpse-rtl",
        "qwen25-gguf",
        "tawkeed",
        "falcon",
        "baseer",
    )
    candidate_flags = "".join(
        f"--candidate {name}={sample_prefix}{name}.txt " for name in score_candidates
    )
    score_external_text = (
        "python scripts\\score_external_ocr.py "
        + candidate_flags
        + f"--baseline-json {sample_prefix}wired-ocr-baseline.json "
        + f"--write-report {sample_prefix}external-ocr-score.md "
        + f"--write-json {sample_prefix}external-ocr-score.json"
    )

    promotion_gate = " ".join(
        [
            "python scripts\\model_promotion_gate.py",
            '--candidate-name "External OCR winner" --kind ocr --license Apache-2.0',
            f"--score-json {sample_prefix}external-ocr-score.json",
            "--same-sample --runtime-ok --privacy-ok --human-reviewed",
            f"--write-report {sample_prefix}model-promotion-gate.md",
        ]
    )

    # Key order is preserved because it is the order seen in the JSON output.
    commands: dict[str, str] = {}
    commands["wiredOcrBenchmark"] = wired_benchmark
    commands["arabicGlmExternal"] = (
        f"If the Arabic-GLM sidecar is not installed, run Arabic-GLM-OCR-v2 externally against {pages_glob}, "
        "then compare its cleaned Arabic text against QARI/KATIB and the wired OCR benchmark."
    )
    commands["arabicQwen35External"] = (
        f"If the Arabic-Qwen sidecar is not installed, run Arabic-Qwen3.5-OCR-v4 externally against {pages_glob}, "
        "then compare printed, handwritten, and diacritic-heavy Arabic output against the wired OCR benchmark."
    )
    commands["loayQwen25External"] = (
        f"Run loay/Arabic-OCR-Qwen2.5-VL-7B-Vision externally against {pages_glob} only on a strong worker, "
        "then compare its Arabic OCR output against QARI, KATIB, Arabic-Qwen3.5, Baseer, and the wired OCR benchmark."
    )
    commands["dimiArabicOcrExternal"] = (
        f"Run DIMI Arabic OCR v2 externally against {pages_glob} only on a strong worker, then compare printed Arabic, "
        "diacritics-heavy text, and formatting preservation against the wired OCR benchmark."
    )
    commands["baseerExternal"] = (
        f"If the Baseer sidecar is not installed, run Baseer OCR externally against {pages_glob}, then compare "
        "complex-layout Arabic output against the wired OCR benchmark."
    )
    commands["atlasOcrExternal"] = (
        f"Run AtlasOCR externally against {pages_glob} only for Darija/Moroccan Arabic PDFs, then compare "
        "against the wired OCR benchmark and confirm licensing before production wiring."
    )
    commands["ketabaExternal"] = (
        f"Run Ketaba-OCR LoRA externally against {pages_glob}, then compare its cleaned Arabic text "
        "against the wired OCR benchmark before adding a sidecar."
    )
    commands["oiOcrExternal"] = (
        f"Run oi-OCR externally against {pages_glob}, then compare structured Markdown/text extraction, "
        "Arabic reading order, and speech-readiness against the wired OCR benchmark."
    )
    commands["nuExtract3External"] = (
        f"Run numind/NuExtract3 externally in document-to-Markdown or content mode against {pages_glob}, "
        "then compare Arabic text preservation, layout cleanup, tables/forms, and speech-readiness against the wired OCR benchmark."
    )
    commands["chandraExternal"] = (
        f"Run Chandra OCR 2 externally against {pages_glob} for complex layouts, tables, forms, or mixed-language pages, "
        "then compare Arabic reading order and speech-readiness against the wired Arabic OCR benchmark before considering any hosted use."
    )
    commands["dotsOcrExternal"] = (
        f"Run rednote-hilab/dots.ocr externally against {pages_glob} for document layout, reading order, tables, formulas, "
        "or mixed-language pages, then compare Arabic word preservation and speech-readiness against the wired Arabic OCR benchmark."
    )
    commands["olmocrArabicLoraExternal"] = (
        f"Run hastyle/olmOCR-arabic-lora-v2 externally against {pages_glob} only for full-page Arabic manuscript scans "
        "on a large worker; compare it against Ketaba, QARI, line-cropped HAFITH/Glimpse, and the wired OCR benchmark."
    )
    commands["arabicLargeNougatExternal"] = (
        f"Run MohamedRashad/arabic-large-nougat externally against {pages_glob} for Arabic book-page OCR-to-Markdown, "
        "then compare text preservation, reading order, hallucination risk, and speech-readiness against the wired OCR benchmark."
    )
    commands["doctrArabicExternal"] = (
        f"Run the DocTR Arabic FAST detector plus Arabic PARSEQ recognizer externally against {pages_glob}, "
        "then compare classic OCR text ordering, Arabic word preservation, and recognizer license fit before any wiring."
    )
    commands["krakenExternal"] = (
        f"Run Kraken/eScriptorium externally against {pages_glob} with an Arabic-script recognition model or "
        "line-cropped workflow when pages look like historical print/manuscripts; then compare Arabic word "
        "preservation and reading order against the wired OCR benchmark before any sidecar work."
    )
    commands["glmDocsExternal"] = (
        f"Run maloukafer/GLM-OCR-finetuned-documents externally against {pages_glob} only for form-like, "
        "administrative, newspaper, or official-document PDFs; compare it against Arabic-GLM-OCR-v2 and the wired benchmark."
    )
    commands["mimohaOcrExternal"] = (
        f"Run mimoha/ocr externally against {pages_glob} only as a low-priority sparse-card check, then compare "
        "the resulting Arabic text with the same speech-readiness score."
    )
    commands["handwritten4bitExternal"] = (
        f"Run sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v3 externally against {pages_glob} when the "
        "PDF has handwriting or manuscript pages, then compare the smaller 4-bit output against handwritten-v3 and the wired OCR benchmark."
    )
    commands["nakbaManuscriptLineExternal"] = (
        f"Run U4RASD/ar-ms-baseline externally only on line-cropped manuscript images from {pages_glob}; keep it "
        "as a NAKBA 2026 manuscript-line benchmark unless a separate layout step crops pages into text lines."
    )
    commands["hafithExternal"] = (
        f"Run mdnaseif/hafith externally only after cropping {pages_glob} into text-line images; use it for "
        "historical Arabic manuscript or archival-print pages, then merge line outputs before scoring speech-readiness."
    )
    commands["glimpseRtlExternal"] = (
        f"Run surfiniaburger/unsloth_finetune_ocr_arabic externally only after cropping {pages_glob} into "
        "Arabic/Persian text-line images; compare the merged RTL line text against HAFITH, NAKBA line OCR, and the wired benchmark."
    )
    commands["qwen25GgufExternal"] = (
        f"Run mo1998/arabic-ocr-qwen2.5-vl externally against {pages_glob} as a QariOCR-trained GGUF/Unsloth "
        "benchmark, then compare scanned-book, religious-text, handwriting, and mixed Arabic-English output against QARI 0.4 and the wired OCR benchmark."
    )
    commands["tawkeedExternal"] = (
        f"If the Tawkeed sidecar is not installed, run Tawkeed OCR externally against {pages_glob}, then compare "
        "Arabic document, handwriting, and scene-text output against QARI 0.4, KATIB, Arabic-Qwen, Baseer, and the wired OCR benchmark."
    )
    commands["falconExternal"] = (
        f"Run Falcon-OCR externally against {pages_glob}, then compare Arabic word count, reading order, "
        "and speech-readiness against KATIB/QARI/PaddleOCR."
    )
    commands["scoreExternalText"] = score_external_text
    commands["promotionGate"] = promotion_gate
    return commands


def write_ocr_sample_report(path: Path, result: dict[str, Any]) -> None:
    """Write the Markdown report describing an exported OCR image sample.

    Args:
        path: Destination file. Missing parent directories are created and the
            file is written as UTF-8.
        result: Export summary. The keys ``commands``, ``pdf``, ``imageDir``,
            ``images`` and ``zoom`` are read; each image entry must provide
            ``page``, ``path``, ``width``, ``height``, ``score``,
            ``arabicWords`` and ``inkRatio``.

    Raises:
        KeyError: If ``result``, an image entry, or ``commands`` lacks one of
            the keys the report needs.
    """
    commands = result["commands"]

    # (heading, key into ``commands``, code-fence language), in report order.
    sections = (
        ("Wired OCR benchmark:", "wiredOcrBenchmark", "powershell"),
        ("Arabic-GLM-OCR-v2:", "arabicGlmExternal", "text"),
        ("Arabic-Qwen3.5-OCR-v4:", "arabicQwen35External", "text"),
        ("Loay Arabic-OCR-Qwen2.5-VL-7B:", "loayQwen25External", "text"),
        ("DIMI Arabic OCR v2:", "dimiArabicOcrExternal", "text"),
        ("AtlasOCR:", "atlasOcrExternal", "text"),
        ("Ketaba-OCR LoRA:", "ketabaExternal", "text"),
        ("oi-OCR:", "oiOcrExternal", "text"),
        ("NuExtract3:", "nuExtract3External", "text"),
        ("Chandra OCR 2:", "chandraExternal", "text"),
        ("dots.ocr:", "dotsOcrExternal", "text"),
        ("olmOCR Arabic LoRA v2:", "olmocrArabicLoraExternal", "text"),
        ("Arabic Large Nougat:", "arabicLargeNougatExternal", "text"),
        ("DocTR Arabic FAST/PARSEQ:", "doctrArabicExternal", "text"),
        ("Kraken/eScriptorium Arabic script:", "krakenExternal", "text"),
        ("GLM-OCR Arabic/French documents:", "glmDocsExternal", "text"),
        ("mimoha Arabic OCR:", "mimohaOcrExternal", "text"),
        ("Arabic handwritten OCR 4-bit Qwen2.5-VL:", "handwritten4bitExternal", "text"),
        ("NAKBA Arabic manuscript line OCR baseline:", "nakbaManuscriptLineExternal", "text"),
        ("HAFITH:", "hafithExternal", "text"),
        ("Glimpse RTL OCR:", "glimpseRtlExternal", "text"),
        ("Arabic OCR Qwen2.5-VL GGUF:", "qwen25GgufExternal", "text"),
        ("Tawkeed OCR:", "tawkeedExternal", "text"),
        ("Falcon-OCR:", "falconExternal", "text"),
        ("Baseer OCR:", "baseerExternal", "text"),
        ("Score external OCR text outputs:", "scoreExternalText", "powershell"),
        ("Promotion gate for the winning OCR candidate:", "promotionGate", "powershell"),
    )

    # Header block followed by the opening rows of the image table.
    report: list[str] = [
        "# External Arabic OCR Sample",
        "",
        f"PDF: {result['pdf']}",
        f"Image directory: {result['imageDir']}",
        "Pages: " + ", ".join(str(entry["page"]) for entry in result["images"]),
        f"Render zoom: {result['zoom']}",
        "",
        "Use these exact page images for every external OCR model. Do not compare models on different pages or different render scales.",
        "",
        "## Images",
        "",
        "| Page | PNG | Size | Score | Arabic Words | Ink Ratio |",
        "| --- | --- | --- | --- | --- | --- |",
    ]
    report += [
        f"| {entry['page']} | {entry['path']} | {entry['width']}x{entry['height']} | "
        f"{entry['score']} | {entry['arabicWords']} | {entry['inkRatio']} |"
        for entry in result["images"]
    ]

    report += ["", "## Comparison Commands", ""]
    for heading, key, fence in sections:
        # Every section is a heading, a blank line, a fenced command and a blank line.
        report += [heading, "", f"```{fence}", commands[key], "```", ""]

    report += [
        "## Promotion Rule",
        "",
        "Replace the candidate name and license in the promotion-gate command with the real winning model. Promote an external OCR model only if it beats the wired Arabic OCR stack on these same pages, has an acceptable license, and the worker can handle its memory, cold start, and runtime.",
    ]

    path.parent.mkdir(parents=True, exist_ok=True)
    # Strip trailing whitespace, then end the file with exactly one newline.
    path.write_text("\n".join(report).rstrip() + "\n", encoding="utf-8")


def export_ocr_sample_images(
    pdf_path: Path,
    out_dir: Path = DEFAULT_OUT_DIR,
    count: int = 5,
    skip_first: int = 0,
    zoom: float = 2.0,
) -> dict[str, Any]:
    """Select pages from a PDF, render them to PNG and write the sample report.

    Args:
        pdf_path: Existing file whose suffix is ``.pdf`` (case-insensitive).
        out_dir: Output directory; images go to ``out_dir / "images"`` and the
            report to ``out_dir / "external-ocr-sample.md"``.
        count: Number of pages requested from ``select_pages``; at least 1.
        skip_first: Forwarded unchanged to ``select_pages``.
        zoom: Render scale forwarded to ``render_page_images``.

    Returns:
        A summary dict with the PDF path, image directory, report path, zoom,
        selected page numbers, the page scores as dicts, the rendered image
        records and the generated commands.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        ValueError: If the suffix is not ``.pdf`` or ``count`` is below 1.
    """
    # Validate inputs before doing any page scoring or rendering.
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")
    if pdf_path.suffix.lower() != ".pdf":
        raise ValueError("Input must be a PDF file.")
    if count < 1:
        raise ValueError("count must be at least 1")

    page_scores = select_pages(pdf_path, count=count, skip_first=skip_first)

    images_root = out_dir / "images"
    report_file = out_dir / "external-ocr-sample.md"
    rendered = render_page_images(pdf_path, page_scores, images_root, zoom=zoom)
    command_map = build_external_ocr_commands(images_root)

    summary: dict[str, Any] = {
        "pdf": str(pdf_path),
        "imageDir": str(images_root),
        "reportPath": str(report_file),
        "zoom": zoom,
        "pages": [score.page for score in page_scores],
        "scores": [asdict(score) for score in page_scores],
        "images": rendered,
        "commands": command_map,
    }
    write_ocr_sample_report(report_file, summary)
    return summary


def main_cli() -> None:
    """Parse command-line arguments, run the export and print a summary.

    With ``--json`` the full result is printed as indented JSON; otherwise two
    lines naming the report path and the rendered page numbers are printed.
    """
    # Switch stdout to UTF-8 when the stream supports reconfiguration, replacing
    # characters that cannot be encoded.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

    cli_parser = argparse.ArgumentParser(
        description="Export selected Arabic PDF page images for external OCR benchmarking."
    )
    cli_parser.add_argument("pdf", type=Path, help="Source Arabic PDF")
    cli_parser.add_argument("--out-dir", type=Path, default=DEFAULT_OUT_DIR, help="Output directory")
    cli_parser.add_argument("--count", type=int, default=5, help="Number of pages to export")
    cli_parser.add_argument("--skip-first", type=int, default=0, help="Ignore the first N pages before scoring")
    cli_parser.add_argument("--zoom", type=float, default=2.0, help="Render zoom for PNG images")
    cli_parser.add_argument("--json", action="store_true", help="Print JSON details")
    options = cli_parser.parse_args()

    summary = export_ocr_sample_images(
        options.pdf,
        out_dir=options.out_dir,
        count=options.count,
        skip_first=options.skip_first,
        zoom=options.zoom,
    )

    if options.json:
        print(json.dumps(summary, ensure_ascii=False, indent=2))
        return

    print(f"Wrote OCR image sample report: {summary['reportPath']}")
    page_list = ", ".join(map(str, summary["pages"]))
    print(f"Rendered pages: {page_list}")


# Run the command-line interface only when executed as a script, not on import.
if __name__ == "__main__":
    main_cli()