File size: 19,579 Bytes
2e1a095 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 | from __future__ import annotations
import argparse
import json
import sys
from dataclasses import asdict
from pathlib import Path
from typing import Any
import fitz
# Directory two levels above this file (the parent of this file's folder).
ROOT_DIR = Path(__file__).resolve().parent.parent
# Put ROOT_DIR on sys.path before the ``scripts.`` import below so that the
# absolute import can resolve (presumably needed when this file is executed
# directly rather than imported as part of a package -- confirm).
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))
from scripts.select_test_pages import PageScore, select_pages
# Default output directory; used as the default for ``out_dir`` in
# export_ocr_sample_images and for the ``--out-dir`` CLI option.
DEFAULT_OUT_DIR = ROOT_DIR / "outputs" / "external-ocr-sample"
def quote_path(path: Path) -> str:
    """Return ``path`` as text, double-quoted when it contains whitespace.

    Args:
        path: Path (or glob-style path) to render.

    Returns:
        ``str(path)`` unchanged when it holds no whitespace character,
        otherwise the same text wrapped in double quotes.
    """
    rendered = str(path)
    if not any(map(str.isspace, rendered)):
        return rendered
    return f'"{rendered}"'
def render_page_images(
    pdf_path: Path,
    selected: list[PageScore],
    out_dir: Path,
    zoom: float = 2.0,
) -> list[dict[str, Any]]:
    """Render the selected pages of a PDF to PNG files and describe each one.

    Args:
        pdf_path: PDF passed to ``fitz.open``.
        selected: Page entries to render. Each one supplies ``page``,
            ``score``, ``characters``, ``arabic_words`` and ``ink_ratio``.
        out_dir: Directory that receives the PNG files; created if missing.
        zoom: Scale factor applied to both axes of the render matrix.

    Returns:
        One dict per entry of ``selected``, in the same order, with the keys
        ``page``, ``path``, ``width``, ``height``, ``score``, ``characters``,
        ``arabicWords`` and ``inkRatio``.

    Raises:
        ValueError: If ``zoom`` is zero or negative.
    """
    if zoom <= 0:
        raise ValueError("zoom must be greater than 0")

    out_dir.mkdir(parents=True, exist_ok=True)
    rendered: list[dict[str, Any]] = []

    with fitz.open(pdf_path) as pdf:
        for entry in selected:
            page_number = entry.page
            # The page number is shifted down by one to index the document.
            pixmap = pdf[page_number - 1].get_pixmap(
                matrix=fitz.Matrix(zoom, zoom),
                alpha=False,
            )
            target = out_dir / f"page-{page_number:04d}.png"
            pixmap.save(target)

            record: dict[str, Any] = {
                "page": page_number,
                "path": str(target),
                "width": pixmap.width,
                "height": pixmap.height,
                "score": entry.score,
                "characters": entry.characters,
                "arabicWords": entry.arabic_words,
                "inkRatio": entry.ink_ratio,
            }
            rendered.append(record)

    return rendered
def build_external_ocr_commands(image_dir: Path) -> dict[str, str]:
    """Build the command lines and run notes for the OCR comparison.

    Args:
        image_dir: Directory holding the rendered ``page-*.png`` images. It
            is referenced only by the free-text instructions, as a quoted
            ``page-*.png`` glob.

    Returns:
        A dict of identifier -> text. ``wiredOcrBenchmark``,
        ``scoreExternalText`` and ``promotionGate`` are Windows-style command
        lines; every other entry is a plain-language instruction for running
        one external OCR model against the images.
    """
    pages = quote_path(image_dir / "page-*.png")

    # NOTE(review): the three command lines below use a placeholder PDF path
    # and a fixed outputs\external-ocr-sample directory; they do not follow
    # ``image_dir``.
    sample_dir = "outputs\\external-ocr-sample"

    wired_engines = (
        "arabic-max",
        "arabic",
        "tawkeed-ocr",
        "baseer-ocr",
        "arabic-glm-ocr",
        "arabic-qwen-ocr",
        "katib-ocr",
        "qari-ocr",
        "paddleocr",
        "tesseract",
    )
    wired_benchmark = " ".join(
        [
            "python scripts\\benchmark_ocr.py C:\\path\\to\\book-best-5-pages.pdf",
            "--page-limit 5 --engines",
            *wired_engines,
        ]
    )

    # Each candidate's label doubles as the stem of its expected .txt file.
    candidate_names = (
        "arabic-glm",
        "arabic-qwen35",
        "loay-qwen25",
        "dimi-v2",
        "atlasocr",
        "ketaba",
        "oi-ocr",
        "nuextract3",
        "chandra",
        "dots-ocr",
        "olmocr-arabic-lora",
        "arabic-large-nougat",
        "doctr-arabic",
        "kraken",
        "glm-docs",
        "mimoha-ocr",
        "handwritten-4bit",
        "nakba-ms-line",
        "hafith",
        "glimpse-rtl",
        "qwen25-gguf",
        "tawkeed",
        "falcon",
        "baseer",
    )
    score_command = " ".join(
        [
            "python scripts\\score_external_ocr.py",
            *(f"--candidate {name}={sample_dir}\\{name}.txt" for name in candidate_names),
            f"--baseline-json {sample_dir}\\wired-ocr-baseline.json",
            f"--write-report {sample_dir}\\external-ocr-score.md",
            f"--write-json {sample_dir}\\external-ocr-score.json",
        ]
    )

    promotion_gate = " ".join(
        [
            "python scripts\\model_promotion_gate.py",
            '--candidate-name "External OCR winner" --kind ocr --license Apache-2.0',
            f"--score-json {sample_dir}\\external-ocr-score.json",
            "--same-sample --runtime-ok --privacy-ok --human-reviewed",
            f"--write-report {sample_dir}\\model-promotion-gate.md",
        ]
    )

    # Insertion order is kept exactly as before because it is the order the
    # mapping is serialised in.
    commands: dict[str, str] = {"wiredOcrBenchmark": wired_benchmark}
    commands["arabicGlmExternal"] = (
        f"If the Arabic-GLM sidecar is not installed, run Arabic-GLM-OCR-v2 externally against {pages}, "
        "then compare its cleaned Arabic text against QARI/KATIB and the wired OCR benchmark."
    )
    commands["arabicQwen35External"] = (
        f"If the Arabic-Qwen sidecar is not installed, run Arabic-Qwen3.5-OCR-v4 externally against {pages}, "
        "then compare printed, handwritten, and diacritic-heavy Arabic output against the wired OCR benchmark."
    )
    commands["loayQwen25External"] = (
        f"Run loay/Arabic-OCR-Qwen2.5-VL-7B-Vision externally against {pages} only on a strong worker, "
        "then compare its Arabic OCR output against QARI, KATIB, Arabic-Qwen3.5, Baseer, and the wired OCR benchmark."
    )
    commands["dimiArabicOcrExternal"] = (
        f"Run DIMI Arabic OCR v2 externally against {pages} only on a strong worker, then compare printed Arabic, "
        "diacritics-heavy text, and formatting preservation against the wired OCR benchmark."
    )
    commands["baseerExternal"] = (
        f"If the Baseer sidecar is not installed, run Baseer OCR externally against {pages}, then compare "
        "complex-layout Arabic output against the wired OCR benchmark."
    )
    commands["atlasOcrExternal"] = (
        f"Run AtlasOCR externally against {pages} only for Darija/Moroccan Arabic PDFs, then compare "
        "against the wired OCR benchmark and confirm licensing before production wiring."
    )
    commands["ketabaExternal"] = (
        f"Run Ketaba-OCR LoRA externally against {pages}, then compare its cleaned Arabic text "
        "against the wired OCR benchmark before adding a sidecar."
    )
    commands["oiOcrExternal"] = (
        f"Run oi-OCR externally against {pages}, then compare structured Markdown/text extraction, "
        "Arabic reading order, and speech-readiness against the wired OCR benchmark."
    )
    commands["nuExtract3External"] = (
        f"Run numind/NuExtract3 externally in document-to-Markdown or content mode against {pages}, "
        "then compare Arabic text preservation, layout cleanup, tables/forms, and speech-readiness against the wired OCR benchmark."
    )
    commands["chandraExternal"] = (
        f"Run Chandra OCR 2 externally against {pages} for complex layouts, tables, forms, or mixed-language pages, "
        "then compare Arabic reading order and speech-readiness against the wired Arabic OCR benchmark before considering any hosted use."
    )
    commands["dotsOcrExternal"] = (
        f"Run rednote-hilab/dots.ocr externally against {pages} for document layout, reading order, tables, formulas, "
        "or mixed-language pages, then compare Arabic word preservation and speech-readiness against the wired Arabic OCR benchmark."
    )
    commands["olmocrArabicLoraExternal"] = (
        f"Run hastyle/olmOCR-arabic-lora-v2 externally against {pages} only for full-page Arabic manuscript scans "
        "on a large worker; compare it against Ketaba, QARI, line-cropped HAFITH/Glimpse, and the wired OCR benchmark."
    )
    commands["arabicLargeNougatExternal"] = (
        f"Run MohamedRashad/arabic-large-nougat externally against {pages} for Arabic book-page OCR-to-Markdown, "
        "then compare text preservation, reading order, hallucination risk, and speech-readiness against the wired OCR benchmark."
    )
    commands["doctrArabicExternal"] = (
        f"Run the DocTR Arabic FAST detector plus Arabic PARSEQ recognizer externally against {pages}, "
        "then compare classic OCR text ordering, Arabic word preservation, and recognizer license fit before any wiring."
    )
    commands["krakenExternal"] = (
        f"Run Kraken/eScriptorium externally against {pages} with an Arabic-script recognition model or "
        "line-cropped workflow when pages look like historical print/manuscripts; then compare Arabic word "
        "preservation and reading order against the wired OCR benchmark before any sidecar work."
    )
    commands["glmDocsExternal"] = (
        f"Run maloukafer/GLM-OCR-finetuned-documents externally against {pages} only for form-like, "
        "administrative, newspaper, or official-document PDFs; compare it against Arabic-GLM-OCR-v2 and the wired benchmark."
    )
    commands["mimohaOcrExternal"] = (
        f"Run mimoha/ocr externally against {pages} only as a low-priority sparse-card check, then compare "
        "the resulting Arabic text with the same speech-readiness score."
    )
    commands["handwritten4bitExternal"] = (
        f"Run sherif1313/Arabic-handwritten-OCR-4bit-Qwen2.5-VL-3B-v3 externally against {pages} when the "
        "PDF has handwriting or manuscript pages, then compare the smaller 4-bit output against handwritten-v3 and the wired OCR benchmark."
    )
    commands["nakbaManuscriptLineExternal"] = (
        f"Run U4RASD/ar-ms-baseline externally only on line-cropped manuscript images from {pages}; keep it "
        "as a NAKBA 2026 manuscript-line benchmark unless a separate layout step crops pages into text lines."
    )
    commands["hafithExternal"] = (
        f"Run mdnaseif/hafith externally only after cropping {pages} into text-line images; use it for "
        "historical Arabic manuscript or archival-print pages, then merge line outputs before scoring speech-readiness."
    )
    commands["glimpseRtlExternal"] = (
        f"Run surfiniaburger/unsloth_finetune_ocr_arabic externally only after cropping {pages} into "
        "Arabic/Persian text-line images; compare the merged RTL line text against HAFITH, NAKBA line OCR, and the wired benchmark."
    )
    commands["qwen25GgufExternal"] = (
        f"Run mo1998/arabic-ocr-qwen2.5-vl externally against {pages} as a QariOCR-trained GGUF/Unsloth "
        "benchmark, then compare scanned-book, religious-text, handwriting, and mixed Arabic-English output against QARI 0.4 and the wired OCR benchmark."
    )
    commands["tawkeedExternal"] = (
        f"If the Tawkeed sidecar is not installed, run Tawkeed OCR externally against {pages}, then compare "
        "Arabic document, handwriting, and scene-text output against QARI 0.4, KATIB, Arabic-Qwen, Baseer, and the wired OCR benchmark."
    )
    commands["falconExternal"] = (
        f"Run Falcon-OCR externally against {pages}, then compare Arabic word count, reading order, "
        "and speech-readiness against KATIB/QARI/PaddleOCR."
    )
    commands["scoreExternalText"] = score_command
    commands["promotionGate"] = promotion_gate
    return commands
def write_ocr_sample_report(path: Path, result: dict[str, Any]) -> None:
    """Write the Markdown report for an exported OCR page sample.

    The report lists the source PDF, the rendered images as a table, one
    fenced block per comparison command, and the promotion rule.

    Args:
        path: Report destination; missing parent directories are created.
        result: Mapping providing ``commands``, ``pdf``, ``imageDir``,
            ``images`` and ``zoom``. Every image entry must provide ``page``,
            ``path``, ``width``, ``height``, ``score``, ``arabicWords`` and
            ``inkRatio``.

    Raises:
        KeyError: If ``result``, an image entry, or ``result["commands"]``
            lacks a key the report needs.
    """
    commands = result["commands"]

    # (heading, code-fence language, key into ``commands``), in report order.
    sections = (
        ("Wired OCR benchmark:", "powershell", "wiredOcrBenchmark"),
        ("Arabic-GLM-OCR-v2:", "text", "arabicGlmExternal"),
        ("Arabic-Qwen3.5-OCR-v4:", "text", "arabicQwen35External"),
        ("Loay Arabic-OCR-Qwen2.5-VL-7B:", "text", "loayQwen25External"),
        ("DIMI Arabic OCR v2:", "text", "dimiArabicOcrExternal"),
        ("AtlasOCR:", "text", "atlasOcrExternal"),
        ("Ketaba-OCR LoRA:", "text", "ketabaExternal"),
        ("oi-OCR:", "text", "oiOcrExternal"),
        ("NuExtract3:", "text", "nuExtract3External"),
        ("Chandra OCR 2:", "text", "chandraExternal"),
        ("dots.ocr:", "text", "dotsOcrExternal"),
        ("olmOCR Arabic LoRA v2:", "text", "olmocrArabicLoraExternal"),
        ("Arabic Large Nougat:", "text", "arabicLargeNougatExternal"),
        ("DocTR Arabic FAST/PARSEQ:", "text", "doctrArabicExternal"),
        ("Kraken/eScriptorium Arabic script:", "text", "krakenExternal"),
        ("GLM-OCR Arabic/French documents:", "text", "glmDocsExternal"),
        ("mimoha Arabic OCR:", "text", "mimohaOcrExternal"),
        ("Arabic handwritten OCR 4-bit Qwen2.5-VL:", "text", "handwritten4bitExternal"),
        ("NAKBA Arabic manuscript line OCR baseline:", "text", "nakbaManuscriptLineExternal"),
        ("HAFITH:", "text", "hafithExternal"),
        ("Glimpse RTL OCR:", "text", "glimpseRtlExternal"),
        ("Arabic OCR Qwen2.5-VL GGUF:", "text", "qwen25GgufExternal"),
        ("Tawkeed OCR:", "text", "tawkeedExternal"),
        ("Falcon-OCR:", "text", "falconExternal"),
        ("Baseer OCR:", "text", "baseerExternal"),
        ("Score external OCR text outputs:", "powershell", "scoreExternalText"),
        ("Promotion gate for the winning OCR candidate:", "powershell", "promotionGate"),
    )

    report = [
        "# External Arabic OCR Sample",
        "",
        f"PDF: {result['pdf']}",
        f"Image directory: {result['imageDir']}",
        f"Pages: {', '.join(str(img['page']) for img in result['images'])}",
        f"Render zoom: {result['zoom']}",
        "",
        "Use these exact page images for every external OCR model. Do not compare models on different pages or different render scales.",
        "",
        "## Images",
        "",
        "| Page | PNG | Size | Score | Arabic Words | Ink Ratio |",
        "| --- | --- | --- | --- | --- | --- |",
    ]

    # One table row per rendered image.
    report += [
        f"| {img['page']} | {img['path']} | {img['width']}x{img['height']} | "
        f"{img['score']} | {img['arabicWords']} | {img['inkRatio']} |"
        for img in result["images"]
    ]

    report += ["", "## Comparison Commands"]
    for heading, fence, key in sections:
        report += ["", heading, "", f"```{fence}", commands[key], "```"]

    report += [
        "",
        "## Promotion Rule",
        "",
        "Replace the candidate name and license in the promotion-gate command with the real winning model. Promote an external OCR model only if it beats the wired Arabic OCR stack on these same pages, has an acceptable license, and the worker can handle its memory, cold start, and runtime.",
    ]

    path.parent.mkdir(parents=True, exist_ok=True)
    # Trailing whitespace is trimmed so the file ends with one newline.
    path.write_text("\n".join(report).rstrip() + "\n", encoding="utf-8")
def export_ocr_sample_images(
    pdf_path: Path,
    out_dir: Path = DEFAULT_OUT_DIR,
    count: int = 5,
    skip_first: int = 0,
    zoom: float = 2.0,
) -> dict[str, Any]:
    """Select pages from a PDF, render them to PNG, and write the sample report.

    Args:
        pdf_path: Source PDF; must exist and have a ``.pdf`` suffix.
        out_dir: Output directory. Images go to ``out_dir / "images"`` and the
            report to ``out_dir / "external-ocr-sample.md"``.
        count: Number of pages requested from ``select_pages``.
        skip_first: Forwarded to ``select_pages`` as ``skip_first``.
        zoom: Render scale forwarded to ``render_page_images``.

    Returns:
        A dict with the keys ``pdf``, ``imageDir``, ``reportPath``, ``zoom``,
        ``pages``, ``scores``, ``images`` and ``commands``.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        ValueError: If ``pdf_path`` lacks a ``.pdf`` suffix, ``count`` is below
            1, or ``zoom`` is not greater than 0.
    """
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")
    if pdf_path.suffix.lower() != ".pdf":
        raise ValueError("Input must be a PDF file.")
    if count < 1:
        raise ValueError("count must be at least 1")
    # Reject a bad zoom together with the other argument checks, using the
    # same error render_page_images raises, so the failure surfaces before
    # any page selection work is done.
    if zoom <= 0:
        raise ValueError("zoom must be greater than 0")

    selected = select_pages(pdf_path, count=count, skip_first=skip_first)
    image_dir = out_dir / "images"
    images = render_page_images(pdf_path, selected, image_dir, zoom=zoom)
    commands = build_external_ocr_commands(image_dir)
    report_path = out_dir / "external-ocr-sample.md"

    # Paths are stored as strings, presumably so the result can be passed to
    # json.dumps as-is.
    result: dict[str, Any] = {
        "pdf": str(pdf_path),
        "imageDir": str(image_dir),
        "reportPath": str(report_path),
        "zoom": zoom,
        "pages": [item.page for item in selected],
        "scores": [asdict(item) for item in selected],
        "images": images,
        "commands": commands,
    }
    write_ocr_sample_report(report_path, result)
    return result
def main_cli() -> None:
    """Parse the command line, export the OCR sample, and print a summary.

    With ``--json`` the full result mapping is printed as indented JSON;
    otherwise the report path and the rendered page numbers are printed.
    """
    # Switch stdout to UTF-8 where the stream supports reconfiguring;
    # characters that cannot be encoded are replaced instead of raising.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

    parser = argparse.ArgumentParser(
        description="Export selected Arabic PDF page images for external OCR benchmarking."
    )
    parser.add_argument("pdf", type=Path, help="Source Arabic PDF")
    option_specs = (
        ("--out-dir", {"type": Path, "default": DEFAULT_OUT_DIR, "help": "Output directory"}),
        ("--count", {"type": int, "default": 5, "help": "Number of pages to export"}),
        ("--skip-first", {"type": int, "default": 0, "help": "Ignore the first N pages before scoring"}),
        ("--zoom", {"type": float, "default": 2.0, "help": "Render zoom for PNG images"}),
        ("--json", {"action": "store_true", "help": "Print JSON details"}),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    options = parser.parse_args()

    result = export_ocr_sample_images(
        options.pdf,
        out_dir=options.out_dir,
        count=options.count,
        skip_first=options.skip_first,
        zoom=options.zoom,
    )

    if options.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
        return

    print(f"Wrote OCR image sample report: {result['reportPath']}")
    page_list = ", ".join(map(str, result["pages"]))
    print(f"Rendered pages: {page_list}")
# Run the command-line interface only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main_cli()
|