File size: 11,610 Bytes
2e1a095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
985cdbe
 
 
 
 
 
 
 
 
 
2e1a095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
985cdbe
 
 
 
 
2e1a095
 
 
 
 
 
 
 
 
985cdbe
 
 
 
 
 
 
 
 
 
 
 
2e1a095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
985cdbe
2e1a095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
from __future__ import annotations

import argparse
import json
import re
import sys
import tempfile
import time
from pathlib import Path
from typing import Any

import fitz

# Directory two levels up from this file (the parent of the folder that
# contains this script).
ROOT_DIR = Path(__file__).resolve().parent.parent
# Make ROOT_DIR importable before the project import below; presumably the
# `app` package lives directly under ROOT_DIR -- confirm against the repo layout.
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from app import main


# Parses an extraction label of the form
#   [<mode>:]<engine>[@<zoom>x][-psm<digits>]
# where <mode> is one of best / arabic / arabic-max, <engine> is lowercase
# letters and hyphens, <zoom> is digits and dots, and <psm> is digits.
# Named groups: mode, engine, zoom, psm (absent parts come back as None).
EXTRACTION_RE = re.compile(r"^(?:(?P<mode>best|arabic|arabic-max):)?(?P<engine>[a-z-]+)(?:@(?P<zoom>[0-9.]+)x)?(?:-psm(?P<psm>\d+))?$")


def make_limited_pdf(pdf_path: Path, page_limit: int | None) -> Path:
    if not page_limit:
        return pdf_path
    limited = Path(tempfile.gettempdir()) / f"{pdf_path.stem}-first-{page_limit}-pages.pdf"
    with fitz.open(pdf_path) as document:
        output = fitz.open()
        output.insert_pdf(document, from_page=0, to_page=min(page_limit, document.page_count) - 1)
        output.save(limited)
    return limited


def text_metrics(text: str) -> dict[str, Any]:
    """Summarise extracted text as a flat dictionary of quality metrics.

    The text is first passed through ``main.prepare_text_for_speech``; word and
    placeholder counts are taken from that prepared form, while the quality
    assessment comes from ``main.assess_text_quality``.

    Args:
        text: Raw text extracted from the PDF.

    Returns:
        A dict with character counts, Arabic word counts, placeholder and
        common-word ratios, the quality assessment fields, and 180-character
        previews of both the raw and the speech-prepared text.
    """
    spoken = main.prepare_text_for_speech(text)
    words = main.ARABIC_RE.findall(spoken)

    # "?" and U+FFFD are counted together as placeholder characters.
    placeholders = spoken.count("?") + spoken.count("\ufffd")
    common = len([word for word in words if word in main.COMMON_ARABIC_WORDS])

    assessment = main.assess_text_quality(text, spoken)
    detail = assessment["metrics"]

    # max(..., 1) keeps the ratios defined for empty text / no Arabic words.
    placeholder_ratio = round(placeholders / max(len(spoken), 1), 4)
    common_ratio = round(common / max(len(words), 1), 4)

    summary: dict[str, Any] = {}
    summary["characters"] = len(text)
    summary["speechCharacters"] = len(spoken)
    summary["arabicWords"] = len(words)
    summary["placeholderCharacters"] = placeholders
    summary["placeholderRatio"] = placeholder_ratio
    summary["commonArabicWords"] = common
    summary["commonArabicWordRatio"] = common_ratio
    summary["singleArabicWords"] = int(detail["singleArabicWords"])
    summary["singleArabicWordRatio"] = detail["singleArabicWordRatio"]
    summary["fragmentLines"] = int(detail["fragmentLines"])
    summary["fragmentLineRatio"] = detail["fragmentLineRatio"]
    summary["quality"] = assessment["quality"]
    summary["qualityScore"] = assessment["score"]
    summary["qualityReasons"] = assessment["reasons"]
    summary["preview"] = text[:180]
    summary["speechPreview"] = spoken[:180]
    return summary


def recommendation_for_extraction(extraction: str | None) -> dict[str, Any] | None:
    if not extraction:
        return None
    match = EXTRACTION_RE.match(extraction)
    if not match:
        return None

    engine = match.group("engine")
    zoom = match.group("zoom")
    psm = match.group("psm")
    env: dict[str, str] = {}
    notes: list[str] = []

    if engine == "embedded":
        return {
            "summary": "This PDF has readable embedded text; OCR settings are not needed.",
            "env": {},
            "notes": ["Use the normal upload flow."],
        }

    mode = match.group("mode")
    if mode in {"arabic", "arabic-max"}:
        env["OCR_ENGINE"] = mode
        if mode == "arabic-max":
            notes.append("Use Maximum Arabic OCR for the full run only if the worker can handle the heavier OCR comparison.")
        else:
            notes.append("Use Arabic specialist OCR for the full run; it compares Arabic-trained OCR outputs.")
        return {
            "summary": f"For the full book, use OCR_ENGINE={mode}.",
            "env": env,
            "notes": notes,
        }

    if engine not in {
        "easyocr",
        "qari-ocr",
        "tawkeed-ocr",
        "katib-ocr",
        "arabic-qwen-ocr",
        "arabic-glm-ocr",
        "baseer-ocr",
        "paddleocr",
        "paddleocr-vl",
        "surya",
        "tesseract",
    }:
        return None

    env["OCR_ENGINE"] = engine
    if zoom:
        if engine == "easyocr":
            env["EASYOCR_RENDER_ZOOM"] = zoom
        elif engine == "qari-ocr":
            env["QARI_OCR_RENDER_ZOOM"] = zoom
        elif engine == "tawkeed-ocr":
            env["TAWKEED_OCR_RENDER_ZOOM"] = zoom
        elif engine == "katib-ocr":
            env["KATIB_OCR_RENDER_ZOOM"] = zoom
        elif engine == "arabic-qwen-ocr":
            env["ARABIC_QWEN_OCR_RENDER_ZOOM"] = zoom
        elif engine == "arabic-glm-ocr":
            env["ARABIC_GLM_OCR_RENDER_ZOOM"] = zoom
        elif engine == "baseer-ocr":
            env["BASEER_OCR_RENDER_ZOOM"] = zoom
        elif engine == "paddleocr":
            env["PADDLEOCR_RENDER_ZOOM"] = zoom
        elif engine == "paddleocr-vl":
            env["PADDLEOCR_VL_RENDER_ZOOM"] = zoom
        elif engine == "surya":
            env["SURYA_RENDER_ZOOM"] = zoom
        elif engine == "tesseract":
            env["OCR_RENDER_ZOOM"] = zoom
    if psm and engine == "tesseract":
        env["TESSERACT_PSM"] = psm
    if engine == "tesseract-fast":
        env["OCR_ENGINE"] = engine
        env["OCR_RENDER_ZOOM"] = zoom or "1.5"
        env["TESSERACT_PSM"] = psm or "6"
        notes.append("Use this runner-up setting when speed matters and its sample text still sounds correct.")
        return {
            "summary": "For the full book, use OCR_ENGINE=tesseract-fast OCR_RENDER_ZOOM=1.5 TESSERACT_PSM=6.",
            "env": env,
            "notes": notes,
        }
    if engine == "tesseract":
        notes.append("Confirm Tesseract Arabic data is installed before the full run.")
    elif engine == "easyocr":
        notes.append("Use the EasyOCR/SILMA sidecar environment for the full run.")
    elif engine == "qari-ocr":
        notes.append("Use the QARI-OCR Arabic VLM sidecar on a GPU or strong worker; expect much higher RAM/runtime.")
    elif engine == "tawkeed-ocr":
        notes.append("Use the Tawkeed Arabic OCR sidecar when QARI 4B is too heavy; benchmark it on a short sample first.")
    elif engine == "katib-ocr":
        notes.append("Use the KATIB Arabic OCR sidecar for a smaller Arabic-trained VLM; benchmark it on a short sample first.")
    elif engine == "arabic-qwen-ocr":
        notes.append("Use the Arabic-Qwen3.5 OCR sidecar for a 0.9B Arabic-trained VLM; benchmark it on a short sample first.")
    elif engine == "arabic-glm-ocr":
        notes.append("Use the Arabic-GLM OCR sidecar for a recent Arabic-trained OCR VLM; benchmark it on a short sample first.")
    elif engine == "baseer-ocr":
        notes.append("Use the Baseer Arabic OCR sidecar for complex Arabic document layouts; benchmark it on a short sample first.")
    elif engine == "paddleocr":
        notes.append("Use the PaddleOCR sidecar environment for the full run.")
    elif engine == "paddleocr-vl":
        notes.append("Use the PaddleOCR-VL sidecar on a strong worker; expect much higher RAM/runtime than PaddleOCR.")
    elif engine == "surya":
        notes.append("Use the Surya heavy-worker sidecar; expect higher RAM/runtime than PaddleOCR.")

    env_text = " ".join(f"{key}={value}" for key, value in env.items())
    return {
        "summary": f"For the full book, use {env_text}.",
        "env": env,
        "notes": notes,
    }


def benchmark_engine(pdf_path: Path, engine: str) -> dict[str, Any]:
    """Run one OCR engine over ``pdf_path`` and report timing plus text metrics.

    ``main.OCR_ENGINE`` is switched to ``engine`` for the duration of the run
    and restored afterwards. Any exception raised by the extraction is caught
    and reported in the result instead of propagating.

    Args:
        pdf_path: PDF to run the engine on.
        engine: Engine name; ``"tesseract-fast"`` is run through
            ``main.ocr_pdf_text_with_tesseract`` with zoom 1.5 and PSM 6, all
            other names go through ``main.extract_pdf_text``.

    Returns:
        On success, a dict with ``ok=True``, elapsed seconds, page count, the
        extraction label, the fields from ``text_metrics`` and a
        ``recommendation``. On failure, a dict with ``ok=False``, elapsed
        seconds, page count and the ``error`` message.
    """
    saved_engine = main.OCR_ENGINE
    main.OCR_ENGINE = engine
    job = main.Job(id="dry-run", filename=pdf_path.name, ocr_engine=engine)
    is_fast_preset = engine == "tesseract-fast"
    t0 = time.perf_counter()

    try:
        if not is_fast_preset:
            text = main.extract_pdf_text(pdf_path, job)
        else:
            text = main.ocr_pdf_text_with_tesseract(pdf_path, job, render_zoom=1.5, psm=6)
            # Re-assert the preset name on the job after the Tesseract run.
            job.ocr_engine = engine

        # Stop the clock before computing metrics so they are not timed.
        duration = round(time.perf_counter() - t0, 2)
        outcome: dict[str, Any] = {
            "engine": engine,
            "ok": True,
            "seconds": duration,
            "pages": job.pages,
            "extraction": job.extraction,
        }
        outcome.update(text_metrics(text))

        if not is_fast_preset:
            outcome["recommendation"] = recommendation_for_extraction(job.extraction)
        else:
            # The fast preset carries a fixed recommendation.
            outcome["recommendation"] = {
                "summary": "For the full book, use OCR_ENGINE=tesseract-fast.",
                "env": {
                    "OCR_ENGINE": "tesseract-fast",
                    "OCR_RENDER_ZOOM": "1.5",
                    "TESSERACT_PSM": "6",
                },
                "notes": ["Use this runner-up setting when speed matters and its sample text still sounds correct."],
            }
        return outcome
    except Exception as exc:
        # Deliberately broad: one failing engine must not abort the benchmark.
        failure: dict[str, Any] = {
            "engine": engine,
            "ok": False,
            "seconds": round(time.perf_counter() - t0, 2),
            "pages": job.pages,
            "error": str(exc),
        }
        return failure
    finally:
        main.OCR_ENGINE = saved_engine


def print_table(results: list[dict[str, Any]]) -> None:
    """Print benchmark results as a fixed-width table on stdout.

    One row is printed per result. Fields missing from a result (as happens
    for failed runs) fall back to ``0`` or ``"-"``. After the table, the
    recommendation summary of the best successful result is printed, where
    "best" means highest ``qualityScore`` with ``arabicWords`` as tie-breaker.

    Args:
        results: Result dicts; each needs ``engine``, ``ok`` and ``seconds``.
    """

    def rank(entry: dict[str, Any]) -> tuple[Any, Any]:
        return (entry.get("qualityScore", 0), entry.get("arabicWords", 0))

    print("engine         ok    sec    pages  chars   words   quality  score    extraction")
    print("-------------  ----  -----  -----  ------  ------  -------  -------  ----------")

    for row in results:
        cells = [
            f"{row['engine']:<13}",
            f"{str(row['ok']):<4}",
            f"{row['seconds']:>5}",
            f"{row.get('pages', 0):>5}",
            f"{row.get('characters', 0):>6}",
            f"{row.get('arabicWords', 0):>6}",
            f"{row.get('quality', '-'):>7}",
            f"{row.get('qualityScore', 0):>7}",
            f"{row.get('extraction', '-')}",
        ]
        # Columns are separated by exactly two spaces.
        print("  ".join(cells))

    winners = [row for row in results if row.get("ok")]
    if not winners:
        return

    top = max(winners, key=rank)
    advice = top.get("recommendation")
    if not advice:
        return

    print()
    print(f"Best full-book setting from this sample: {advice['summary']}")


def main_cli() -> None:
    """Command-line entry point: benchmark the selected OCR engines on one PDF.

    Parses ``sys.argv``, optionally trims the PDF to its first N pages, runs
    every requested engine and prints either a compact table or full JSON.

    Raises:
        FileNotFoundError: If the given PDF path does not exist.
        ValueError: If ``--page-limit`` is below 1.
    """
    # Switch stdout/stderr to UTF-8 with replacement where the stream
    # supports reconfiguring.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    engine_choices = [
        "arabic",
        "arabic-max",
        "qari-ocr",
        "tawkeed-ocr",
        "katib-ocr",
        "arabic-qwen-ocr",
        "arabic-glm-ocr",
        "baseer-ocr",
        "easyocr",
        "paddleocr",
        "paddleocr-vl",
        "surya",
        "tesseract",
        "tesseract-fast",
        "auto",
        "best",
    ]

    parser = argparse.ArgumentParser(description="Benchmark Arabic OCR engines on the same PDF.")
    parser.add_argument("pdf", type=Path, help="Arabic PDF to benchmark")
    parser.add_argument(
        "--engines",
        nargs="+",
        default=["easyocr", "paddleocr", "tesseract"],
        choices=engine_choices,
    )
    parser.add_argument("--page-limit", type=int, default=1, help="Benchmark only the first N pages by default.")
    parser.add_argument("--json", action="store_true", help="Print full JSON results instead of a compact table.")
    options = parser.parse_args()

    source_pdf = options.pdf
    limit = options.page_limit
    if not source_pdf.exists():
        raise FileNotFoundError(f"PDF not found: {source_pdf}")
    if limit is not None and limit < 1:
        raise ValueError("--page-limit must be 1 or greater.")

    sample_pdf = make_limited_pdf(source_pdf, limit)
    results = []
    try:
        for engine in options.engines:
            results.append(benchmark_engine(sample_pdf, engine))
    finally:
        # Only delete the trimmed temporary copy, never the user's own file.
        if sample_pdf != source_pdf:
            sample_pdf.unlink(missing_ok=True)

    if options.json:
        print(json.dumps(results, ensure_ascii=False, indent=2))
        return

    print_table(results)
    print()
    print("Tip: use --json to inspect text previews and errors.")


if __name__ == "__main__":
    main_cli()