File size: 13,862 Bytes
2e1a095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
from __future__ import annotations

import argparse
import json
import os
import platform
import sqlite3
import subprocess
import sys
import tempfile
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Literal

# Directory two levels above this file (the parent of the directory that
# contains this script) -- presumably the project root; confirm against the
# repository layout.
ROOT_DIR = Path(__file__).resolve().parent.parent
# Put that directory first on sys.path so the `app` package imported below can
# be resolved. This must run before the import that follows.
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from app import main

# The three outcomes a preflight check can report.
Status = Literal["PASS", "WARN", "FAIL"]


@dataclass
class Check:
    """One row of the preflight report."""

    # Section the check is grouped under, e.g. "Runtime", "Storage", "OCR".
    category: str
    # Short label for the thing being checked.
    name: str
    # Outcome of the check: "PASS", "WARN", or "FAIL".
    status: Status
    # Human-readable explanation: a version, a path, or a remediation hint.
    detail: str


def env_is_set(name: str) -> bool:
    """Return True when environment variable *name* exists and is non-empty.

    A variable that is defined but set to the empty string counts as unset.
    """
    value = os.environ.get(name)
    return value is not None and value != ""


def can_write_to_dir(path: Path) -> bool:
    """Report whether a file can be created and written inside *path*.

    The directory (and any missing parents) is created as a side effect.
    A throwaway ``.preflight-*`` file is written and removed again when it is
    closed. Any ``OSError`` along the way is reported as ``False``.
    """
    try:
        path.mkdir(parents=True, exist_ok=True)
        probe = tempfile.NamedTemporaryFile(prefix=".preflight-", dir=path, delete=True)
        with probe:
            probe.write(b"ok")
    except OSError:
        return False
    return True


def can_open_sqlite(path: Path) -> bool:
    """Report whether a SQLite database at *path* can be opened and queried.

    The parent directory is created if it is missing, and connecting may
    create the database file itself as a side effect.

    Args:
        path: Location of the SQLite database file.

    Returns:
        True when a connection can be opened and ``select 1`` executes;
        False on any ``sqlite3.Error`` or ``OSError``.
    """
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        connection = sqlite3.connect(path)
        try:
            connection.execute("select 1")
        finally:
            # Always release the handle, even when the probe query fails;
            # an unclosed connection keeps the file open (and locked on
            # Windows) until garbage collection.
            connection.close()
    except (sqlite3.Error, OSError):
        return False
    return True


def run_version(command: list[str], timeout: int = 8) -> str | None:
    try:
        result = subprocess.run(
            command,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )
    except (OSError, subprocess.TimeoutExpired):
        return None
    output = (result.stdout or result.stderr).strip().splitlines()
    return output[0].strip() if result.returncode == 0 and output else None


def mask_path(path: str | Path | None) -> str:
    if not path:
        return "not configured"
    return str(path)


def collect_checks() -> list[Check]:
    """Run every preflight probe and return the results in report order.

    Checks are appended section by section: Runtime, Security, Storage, OCR,
    TTS, Audio, Deployment. All configuration values and discovery helpers
    are read from the imported ``main`` module.

    Side effects: the storage probes create the upload/output/data
    directories (and the database's parent directory) when they are missing.

    Returns:
        A list of ``Check`` rows. Only the Python-version and storage probes
        can produce ``FAIL``; everything else reports ``PASS`` or ``WARN``.
    """
    checks: list[Check] = []

    # --- Runtime ---------------------------------------------------------
    python_ok = sys.version_info >= (3, 10)
    checks.append(
        Check(
            "Runtime",
            "Python",
            "PASS" if python_ok else "FAIL",
            f"{platform.python_version()} ({'supported' if python_ok else 'requires 3.10+'})",
        )
    )
    # Always PASS: reports the first element of `main.fitz.version`, or
    # "unknown" when that attribute is absent.
    checks.append(Check("Runtime", "PyMuPDF", "PASS", f"fitz {getattr(main.fitz, 'version', ['unknown'])[0]}"))

    # --- Security --------------------------------------------------------
    # Both checks only warn when the value equals the hard-coded development
    # default; any other value (including an empty one) is reported as PASS.
    access_status: Status = "PASS" if main.ACCESS_CODE != "1234" else "WARN"
    checks.append(
        Check(
            "Security",
            "Access code",
            access_status,
            "configured; using development code 1234" if main.ACCESS_CODE == "1234" else "configured",
        )
    )
    secret_status: Status = "PASS" if main.SECRET_KEY != "dev-secret-change-me" else "WARN"
    checks.append(
        Check(
            "Security",
            "Secret key",
            secret_status,
            "configured" if secret_status == "PASS" else "using development fallback; set SECRET_KEY in .env/Vercel",
        )
    )

    # --- Storage ---------------------------------------------------------
    # An unwritable directory or unopenable database is a hard FAIL.
    for label, path in (
        ("Uploads directory", main.UPLOAD_DIR),
        ("Outputs directory", main.OUTPUT_DIR),
        ("Data directory", main.DATA_DIR),
    ):
        writable = can_write_to_dir(path)
        checks.append(Check("Storage", label, "PASS" if writable else "FAIL", mask_path(path)))

    db_ok = can_open_sqlite(main.DB_PATH)
    checks.append(Check("Storage", "SQLite database", "PASS" if db_ok else "FAIL", mask_path(main.DB_PATH)))
    # Retention settings below 1 are only a warning, not a failure.
    retention_ok = main.OUTPUT_RETENTION_DAYS >= 1 and main.OUTPUT_MAX_FILES >= 1
    checks.append(
        Check(
            "Storage",
            "Audio retention",
            "PASS" if retention_ok else "WARN",
            f"{main.OUTPUT_RETENTION_DAYS} days, newest {main.OUTPUT_MAX_FILES} files",
        )
    )

    # --- OCR -------------------------------------------------------------
    # Each discovery helper's result is used both as a truthiness flag
    # ("is this engine available?") and as the value shown in the detail
    # column -- presumably an interpreter/executable path; confirm in app.main.
    easyocr_python = main.find_easyocr_python()
    paddleocr_python = main.find_paddleocr_python()
    paddleocr_vl_python = main.find_paddleocr_vl_python()
    qari_ocr_python = main.find_qari_ocr_python()
    tawkeed_ocr_python = main.find_tawkeed_ocr_python()
    katib_ocr_python = main.find_katib_ocr_python()
    arabic_qwen_ocr_python = main.find_arabic_qwen_ocr_python()
    arabic_glm_ocr_python = main.find_arabic_glm_ocr_python()
    baseer_ocr_python = main.find_baseer_ocr_python()
    surya_python = main.find_surya_python()
    tesseract_path = main.find_tesseract()
    tessdata_dir = main.get_tessdata_dir()
    # Availability flags are passed positionally.
    # NOTE(review): the order must match the parameter order of
    # get_preferred_ocr_engine in app.main -- verify when adding an engine.
    preferred_ocr = main.get_preferred_ocr_engine(
        bool(easyocr_python),
        bool(paddleocr_python),
        bool(paddleocr_vl_python),
        bool(qari_ocr_python),
        bool(tawkeed_ocr_python),
        bool(katib_ocr_python),
        bool(arabic_qwen_ocr_python),
        bool(arabic_glm_ocr_python),
        bool(baseer_ocr_python),
        bool(surya_python),
        bool(tesseract_path),
    )
    checks.append(Check("OCR", "Embedded PDF text", "PASS", "available through PyMuPDF"))
    # True when at least one of the engines listed here was found. EasyOCR,
    # PaddleOCR-VL, Surya and Tesseract are not counted in this summary row.
    trained_arabic_ready = bool(
        qari_ocr_python
        or tawkeed_ocr_python
        or katib_ocr_python
        or arabic_qwen_ocr_python
        or arabic_glm_ocr_python
        or baseer_ocr_python
        or paddleocr_python
    )
    checks.append(
        Check(
            "OCR",
            "Arabic-trained scanned PDF OCR",
            "PASS" if trained_arabic_ready else "WARN",
            "QARI/Tawkeed/KATIB/Arabic-Qwen/Arabic-GLM/Baseer/PaddleOCR Arabic available"
            if trained_arabic_ready
            else "not installed; run scripts/setup_paddleocr.ps1 first, then optionally scripts/setup_tawkeed_ocr.ps1, scripts/setup_katib_ocr.ps1, scripts/setup_arabic_qwen_ocr.ps1, scripts/setup_arabic_glm_ocr.ps1, scripts/setup_baseer_ocr.ps1, or scripts/setup_qari_ocr.ps1",
        )
    )
    # Per-engine rows: PASS with the discovered value, otherwise WARN with
    # the setup script to run.
    checks.append(
        Check(
            "OCR",
            "EasyOCR Arabic",
            "PASS" if easyocr_python else "WARN",
            mask_path(easyocr_python) if easyocr_python else "not installed; run scripts/setup_silma.ps1",
        )
    )
    checks.append(
        Check(
            "OCR",
            "PaddleOCR Arabic",
            "PASS" if paddleocr_python else "WARN",
            mask_path(paddleocr_python) if paddleocr_python else "Arabic specialist OCR uses this model; run scripts/setup_paddleocr.ps1",
        )
    )
    checks.append(
        Check(
            "OCR",
            "QARI-OCR Arabic book VLM",
            "PASS" if qari_ocr_python else "WARN",
            mask_path(qari_ocr_python) if qari_ocr_python else "optional Arabic-book heavy OCR; run scripts/setup_qari_ocr.ps1",
        )
    )
    checks.append(
        Check(
            "OCR",
            "Tawkeed Arabic OCR VLM",
            "PASS" if tawkeed_ocr_python else "WARN",
            mask_path(tawkeed_ocr_python) if tawkeed_ocr_python else "optional Arabic-first OCR; run scripts/setup_tawkeed_ocr.ps1",
        )
    )
    checks.append(
        Check(
            "OCR",
            "KATIB Arabic OCR VLM",
            "PASS" if katib_ocr_python else "WARN",
            mask_path(katib_ocr_python) if katib_ocr_python else "optional smaller Arabic-trained OCR; run scripts/setup_katib_ocr.ps1",
        )
    )
    checks.append(
        Check(
            "OCR",
            "Arabic-Qwen3.5 OCR VLM",
            "PASS" if arabic_qwen_ocr_python else "WARN",
            mask_path(arabic_qwen_ocr_python)
            if arabic_qwen_ocr_python
            else "optional 0.9B Arabic OCR VLM; run scripts/setup_arabic_qwen_ocr.ps1",
        )
    )
    checks.append(
        Check(
            "OCR",
            "Arabic-GLM OCR VLM",
            "PASS" if arabic_glm_ocr_python else "WARN",
            mask_path(arabic_glm_ocr_python)
            if arabic_glm_ocr_python
            else "optional recent Arabic OCR VLM; run scripts/setup_arabic_glm_ocr.ps1",
        )
    )
    checks.append(
        Check(
            "OCR",
            "Baseer Arabic OCR VLM",
            "PASS" if baseer_ocr_python else "WARN",
            mask_path(baseer_ocr_python)
            if baseer_ocr_python
            else "optional Arabic document OCR VLM; run scripts/setup_baseer_ocr.ps1",
        )
    )
    checks.append(
        Check(
            "OCR",
            "PaddleOCR-VL heavy worker",
            "PASS" if paddleocr_vl_python else "WARN",
            mask_path(paddleocr_vl_python) if paddleocr_vl_python else "optional heavy OCR; run scripts/setup_paddleocr_vl.ps1",
        )
    )
    checks.append(
        Check(
            "OCR",
            "Surya OCR heavy worker",
            "PASS" if surya_python else "WARN",
            mask_path(surya_python) if surya_python else "optional high-accuracy heavy OCR; run scripts/setup_surya.ps1",
        )
    )
    # Tesseract needs both the executable and a tessdata directory to PASS.
    checks.append(
        Check(
            "OCR",
            "Tesseract Arabic",
            "PASS" if tesseract_path and tessdata_dir else "WARN",
            f"{mask_path(tesseract_path)}; tessdata={mask_path(tessdata_dir)}"
            if tesseract_path and tessdata_dir
            else "optional fallback; install Tesseract with ara.traineddata",
        )
    )
    # A falsy preferred engine means no OCR engine was selected.
    checks.append(
        Check(
            "OCR",
            "Preferred OCR mode",
            "PASS" if preferred_ocr else "WARN",
            preferred_ocr or "only embedded text is available; scanned PDFs need Arabic OCR",
        )
    )

    # --- TTS -------------------------------------------------------------
    silma_python = main.find_silma_python()
    habibi_python = main.find_habibi_python()
    supertonic_python = main.find_supertonic_python()
    espeak_path = main.find_espeak_ng()
    # NOTE(review): `shutil` and `importlib.util` are reached through the
    # `main` module's namespace rather than imported here, so these two lines
    # rely on app.main importing them.
    piper_path = main.shutil.which("piper")
    piper_model = Path(main.PIPER_MODEL).exists() if main.PIPER_MODEL else False
    pyttsx3_ready = main.importlib.util.find_spec("pyttsx3") is not None
    checks.append(
        Check(
            "TTS",
            "SILMA Arabic voice",
            "PASS" if silma_python else "WARN",
            mask_path(silma_python) if silma_python else "best free Arabic voice not installed; run scripts/setup_silma.ps1",
        )
    )
    checks.append(
        Check(
            "TTS",
            "Habibi Arabic MSA voice",
            "PASS" if habibi_python else "WARN",
            mask_path(habibi_python) if habibi_python else "optional newer MSA voice; run scripts/setup_habibi.ps1",
        )
    )
    checks.append(
        Check(
            "TTS",
            "Supertonic Arabic CPU voice",
            "PASS" if supertonic_python else "WARN",
            mask_path(supertonic_python) if supertonic_python else "optional fast CPU voice; run scripts/setup_supertonic.ps1",
        )
    )
    checks.append(
        Check(
            "TTS",
            "eSpeak NG fallback",
            "PASS" if espeak_path else "WARN",
            mask_path(espeak_path) if espeak_path else "not installed; useful fast fallback",
        )
    )
    # PASS requires both the `piper` executable and an existing model file.
    # NOTE(review): the detail text branches on piper_path alone, so a missing
    # executable is reported as "no Arabic Piper model configured", and a
    # configured-but-missing model file is still printed as if present.
    checks.append(
        Check(
            "TTS",
            "Piper voice",
            "PASS" if piper_path and piper_model else "WARN",
            f"{mask_path(piper_path)}; model={mask_path(main.PIPER_MODEL)}" if piper_path else "optional; no Arabic Piper model configured",
        )
    )
    checks.append(Check("TTS", "pyttsx3 fallback", "PASS" if pyttsx3_ready else "WARN", "available" if pyttsx3_ready else "not importable"))

    # --- Audio -----------------------------------------------------------
    ffmpeg_path = main.find_ffmpeg()
    # Any configured format other than "wav" or "mp3" is treated as "wav".
    audio_format = main.AUDIO_FORMAT if main.AUDIO_FORMAT in {"wav", "mp3"} else "wav"
    # ffmpeg only matters for non-wav output: wav always passes.
    ffmpeg_status: Status = "PASS" if audio_format == "wav" or ffmpeg_path else "WARN"
    checks.append(
        Check(
            "Audio",
            "Output format",
            ffmpeg_status,
            f"{audio_format}; ffmpeg={mask_path(ffmpeg_path)}",
        )
    )

    # --- Deployment ------------------------------------------------------
    # A worker URL is only expected when running on Vercel; local mode passes.
    if main.IS_VERCEL:
        deployment_status: Status = "PASS" if main.WORKER_BASE_URL else "WARN"
        deployment_detail = "worker configured" if main.WORKER_BASE_URL else "Vercel mode needs WORKER_BASE_URL for 100 MB+ PDFs"
    else:
        deployment_status = "PASS"
        deployment_detail = "local mode"
    checks.append(Check("Deployment", "Worker URL", deployment_status, deployment_detail))
    # Missing CORS origins only warn when a worker URL is configured.
    checks.append(
        Check(
            "Deployment",
            "CORS origins",
            "PASS" if main.CORS_ORIGINS or not main.WORKER_BASE_URL else "WARN",
            ", ".join(main.CORS_ORIGINS) if main.CORS_ORIGINS else "not configured",
        )
    )

    return checks


def summarize(checks: list[Check]) -> dict[str, object]:
    """Condense *checks* into a JSON-serialisable report.

    Returns a dict with three keys: ``ready`` (True when nothing failed),
    ``counts`` (number of PASS/WARN/FAIL rows), and ``checks`` (each row
    converted to a plain dict). A status outside the three known values
    raises ``KeyError``.
    """
    tally = dict.fromkeys(("PASS", "WARN", "FAIL"), 0)
    for row in checks:
        tally[row.status] += 1
    return {
        "ready": not tally["FAIL"],
        "counts": tally,
        "checks": list(map(asdict, checks)),
    }


def print_table(checks: list[Check]) -> None:
    """Print the human-readable report to standard output.

    Emits a title line, a one-line summary (readiness plus PASS/WARN/FAIL
    counts), a blank line, and then one left-aligned row per check.
    """
    report = summarize(checks)
    tally = report["counts"]
    verdict = "yes" if report["ready"] else "no"

    print("Arabic PDF Reader preflight")
    print(f"Ready: {verdict}  PASS={tally['PASS']} WARN={tally['WARN']} FAIL={tally['FAIL']}")
    print()
    for row in checks:
        # Fixed column widths: status 4, category 10, name 20.
        print(f"[{row.status:<4}] {row.category:<10} {row.name:<20} {row.detail}")


def exit_code(checks: list[Check], strict: bool = False) -> int:
    """Map the check results to a process exit status.

    Returns 1 when any check failed, 2 when *strict* is set and any check
    warned (and none failed), and 0 otherwise.
    """
    seen = {row.status for row in checks}
    if "FAIL" in seen:
        return 1
    if strict and "WARN" in seen:
        return 2
    return 0


def main_cli() -> int:
    """Parse command-line flags, run the checks, print them, and return the exit status.

    ``--json`` switches the output from the text table to indented JSON;
    ``--strict`` makes warnings produce a non-zero status.
    """
    cli = argparse.ArgumentParser(description="Check local OCR, TTS, storage, and deployment readiness.")
    for flag, help_text in (
        ("--json", "Print machine-readable JSON."),
        ("--strict", "Return non-zero when warnings are present."),
    ):
        cli.add_argument(flag, action="store_true", help=help_text)
    options = cli.parse_args()

    results = collect_checks()
    if not options.json:
        print_table(results)
    else:
        print(json.dumps(summarize(results), ensure_ascii=False, indent=2))
    return exit_code(results, strict=options.strict)


# Script entry point: run the CLI and use its return value as the process
# exit status.
if __name__ == "__main__":
    raise SystemExit(main_cli())