# Source: arabic-audio-reader-worker/scripts/preflight_check.py
# Page header residue: "Syncre's picture"
# Commit message: "Deploy Arabic Audio Reader worker"
# Commit: 2e1a095 (verified)
from __future__ import annotations
import argparse
import json
import os
import platform
import sqlite3
import subprocess
import sys
import tempfile
from dataclasses import asdict, dataclass
from pathlib import Path
from typing import Literal
# Directory two levels above this file (the parent of the folder holding it).
ROOT_DIR = Path(__file__).resolve().parents[1]

# Make that directory importable so `app` resolves when this file is run
# directly as a script; it is added at most once and takes priority.
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

# Application module whose settings and discovery helpers are inspected below.
from app import main

# The only outcomes a check may report.
Status = Literal["PASS", "WARN", "FAIL"]
@dataclass
class Check:
    """One row of the preflight report: what was probed and how it turned out."""

    # Grouping label shown in the report; this file uses "Runtime", "Security",
    # "Storage", "OCR", "TTS", "Audio" and "Deployment".
    category: str
    # Short display name of the thing that was checked.
    name: str
    # "PASS", "WARN" or "FAIL". Any "FAIL" makes summarize() report
    # ready=False and exit_code() return 1.
    status: Status
    # Human-readable explanation: a resolved path, a version, or a setup hint.
    detail: str
def env_is_set(name: str) -> bool:
    """Report whether environment variable *name* holds a non-empty value.

    Both an unset variable and one set to the empty string yield ``False``.
    """
    value = os.environ.get(name)
    return value is not None and value != ""
def can_write_to_dir(path: Path) -> bool:
    """Check that *path* can be created (if needed) and written to.

    The directory, including missing parents, is created as a side effect.
    A throw-away ``.preflight-*`` file is written inside it and removed
    again when it is closed.

    Returns:
        ``True`` when both the directory creation and the probe write
        succeed, ``False`` when either raises ``OSError``.
    """
    try:
        path.mkdir(parents=True, exist_ok=True)
        with tempfile.NamedTemporaryFile(prefix=".preflight-", dir=path, delete=True) as probe:
            probe.write(b"ok")
    except OSError:
        return False
    # Reached only after the probe file was closed (and thereby deleted).
    return True
def can_open_sqlite(path: Path) -> bool:
    """Check that *path* can be opened as a SQLite database.

    The parent directory is created when missing, and connecting creates an
    empty database file if none exists yet (same side effects as before).

    Unlike a bare ``select 1`` (which SQLite answers without reading the
    file), the probe reads the schema cookie, forcing SQLite to read and
    validate the file header. A file that exists but is not a SQLite
    database is therefore reported as unusable instead of passing.

    Returns:
        ``True`` when the database opens and its header is readable,
        ``False`` on any ``sqlite3.Error`` or ``OSError``.
    """
    try:
        path.parent.mkdir(parents=True, exist_ok=True)
        connection = sqlite3.connect(path)
    except (sqlite3.Error, OSError):
        return False
    try:
        connection.execute("PRAGMA schema_version").fetchone()
    except (sqlite3.Error, OSError):
        return False
    else:
        return True
    finally:
        # Always release the handle, including when the probe query fails.
        connection.close()
def run_version(command: list[str], timeout: int = 8) -> str | None:
    """Run a ``--version``-style command and return its first output line.

    Args:
        command: Program and arguments, executed without a shell.
        timeout: Seconds to wait before giving up.

    Returns:
        The first non-blank line of stdout, stripped; if stdout is empty or
        only whitespace, the first non-blank line of stderr (some tools
        print their version there). ``None`` when the command is empty,
        cannot be started, times out, exits non-zero, or prints nothing.
    """
    if not command:
        # subprocess would fail on an empty argument list; treat it as "not found".
        return None
    try:
        result = subprocess.run(
            command,
            # Never let a tool that reads stdin stall the preflight until the timeout.
            stdin=subprocess.DEVNULL,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )
    except (OSError, ValueError, subprocess.TimeoutExpired):
        return None
    if result.returncode != 0:
        return None
    # Strip each stream before deciding it is empty, so a blank stdout does
    # not hide a version string that was written to stderr.
    for stream in (result.stdout, result.stderr):
        lines = (stream or "").strip().splitlines()
        if lines:
            return lines[0].strip()
    return None
def mask_path(path: str | Path | None) -> str:
    """Render an optional path for the report.

    Falsy input (``None`` or an empty string) becomes ``"not configured"``;
    anything else is returned through ``str()`` unchanged. Despite the
    name, no part of the path is redacted.
    """
    return str(path) if path else "not configured"
def _tool_check(category: str, name: str, location: str | Path | None, missing_detail: str) -> Check:
    """Build the check for an optional tool: PASS with its location, else WARN with a hint."""
    if location:
        return Check(category, name, "PASS", mask_path(location))
    return Check(category, name, "WARN", missing_detail)


def _runtime_checks() -> list[Check]:
    """Report the interpreter version and the PyMuPDF module exposed by ``main``."""
    python_ok = sys.version_info >= (3, 10)
    python_note = "supported" if python_ok else "requires 3.10+"
    return [
        Check(
            "Runtime",
            "Python",
            "PASS" if python_ok else "FAIL",
            f"{platform.python_version()} ({python_note})",
        ),
        Check("Runtime", "PyMuPDF", "PASS", f"fitz {getattr(main.fitz, 'version', ['unknown'])[0]}"),
    ]


def _security_checks() -> list[Check]:
    """Warn when the access code or secret key still equals its development default."""
    using_dev_code = main.ACCESS_CODE == "1234"
    using_dev_secret = main.SECRET_KEY == "dev-secret-change-me"
    return [
        Check(
            "Security",
            "Access code",
            "WARN" if using_dev_code else "PASS",
            "configured; using development code 1234" if using_dev_code else "configured",
        ),
        Check(
            "Security",
            "Secret key",
            "WARN" if using_dev_secret else "PASS",
            "using development fallback; set SECRET_KEY in .env/Vercel" if using_dev_secret else "configured",
        ),
    ]


def _storage_checks() -> list[Check]:
    """Probe the writable directories, the SQLite file and the retention settings."""
    results: list[Check] = []
    for label, path in (
        ("Uploads directory", main.UPLOAD_DIR),
        ("Outputs directory", main.OUTPUT_DIR),
        ("Data directory", main.DATA_DIR),
    ):
        # A directory the app cannot write to is a hard failure, not a warning.
        writable = can_write_to_dir(path)
        results.append(Check("Storage", label, "PASS" if writable else "FAIL", mask_path(path)))

    db_ok = can_open_sqlite(main.DB_PATH)
    results.append(Check("Storage", "SQLite database", "PASS" if db_ok else "FAIL", mask_path(main.DB_PATH)))

    retention_ok = main.OUTPUT_RETENTION_DAYS >= 1 and main.OUTPUT_MAX_FILES >= 1
    results.append(
        Check(
            "Storage",
            "Audio retention",
            "PASS" if retention_ok else "WARN",
            f"{main.OUTPUT_RETENTION_DAYS} days, newest {main.OUTPUT_MAX_FILES} files",
        )
    )
    return results


def _ocr_checks() -> list[Check]:
    """Discover every OCR backend and report each one plus the preferred engine."""
    # Discovery runs up front, in a fixed order, before any check is built.
    easyocr_python = main.find_easyocr_python()
    paddleocr_python = main.find_paddleocr_python()
    paddleocr_vl_python = main.find_paddleocr_vl_python()
    qari_ocr_python = main.find_qari_ocr_python()
    tawkeed_ocr_python = main.find_tawkeed_ocr_python()
    katib_ocr_python = main.find_katib_ocr_python()
    arabic_qwen_ocr_python = main.find_arabic_qwen_ocr_python()
    arabic_glm_ocr_python = main.find_arabic_glm_ocr_python()
    baseer_ocr_python = main.find_baseer_ocr_python()
    surya_python = main.find_surya_python()
    tesseract_path = main.find_tesseract()
    tessdata_dir = main.get_tessdata_dir()

    # The availability flags are positional; keep this order in sync with
    # main.get_preferred_ocr_engine's signature.
    preferred_ocr = main.get_preferred_ocr_engine(
        bool(easyocr_python),
        bool(paddleocr_python),
        bool(paddleocr_vl_python),
        bool(qari_ocr_python),
        bool(tawkeed_ocr_python),
        bool(katib_ocr_python),
        bool(arabic_qwen_ocr_python),
        bool(arabic_glm_ocr_python),
        bool(baseer_ocr_python),
        bool(surya_python),
        bool(tesseract_path),
    )

    # Any one of these backends is enough for the aggregate check to pass.
    trained_arabic_ready = bool(
        qari_ocr_python
        or tawkeed_ocr_python
        or katib_ocr_python
        or arabic_qwen_ocr_python
        or arabic_glm_ocr_python
        or baseer_ocr_python
        or paddleocr_python
    )
    if trained_arabic_ready:
        trained_detail = "QARI/Tawkeed/KATIB/Arabic-Qwen/Arabic-GLM/Baseer/PaddleOCR Arabic available"
    else:
        trained_detail = "not installed; run scripts/setup_paddleocr.ps1 first, then optionally scripts/setup_tawkeed_ocr.ps1, scripts/setup_katib_ocr.ps1, scripts/setup_arabic_qwen_ocr.ps1, scripts/setup_arabic_glm_ocr.ps1, scripts/setup_baseer_ocr.ps1, or scripts/setup_qari_ocr.ps1"

    # Tesseract only counts when both the binary and its tessdata directory were found.
    tesseract_ready = bool(tesseract_path and tessdata_dir)
    if tesseract_ready:
        tesseract_detail = f"{mask_path(tesseract_path)}; tessdata={mask_path(tessdata_dir)}"
    else:
        tesseract_detail = "optional fallback; install Tesseract with ara.traineddata"

    return [
        Check("OCR", "Embedded PDF text", "PASS", "available through PyMuPDF"),
        Check(
            "OCR",
            "Arabic-trained scanned PDF OCR",
            "PASS" if trained_arabic_ready else "WARN",
            trained_detail,
        ),
        _tool_check("OCR", "EasyOCR Arabic", easyocr_python, "not installed; run scripts/setup_silma.ps1"),
        _tool_check(
            "OCR",
            "PaddleOCR Arabic",
            paddleocr_python,
            "Arabic specialist OCR uses this model; run scripts/setup_paddleocr.ps1",
        ),
        _tool_check(
            "OCR",
            "QARI-OCR Arabic book VLM",
            qari_ocr_python,
            "optional Arabic-book heavy OCR; run scripts/setup_qari_ocr.ps1",
        ),
        _tool_check(
            "OCR",
            "Tawkeed Arabic OCR VLM",
            tawkeed_ocr_python,
            "optional Arabic-first OCR; run scripts/setup_tawkeed_ocr.ps1",
        ),
        _tool_check(
            "OCR",
            "KATIB Arabic OCR VLM",
            katib_ocr_python,
            "optional smaller Arabic-trained OCR; run scripts/setup_katib_ocr.ps1",
        ),
        _tool_check(
            "OCR",
            "Arabic-Qwen3.5 OCR VLM",
            arabic_qwen_ocr_python,
            "optional 0.9B Arabic OCR VLM; run scripts/setup_arabic_qwen_ocr.ps1",
        ),
        _tool_check(
            "OCR",
            "Arabic-GLM OCR VLM",
            arabic_glm_ocr_python,
            "optional recent Arabic OCR VLM; run scripts/setup_arabic_glm_ocr.ps1",
        ),
        _tool_check(
            "OCR",
            "Baseer Arabic OCR VLM",
            baseer_ocr_python,
            "optional Arabic document OCR VLM; run scripts/setup_baseer_ocr.ps1",
        ),
        _tool_check(
            "OCR",
            "PaddleOCR-VL heavy worker",
            paddleocr_vl_python,
            "optional heavy OCR; run scripts/setup_paddleocr_vl.ps1",
        ),
        _tool_check(
            "OCR",
            "Surya OCR heavy worker",
            surya_python,
            "optional high-accuracy heavy OCR; run scripts/setup_surya.ps1",
        ),
        Check("OCR", "Tesseract Arabic", "PASS" if tesseract_ready else "WARN", tesseract_detail),
        Check(
            "OCR",
            "Preferred OCR mode",
            "PASS" if preferred_ocr else "WARN",
            preferred_ocr or "only embedded text is available; scanned PDFs need Arabic OCR",
        ),
    ]


def _tts_checks() -> list[Check]:
    """Discover the text-to-speech backends and report each one."""
    # Use the standard library directly rather than whatever `main` happens
    # to have imported; function-scope imports keep this block self-contained.
    import importlib.util
    import shutil

    silma_python = main.find_silma_python()
    habibi_python = main.find_habibi_python()
    supertonic_python = main.find_supertonic_python()
    espeak_path = main.find_espeak_ng()
    piper_path = shutil.which("piper")
    piper_model = Path(main.PIPER_MODEL).exists() if main.PIPER_MODEL else False
    pyttsx3_ready = importlib.util.find_spec("pyttsx3") is not None

    if piper_path:
        piper_detail = f"{mask_path(piper_path)}; model={mask_path(main.PIPER_MODEL)}"
    else:
        piper_detail = "optional; no Arabic Piper model configured"

    return [
        _tool_check(
            "TTS",
            "SILMA Arabic voice",
            silma_python,
            "best free Arabic voice not installed; run scripts/setup_silma.ps1",
        ),
        _tool_check(
            "TTS",
            "Habibi Arabic MSA voice",
            habibi_python,
            "optional newer MSA voice; run scripts/setup_habibi.ps1",
        ),
        _tool_check(
            "TTS",
            "Supertonic Arabic CPU voice",
            supertonic_python,
            "optional fast CPU voice; run scripts/setup_supertonic.ps1",
        ),
        _tool_check("TTS", "eSpeak NG fallback", espeak_path, "not installed; useful fast fallback"),
        # Piper needs both the executable and an existing model file to pass.
        Check("TTS", "Piper voice", "PASS" if piper_path and piper_model else "WARN", piper_detail),
        Check(
            "TTS",
            "pyttsx3 fallback",
            "PASS" if pyttsx3_ready else "WARN",
            "available" if pyttsx3_ready else "not importable",
        ),
    ]


def _audio_checks() -> list[Check]:
    """Report the effective output format and whether ffmpeg is present for it."""
    ffmpeg_path = main.find_ffmpeg()
    # Anything other than wav/mp3 is reported as wav.
    audio_format = main.AUDIO_FORMAT if main.AUDIO_FORMAT in {"wav", "mp3"} else "wav"
    # wav passes without ffmpeg; any other format passes only when ffmpeg was found.
    format_ok = audio_format == "wav" or ffmpeg_path
    return [
        Check(
            "Audio",
            "Output format",
            "PASS" if format_ok else "WARN",
            f"{audio_format}; ffmpeg={mask_path(ffmpeg_path)}",
        )
    ]


def _deployment_checks() -> list[Check]:
    """Report the worker URL (relevant in Vercel mode) and the CORS origins."""
    if not main.IS_VERCEL:
        worker_status: Status = "PASS"
        worker_detail = "local mode"
    elif main.WORKER_BASE_URL:
        worker_status = "PASS"
        worker_detail = "worker configured"
    else:
        worker_status = "WARN"
        worker_detail = "Vercel mode needs WORKER_BASE_URL for 100 MB+ PDFs"

    # Missing CORS origins only matter once a worker URL is configured.
    cors_ok = main.CORS_ORIGINS or not main.WORKER_BASE_URL
    return [
        Check("Deployment", "Worker URL", worker_status, worker_detail),
        Check(
            "Deployment",
            "CORS origins",
            "PASS" if cors_ok else "WARN",
            ", ".join(main.CORS_ORIGINS) if main.CORS_ORIGINS else "not configured",
        ),
    ]


def collect_checks() -> list[Check]:
    """Run every preflight probe and return the results in report order.

    Categories are evaluated and listed in this order: Runtime, Security,
    Storage, OCR, TTS, Audio, Deployment. The storage probes create missing
    directories and may create the SQLite file as a side effect.

    Returns:
        One ``Check`` per probed item.
    """
    checks: list[Check] = []
    for section in (
        _runtime_checks,
        _security_checks,
        _storage_checks,
        _ocr_checks,
        _tts_checks,
        _audio_checks,
        _deployment_checks,
    ):
        checks.extend(section())
    return checks
def summarize(checks: list[Check]) -> dict[str, object]:
    """Condense a list of checks into a JSON-serialisable report.

    Returns:
        A dict with ``ready`` (``True`` when no check failed), ``counts``
        (number of checks per status, keyed PASS/WARN/FAIL) and ``checks``
        (each check converted to a plain dict).

    Raises:
        KeyError: if a check carries a status other than PASS, WARN or FAIL.
    """
    tally = dict.fromkeys(("PASS", "WARN", "FAIL"), 0)
    for item in checks:
        tally[item.status] += 1
    return {
        "ready": tally["FAIL"] == 0,
        "counts": tally,
        "checks": list(map(asdict, checks)),
    }
def print_table(checks: list[Check]) -> None:
    """Print the human-readable report: a summary header, then one line per check.

    The category and name columns are padded to the widest value present so
    the detail column lines up. The previous fixed widths (10 and 20) are
    kept as minimums, so output is unchanged whenever every value fits.
    """
    summary = summarize(checks)
    counts = summary["counts"]
    print("Arabic PDF Reader preflight")
    print(f"Ready: {'yes' if summary['ready'] else 'no'} PASS={counts['PASS']} WARN={counts['WARN']} FAIL={counts['FAIL']}")
    print()
    # Seeding with the minimum also keeps max() valid for an empty list.
    category_width = max([10, *(len(check.category) for check in checks)])
    name_width = max([20, *(len(check.name) for check in checks)])
    for check in checks:
        print(f"[{check.status:<4}] {check.category:<{category_width}} {check.name:<{name_width}} {check.detail}")
def exit_code(checks: list[Check], strict: bool = False) -> int:
    """Translate check results into a process exit status.

    Returns:
        ``1`` if any check failed; otherwise ``2`` if *strict* is set and
        any check warned; otherwise ``0``.
    """
    statuses = {check.status for check in checks}
    if "FAIL" in statuses:
        return 1
    return 2 if strict and "WARN" in statuses else 0
def main_cli() -> int:
    """Parse the command line, run all checks, print the report, return the exit status.

    ``--json`` switches the output from the text table to JSON; ``--strict``
    makes warnings produce a non-zero status (see ``exit_code``).
    """
    parser = argparse.ArgumentParser(description="Check local OCR, TTS, storage, and deployment readiness.")
    for flag, help_text in (
        ("--json", "Print machine-readable JSON."),
        ("--strict", "Return non-zero when warnings are present."),
    ):
        parser.add_argument(flag, action="store_true", help=help_text)
    options = parser.parse_args()

    results = collect_checks()
    if not options.json:
        print_table(results)
    else:
        # ensure_ascii=False keeps any non-ASCII text readable in the JSON output.
        print(json.dumps(summarize(results), ensure_ascii=False, indent=2))
    return exit_code(results, strict=options.strict)
if __name__ == "__main__":
    # Hand the preflight result to the shell as the process exit status.
    sys.exit(main_cli())