# Syncre's picture
# Deploy Arabic Audio Reader worker
# 2e1a095 verified
from __future__ import annotations
import argparse
from contextlib import contextmanager
import json
import sys
from pathlib import Path
from typing import Iterator
# Directory two levels above this file's resolved location.
ROOT_DIR = Path(__file__).resolve().parent.parent
# Put that directory at the front of sys.path (once) -- presumably so the
# `app` and `scripts` imports that follow resolve when this file is run
# directly as a script.
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))
from app import main
from scripts.benchmark_ocr import recommendation_for_extraction
# Names of the OCR-related settings this script understands.
# load_ocr_env_file() keeps only the keys listed here when parsing an env
# file; any other key found in the file is ignored.
OCR_ENV_KEYS = {
    "OCR_ENGINE",
    "OCR_RENDER_ZOOM",
    "EASYOCR_RENDER_ZOOM",
    "QARI_OCR_RENDER_ZOOM",
    "TAWKEED_OCR_RENDER_ZOOM",
    "KATIB_OCR_RENDER_ZOOM",
    "ARABIC_QWEN_OCR_RENDER_ZOOM",
    "ARABIC_GLM_OCR_RENDER_ZOOM",
    "BASEER_OCR_RENDER_ZOOM",
    "PADDLEOCR_RENDER_ZOOM",
    "PADDLEOCR_VL_RENDER_ZOOM",
    "SURYA_RENDER_ZOOM",
    "TESSERACT_PSM",
}
def load_ocr_env_file(path: Path | None) -> dict[str, str]:
if path is None:
return {}
if not path.exists():
raise FileNotFoundError(f"OCR env file not found: {path}")
values: dict[str, str] = {}
for raw_line in path.read_text(encoding="utf-8").splitlines():
line = raw_line.strip()
if not line or line.startswith("#") or "=" not in line:
continue
key, value = line.split("=", 1)
key = key.strip()
if key in OCR_ENV_KEYS:
values[key] = value.strip().strip('"').strip("'")
return values
@contextmanager
def temporary_ocr_settings(
    ocr_engine: str | None = None,
    ocr_render_zoom: str | None = None,
    easyocr_render_zoom: str | None = None,
    qari_ocr_render_zoom: str | None = None,
    tawkeed_ocr_render_zoom: str | None = None,
    katib_ocr_render_zoom: str | None = None,
    arabic_qwen_ocr_render_zoom: str | None = None,
    arabic_glm_ocr_render_zoom: str | None = None,
    baseer_ocr_render_zoom: str | None = None,
    paddleocr_render_zoom: str | None = None,
    paddleocr_vl_render_zoom: str | None = None,
    surya_render_zoom: str | None = None,
    tesseract_psm: str | None = None,
    from_extraction: str | None = None,
    env_file: Path | None = None,
) -> Iterator[None]:
    """Temporarily apply OCR settings to ``main`` and the process environment.

    Every setting is resolved with the same precedence: the explicit
    argument, then the ``env`` mapping of the recommendation returned by
    ``recommendation_for_extraction(from_extraction)``, then the values
    loaded from ``env_file``. A falsy value (``None`` or ``""``) falls
    through to the next source.

    While the context is active, ``main.OCR_ENGINE`` is set to the
    normalized engine (when one was resolved) and each resolved setting is
    written to ``main.os.environ``. On exit -- including when the body
    raises -- the previous engine and the previous value of every managed
    environment variable are restored (variables that were unset are
    removed again).

    Args:
        ocr_engine: Engine name; passed through ``main.normalize_ocr_engine``.
        ocr_render_zoom: Value for ``OCR_RENDER_ZOOM``.
        easyocr_render_zoom: Value for ``EASYOCR_RENDER_ZOOM``.
        qari_ocr_render_zoom: Value for ``QARI_OCR_RENDER_ZOOM``.
        tawkeed_ocr_render_zoom: Value for ``TAWKEED_OCR_RENDER_ZOOM``.
        katib_ocr_render_zoom: Value for ``KATIB_OCR_RENDER_ZOOM``.
        arabic_qwen_ocr_render_zoom: Value for ``ARABIC_QWEN_OCR_RENDER_ZOOM``.
        arabic_glm_ocr_render_zoom: Value for ``ARABIC_GLM_OCR_RENDER_ZOOM``.
        baseer_ocr_render_zoom: Value for ``BASEER_OCR_RENDER_ZOOM``.
        paddleocr_render_zoom: Value for ``PADDLEOCR_RENDER_ZOOM``.
        paddleocr_vl_render_zoom: Value for ``PADDLEOCR_VL_RENDER_ZOOM``.
        surya_render_zoom: Value for ``SURYA_RENDER_ZOOM``.
        tesseract_psm: Value for ``TESSERACT_PSM``.
        from_extraction: Benchmark extraction label whose recommended
            settings act as the second-priority source.
        env_file: Env file whose settings act as the lowest-priority source.

    Raises:
        FileNotFoundError: If ``env_file`` is given but does not exist.
    """
    file_env = load_ocr_env_file(env_file)
    extraction_env: dict[str, str] = {}
    if from_extraction:
        recommendation = recommendation_for_extraction(from_extraction)
        extraction_env = recommendation.get("env", {}) if recommendation else {}

    def resolve(key: str, explicit: str | None) -> str | None:
        # Precedence: explicit argument > extraction recommendation > env file.
        return explicit or extraction_env.get(key) or file_env.get(key)

    engine = resolve("OCR_ENGINE", ocr_engine)
    # Single table of the environment variables this context manages, so the
    # resolve / snapshot / apply / restore steps cannot drift apart.
    env_overrides = {
        key: resolve(key, explicit)
        for key, explicit in (
            ("OCR_RENDER_ZOOM", ocr_render_zoom),
            ("EASYOCR_RENDER_ZOOM", easyocr_render_zoom),
            ("QARI_OCR_RENDER_ZOOM", qari_ocr_render_zoom),
            ("TAWKEED_OCR_RENDER_ZOOM", tawkeed_ocr_render_zoom),
            ("KATIB_OCR_RENDER_ZOOM", katib_ocr_render_zoom),
            ("ARABIC_QWEN_OCR_RENDER_ZOOM", arabic_qwen_ocr_render_zoom),
            ("ARABIC_GLM_OCR_RENDER_ZOOM", arabic_glm_ocr_render_zoom),
            ("BASEER_OCR_RENDER_ZOOM", baseer_ocr_render_zoom),
            ("PADDLEOCR_RENDER_ZOOM", paddleocr_render_zoom),
            ("PADDLEOCR_VL_RENDER_ZOOM", paddleocr_vl_render_zoom),
            ("SURYA_RENDER_ZOOM", surya_render_zoom),
            ("TESSERACT_PSM", tesseract_psm),
        )
    }

    # Snapshot the current state before touching anything so it can be
    # restored verbatim afterwards.
    previous_engine = main.OCR_ENGINE
    previous_env = {key: main.os.getenv(key) for key in env_overrides}
    try:
        if engine is not None:
            main.OCR_ENGINE = main.normalize_ocr_engine(engine)
        for key, value in env_overrides.items():
            if value is not None:
                # os.environ only accepts strings; coerce so a numeric value
                # from a recommendation or a caller does not raise TypeError.
                main.os.environ[key] = str(value)
        yield
    finally:
        main.OCR_ENGINE = previous_engine
        for key, value in previous_env.items():
            if value is None:
                main.os.environ.pop(key, None)
            else:
                main.os.environ[key] = value
def dry_run_pdf(
pdf_path: Path,
chunk_size: int,
ocr_engine: str | None = None,
ocr_render_zoom: str | None = None,
easyocr_render_zoom: str | None = None,
qari_ocr_render_zoom: str | None = None,
tawkeed_ocr_render_zoom: str | None = None,
katib_ocr_render_zoom: str | None = None,
arabic_qwen_ocr_render_zoom: str | None = None,
arabic_glm_ocr_render_zoom: str | None = None,
baseer_ocr_render_zoom: str | None = None,
paddleocr_render_zoom: str | None = None,
paddleocr_vl_render_zoom: str | None = None,
surya_render_zoom: str | None = None,
tesseract_psm: str | None = None,
from_extraction: str | None = None,
env_file: Path | None = None,
include_speech_text: bool = False,
speech_sample_chars: int | None = 1200,
) -> dict[str, object]:
if not pdf_path.exists():
raise FileNotFoundError(f"PDF not found: {pdf_path}")
if pdf_path.suffix.lower() != ".pdf":
raise ValueError("Dry run input must be a PDF file.")
with temporary_ocr_settings(
ocr_engine=ocr_engine,
ocr_render_zoom=ocr_render_zoom,
easyocr_render_zoom=easyocr_render_zoom,
qari_ocr_render_zoom=qari_ocr_render_zoom,
tawkeed_ocr_render_zoom=tawkeed_ocr_render_zoom,
katib_ocr_render_zoom=katib_ocr_render_zoom,
arabic_qwen_ocr_render_zoom=arabic_qwen_ocr_render_zoom,
arabic_glm_ocr_render_zoom=arabic_glm_ocr_render_zoom,
baseer_ocr_render_zoom=baseer_ocr_render_zoom,
paddleocr_render_zoom=paddleocr_render_zoom,
paddleocr_vl_render_zoom=paddleocr_vl_render_zoom,
surya_render_zoom=surya_render_zoom,
tesseract_psm=tesseract_psm,
from_extraction=from_extraction,
env_file=env_file,
):
job = main.Job(id="dry-run", filename=pdf_path.name, ocr_engine=ocr_engine or main.OCR_ENGINE)
text = main.extract_pdf_text(pdf_path, job)
speech_text = main.prepare_text_for_speech(text)
chunks = main.chunk_text(speech_text, chunk_size=chunk_size)
quality = main.assess_text_quality(text, speech_text)
placeholder_count = speech_text.count("?") + speech_text.count("\ufffd")
speech_sample = speech_text
if speech_sample_chars is not None and speech_sample_chars > 0:
speech_sample = speech_text[:speech_sample_chars].rstrip()
result: dict[str, object] = {
"pdf": str(pdf_path),
"pages": job.pages,
"characters": len(text),
"speechCharacters": len(speech_text),
"arabicWords": quality["arabicWords"],
"placeholderCharacters": placeholder_count,
"placeholderRatio": quality["placeholderRatio"],
"singleArabicWords": int(quality["metrics"]["singleArabicWords"]),
"singleArabicWordRatio": quality["metrics"]["singleArabicWordRatio"],
"fragmentLines": int(quality["metrics"]["fragmentLines"]),
"fragmentLineRatio": quality["metrics"]["fragmentLineRatio"],
"quality": quality["quality"],
"qualityScore": quality["score"],
"qualityReasons": quality["reasons"],
"extraction": job.extraction,
"ocrEngine": job.ocr_engine,
"chunks": len(chunks),
"chunkSize": chunk_size,
"largestChunkCharacters": max((len(chunk) for chunk in chunks), default=0),
"textPreview": text[:160],
"speechPreview": speech_text[:160],
"speechSampleText": speech_sample,
"readyForTts": bool(chunks and quality["readyForTts"]),
"ttsWasCalled": False,
}
if include_speech_text:
result["speechText"] = speech_text
return result
def main_cli() -> None:
    """Command-line entry point: dry-run one PDF and print a JSON report.

    Parses the command line, forwards every option to ``dry_run_pdf`` and
    prints the resulting report as indented, non-ASCII-escaped JSON on
    stdout. Exits with status 1 when the report's ``readyForTts`` is falsy.
    """
    # Switch both standard streams to UTF-8 (replacing unencodable
    # characters) when the stream object supports reconfiguration.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    parser = argparse.ArgumentParser(description="Dry-run Arabic PDF extraction without calling TTS.")
    parser.add_argument("pdf", type=Path, help="Path to the PDF to test")
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=main.CLOUD_TTS_MAX_CHARS,
        help="Maximum characters per simulated TTS chunk",
    )
    parser.add_argument("--ocr-engine", choices=sorted(main.OCR_ENGINE_CHOICES), help="OCR engine to test.")

    # Plain string options that differ only in flag name and help text.
    # Registration order is kept so the generated --help output is unchanged.
    plain_options = (
        ("--ocr-render-zoom", "Render zoom for Tesseract or shared OCR fallback."),
        ("--easyocr-render-zoom", "Render zoom for EasyOCR."),
        ("--qari-ocr-render-zoom", "Render zoom for QARI-OCR."),
        ("--tawkeed-ocr-render-zoom", "Render zoom for Tawkeed Arabic OCR."),
        ("--katib-ocr-render-zoom", "Render zoom for KATIB Arabic OCR."),
        ("--arabic-qwen-ocr-render-zoom", "Render zoom for Arabic-Qwen3.5 OCR."),
        ("--arabic-glm-ocr-render-zoom", "Render zoom for Arabic-GLM OCR."),
        ("--baseer-ocr-render-zoom", "Render zoom for Baseer Arabic OCR."),
        ("--paddleocr-render-zoom", "Render zoom for PaddleOCR."),
        ("--paddleocr-vl-render-zoom", "Render zoom for PaddleOCR-VL."),
        ("--surya-render-zoom", "Render zoom for Surya OCR."),
        ("--tesseract-psm", "Tesseract page segmentation mode, for example 4 or 6."),
        (
            "--from-extraction",
            "Apply settings from a benchmark extraction label, for example best:tesseract@2x-psm4.",
        ),
    )
    for flag, description in plain_options:
        parser.add_argument(flag, help=description)

    parser.add_argument("--env-file", type=Path, help="Load OCR settings from a generated OCR .env snippet.")
    parser.add_argument(
        "--include-speech-text",
        action="store_true",
        help="Include the full cleaned speech text in JSON output.",
    )
    parser.add_argument(
        "--speech-sample-chars",
        type=int,
        default=1200,
        help="Maximum cleaned speech characters to include as speechSampleText. Use 0 for no limit.",
    )

    # Every parsed destination name matches a dry_run_pdf parameter name, so
    # the namespace can be forwarded as keyword arguments once the two
    # positional values have been taken out.
    options = dict(vars(parser.parse_args()))
    pdf_path = options.pop("pdf")
    chunk_size = options.pop("chunk_size")
    report = dry_run_pdf(pdf_path, chunk_size, **options)

    print(json.dumps(report, ensure_ascii=False, indent=2))
    if not report["readyForTts"]:
        sys.exit(1)
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main_cli()