"""Dry-run PDF text extraction and speech preparation without calling TTS.

Runs the extraction helpers exposed by ``app.main`` on a single PDF,
optionally under temporary OCR settings, and prints a JSON report describing
the extracted text, its quality assessment and the simulated TTS chunks.
"""

from __future__ import annotations

import argparse
from contextlib import contextmanager
import json
import sys
from pathlib import Path
from typing import Iterator

ROOT_DIR = Path(__file__).resolve().parent.parent
if str(ROOT_DIR) not in sys.path:
    # Put the parent of this file's directory on sys.path so the ``app`` and
    # ``scripts`` imports below resolve when the file is run directly.
    sys.path.insert(0, str(ROOT_DIR))

from app import main
from scripts.benchmark_ocr import recommendation_for_extraction

# Keys that ``load_ocr_env_file`` accepts from an env file; anything else in
# the file is ignored.
OCR_ENV_KEYS = {
    "OCR_ENGINE",
    "OCR_RENDER_ZOOM",
    "EASYOCR_RENDER_ZOOM",
    "QARI_OCR_RENDER_ZOOM",
    "TAWKEED_OCR_RENDER_ZOOM",
    "KATIB_OCR_RENDER_ZOOM",
    "ARABIC_QWEN_OCR_RENDER_ZOOM",
    "ARABIC_GLM_OCR_RENDER_ZOOM",
    "BASEER_OCR_RENDER_ZOOM",
    "PADDLEOCR_RENDER_ZOOM",
    "PADDLEOCR_VL_RENDER_ZOOM",
    "SURYA_RENDER_ZOOM",
    "TESSERACT_PSM",
}


def load_ocr_env_file(path: Path | None) -> dict[str, str]:
    """Read the OCR settings from a ``KEY=VALUE`` env file.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. Only keys listed in ``OCR_ENV_KEYS`` are kept; each value has
    surrounding whitespace and then leading/trailing ``"`` and ``'``
    characters stripped.

    Args:
        path: File to read (UTF-8), or ``None`` for "no env file".

    Returns:
        Mapping of accepted keys to their cleaned values; empty when *path*
        is ``None``.

    Raises:
        FileNotFoundError: If *path* is given but does not exist.
    """
    if path is None:
        return {}
    if not path.exists():
        raise FileNotFoundError(f"OCR env file not found: {path}")

    values: dict[str, str] = {}
    for raw_line in path.read_text(encoding="utf-8").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#") or "=" not in line:
            continue
        key, value = line.split("=", 1)
        key = key.strip()
        if key in OCR_ENV_KEYS:
            values[key] = value.strip().strip('"').strip("'")
    return values


@contextmanager
def temporary_ocr_settings(
    ocr_engine: str | None = None,
    ocr_render_zoom: str | None = None,
    easyocr_render_zoom: str | None = None,
    qari_ocr_render_zoom: str | None = None,
    tawkeed_ocr_render_zoom: str | None = None,
    katib_ocr_render_zoom: str | None = None,
    arabic_qwen_ocr_render_zoom: str | None = None,
    arabic_glm_ocr_render_zoom: str | None = None,
    baseer_ocr_render_zoom: str | None = None,
    paddleocr_render_zoom: str | None = None,
    paddleocr_vl_render_zoom: str | None = None,
    surya_render_zoom: str | None = None,
    tesseract_psm: str | None = None,
    from_extraction: str | None = None,
    env_file: Path | None = None,
) -> Iterator[None]:
    """Apply OCR settings for the duration of a ``with`` block.

    Each setting is resolved from three sources, first truthy value wins:
    the explicit keyword argument, then the ``env`` mapping of the
    recommendation returned by ``recommendation_for_extraction`` for
    *from_extraction*, then the values loaded from *env_file*. A falsy
    explicit value (``None`` or an empty string) falls through to the next
    source.

    The resolved engine is normalized with ``main.normalize_ocr_engine`` and
    assigned to ``main.OCR_ENGINE``; every other resolved setting is written
    to ``main.os.environ``. On exit (normal or via exception) the previous
    ``main.OCR_ENGINE`` and the previous values of all managed environment
    variables are restored, removing variables that were unset before.

    Raises:
        FileNotFoundError: If *env_file* is given but does not exist.
    """
    file_env = load_ocr_env_file(env_file)

    extraction_env: dict[str, str] = {}
    if from_extraction:
        recommendation = recommendation_for_extraction(from_extraction)
        # ``or {}`` also covers a recommendation whose "env" entry is None.
        extraction_env = (recommendation.get("env") if recommendation else None) or {}

    # Single source of truth: environment key -> explicitly requested value.
    # Resolution, snapshot and restore below are all driven by this mapping.
    explicit = {
        "OCR_ENGINE": ocr_engine,
        "OCR_RENDER_ZOOM": ocr_render_zoom,
        "EASYOCR_RENDER_ZOOM": easyocr_render_zoom,
        "QARI_OCR_RENDER_ZOOM": qari_ocr_render_zoom,
        "TAWKEED_OCR_RENDER_ZOOM": tawkeed_ocr_render_zoom,
        "KATIB_OCR_RENDER_ZOOM": katib_ocr_render_zoom,
        "ARABIC_QWEN_OCR_RENDER_ZOOM": arabic_qwen_ocr_render_zoom,
        "ARABIC_GLM_OCR_RENDER_ZOOM": arabic_glm_ocr_render_zoom,
        "BASEER_OCR_RENDER_ZOOM": baseer_ocr_render_zoom,
        "PADDLEOCR_RENDER_ZOOM": paddleocr_render_zoom,
        "PADDLEOCR_VL_RENDER_ZOOM": paddleocr_vl_render_zoom,
        "SURYA_RENDER_ZOOM": surya_render_zoom,
        "TESSERACT_PSM": tesseract_psm,
    }
    resolved = {
        key: requested or extraction_env.get(key) or file_env.get(key)
        for key, requested in explicit.items()
    }
    # The engine lives in a module attribute of ``main``, not in os.environ,
    # so it is handled separately from the environment-backed settings.
    resolved_engine = resolved.pop("OCR_ENGINE")

    previous_engine = main.OCR_ENGINE
    previous_env = {key: main.os.getenv(key) for key in resolved}

    try:
        if resolved_engine is not None:
            main.OCR_ENGINE = main.normalize_ocr_engine(resolved_engine)
        for key, value in resolved.items():
            if value is not None:
                # os.environ only accepts strings; the value types inside the
                # recommendation's env mapping are not visible from here.
                main.os.environ[key] = str(value)
        yield
    finally:
        main.OCR_ENGINE = previous_engine
        for key, value in previous_env.items():
            if value is None:
                main.os.environ.pop(key, None)
            else:
                main.os.environ[key] = value


def dry_run_pdf(
    pdf_path: Path,
    chunk_size: int,
    ocr_engine: str | None = None,
    ocr_render_zoom: str | None = None,
    easyocr_render_zoom: str | None = None,
    qari_ocr_render_zoom: str | None = None,
    tawkeed_ocr_render_zoom: str | None = None,
    katib_ocr_render_zoom: str | None = None,
    arabic_qwen_ocr_render_zoom: str | None = None,
    arabic_glm_ocr_render_zoom: str | None = None,
    baseer_ocr_render_zoom: str | None = None,
    paddleocr_render_zoom: str | None = None,
    paddleocr_vl_render_zoom: str | None = None,
    surya_render_zoom: str | None = None,
    tesseract_psm: str | None = None,
    from_extraction: str | None = None,
    env_file: Path | None = None,
    include_speech_text: bool = False,
    speech_sample_chars: int | None = 1200,
) -> dict[str, object]:
    """Extract, clean, chunk and assess one PDF without calling TTS.

    The OCR-related arguments, *from_extraction* and *env_file* are forwarded
    unchanged to ``temporary_ocr_settings`` and are only in effect while the
    PDF text is being extracted.

    Args:
        pdf_path: Existing file whose suffix is ``.pdf`` (case-insensitive).
        chunk_size: ``chunk_size`` passed to ``main.chunk_text``.
        include_speech_text: When true, add the full cleaned speech text to
            the result under ``"speechText"``.
        speech_sample_chars: Maximum length of ``"speechSampleText"``.
            ``None``, zero or a negative number means no truncation.

    Returns:
        A report dictionary with text sizes, quality metrics, chunk
        statistics, short previews and a ``"readyForTts"`` flag.
        ``"ttsWasCalled"`` is always ``False``.

    Raises:
        FileNotFoundError: If *pdf_path* (or *env_file*) does not exist.
        ValueError: If *pdf_path* does not end in ``.pdf``.
    """
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")
    if pdf_path.suffix.lower() != ".pdf":
        raise ValueError("Dry run input must be a PDF file.")

    with temporary_ocr_settings(
        ocr_engine=ocr_engine,
        ocr_render_zoom=ocr_render_zoom,
        easyocr_render_zoom=easyocr_render_zoom,
        qari_ocr_render_zoom=qari_ocr_render_zoom,
        tawkeed_ocr_render_zoom=tawkeed_ocr_render_zoom,
        katib_ocr_render_zoom=katib_ocr_render_zoom,
        arabic_qwen_ocr_render_zoom=arabic_qwen_ocr_render_zoom,
        arabic_glm_ocr_render_zoom=arabic_glm_ocr_render_zoom,
        baseer_ocr_render_zoom=baseer_ocr_render_zoom,
        paddleocr_render_zoom=paddleocr_render_zoom,
        paddleocr_vl_render_zoom=paddleocr_vl_render_zoom,
        surya_render_zoom=surya_render_zoom,
        tesseract_psm=tesseract_psm,
        from_extraction=from_extraction,
        env_file=env_file,
    ):
        # NOTE(review): an explicit ``ocr_engine`` is passed to the job as
        # given, while ``main.OCR_ENGINE`` holds the normalized value at this
        # point - confirm the raw value is what ``main.Job`` expects.
        job = main.Job(
            id="dry-run",
            filename=pdf_path.name,
            ocr_engine=ocr_engine or main.OCR_ENGINE,
        )
        text = main.extract_pdf_text(pdf_path, job)

    speech_text = main.prepare_text_for_speech(text)
    chunks = main.chunk_text(speech_text, chunk_size=chunk_size)
    quality = main.assess_text_quality(text, speech_text)
    metrics = quality["metrics"]

    # Counts ASCII question marks and U+FFFD replacement characters.
    placeholder_count = speech_text.count("?") + speech_text.count("\ufffd")

    speech_sample = speech_text
    if speech_sample_chars is not None and speech_sample_chars > 0:
        speech_sample = speech_text[:speech_sample_chars].rstrip()

    result: dict[str, object] = {
        "pdf": str(pdf_path),
        "pages": job.pages,
        "characters": len(text),
        "speechCharacters": len(speech_text),
        "arabicWords": quality["arabicWords"],
        "placeholderCharacters": placeholder_count,
        "placeholderRatio": quality["placeholderRatio"],
        "singleArabicWords": int(metrics["singleArabicWords"]),
        "singleArabicWordRatio": metrics["singleArabicWordRatio"],
        "fragmentLines": int(metrics["fragmentLines"]),
        "fragmentLineRatio": metrics["fragmentLineRatio"],
        "quality": quality["quality"],
        "qualityScore": quality["score"],
        "qualityReasons": quality["reasons"],
        "extraction": job.extraction,
        "ocrEngine": job.ocr_engine,
        "chunks": len(chunks),
        "chunkSize": chunk_size,
        "largestChunkCharacters": max((len(chunk) for chunk in chunks), default=0),
        "textPreview": text[:160],
        "speechPreview": speech_text[:160],
        "speechSampleText": speech_sample,
        # Ready only when at least one chunk exists and the assessment agrees.
        "readyForTts": bool(chunks and quality["readyForTts"]),
        "ttsWasCalled": False,
    }
    if include_speech_text:
        result["speechText"] = speech_text
    return result


def main_cli() -> None:
    """Parse command-line arguments, run the dry run and print the report.

    The report is printed as indented JSON with non-ASCII characters kept
    as-is. Exits with status 1 when the report's ``readyForTts`` is false.
    """
    # Emit UTF-8 regardless of the console's default encoding; unencodable
    # characters are replaced instead of raising.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    parser = argparse.ArgumentParser(description="Dry-run Arabic PDF extraction without calling TTS.")
    parser.add_argument("pdf", type=Path, help="Path to the PDF to test")
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=main.CLOUD_TTS_MAX_CHARS,
        help="Maximum characters per simulated TTS chunk",
    )
    parser.add_argument("--ocr-engine", choices=sorted(main.OCR_ENGINE_CHOICES), help="OCR engine to test.")
    parser.add_argument("--ocr-render-zoom", help="Render zoom for Tesseract or shared OCR fallback.")
    parser.add_argument("--easyocr-render-zoom", help="Render zoom for EasyOCR.")
    parser.add_argument("--qari-ocr-render-zoom", help="Render zoom for QARI-OCR.")
    parser.add_argument("--tawkeed-ocr-render-zoom", help="Render zoom for Tawkeed Arabic OCR.")
    parser.add_argument("--katib-ocr-render-zoom", help="Render zoom for KATIB Arabic OCR.")
    parser.add_argument("--arabic-qwen-ocr-render-zoom", help="Render zoom for Arabic-Qwen3.5 OCR.")
    parser.add_argument("--arabic-glm-ocr-render-zoom", help="Render zoom for Arabic-GLM OCR.")
    parser.add_argument("--baseer-ocr-render-zoom", help="Render zoom for Baseer Arabic OCR.")
    parser.add_argument("--paddleocr-render-zoom", help="Render zoom for PaddleOCR.")
    parser.add_argument("--paddleocr-vl-render-zoom", help="Render zoom for PaddleOCR-VL.")
    parser.add_argument("--surya-render-zoom", help="Render zoom for Surya OCR.")
    parser.add_argument("--tesseract-psm", help="Tesseract page segmentation mode, for example 4 or 6.")
    parser.add_argument(
        "--from-extraction",
        help="Apply settings from a benchmark extraction label, for example best:tesseract@2x-psm4.",
    )
    parser.add_argument("--env-file", type=Path, help="Load OCR settings from a generated OCR .env snippet.")
    parser.add_argument(
        "--include-speech-text",
        action="store_true",
        help="Include the full cleaned speech text in JSON output.",
    )
    parser.add_argument(
        "--speech-sample-chars",
        type=int,
        default=1200,
        help="Maximum cleaned speech characters to include as speechSampleText. Use 0 for no limit.",
    )
    args = parser.parse_args()

    result = dry_run_pdf(
        args.pdf,
        args.chunk_size,
        ocr_engine=args.ocr_engine,
        ocr_render_zoom=args.ocr_render_zoom,
        easyocr_render_zoom=args.easyocr_render_zoom,
        qari_ocr_render_zoom=args.qari_ocr_render_zoom,
        tawkeed_ocr_render_zoom=args.tawkeed_ocr_render_zoom,
        katib_ocr_render_zoom=args.katib_ocr_render_zoom,
        arabic_qwen_ocr_render_zoom=args.arabic_qwen_ocr_render_zoom,
        arabic_glm_ocr_render_zoom=args.arabic_glm_ocr_render_zoom,
        baseer_ocr_render_zoom=args.baseer_ocr_render_zoom,
        paddleocr_render_zoom=args.paddleocr_render_zoom,
        paddleocr_vl_render_zoom=args.paddleocr_vl_render_zoom,
        surya_render_zoom=args.surya_render_zoom,
        tesseract_psm=args.tesseract_psm,
        from_extraction=args.from_extraction,
        env_file=args.env_file,
        include_speech_text=args.include_speech_text,
        speech_sample_chars=args.speech_sample_chars,
    )
    print(json.dumps(result, ensure_ascii=False, indent=2))
    if not result["readyForTts"]:
        raise SystemExit(1)


if __name__ == "__main__":
    main_cli()