| from __future__ import annotations |
|
|
| import argparse |
| from contextlib import contextmanager |
| import json |
| import sys |
| from pathlib import Path |
| from typing import Iterator |
|
|
# Directory two levels above this file. It is put at the front of the import
# search path (once) so that the project imports that follow this block can be
# resolved when the file is executed directly.
ROOT_DIR = Path(__file__).resolve().parent.parent
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))
|
|
| from app import main |
| from scripts.benchmark_ocr import recommendation_for_extraction |
|
|
|
|
# Names of the OCR settings that may be read from an env file. Any other key
# found in the file is ignored by ``load_ocr_env_file``.
OCR_ENV_KEYS = {
    "OCR_ENGINE",
    "OCR_RENDER_ZOOM",
    "EASYOCR_RENDER_ZOOM",
    "QARI_OCR_RENDER_ZOOM",
    "TAWKEED_OCR_RENDER_ZOOM",
    "KATIB_OCR_RENDER_ZOOM",
    "ARABIC_QWEN_OCR_RENDER_ZOOM",
    "ARABIC_GLM_OCR_RENDER_ZOOM",
    "BASEER_OCR_RENDER_ZOOM",
    "PADDLEOCR_RENDER_ZOOM",
    "PADDLEOCR_VL_RENDER_ZOOM",
    "SURYA_RENDER_ZOOM",
    "TESSERACT_PSM",
}


def load_ocr_env_file(path: Path | None) -> dict[str, str]:
    """Read whitelisted OCR settings from a ``KEY=value`` env file.

    Blank lines, lines starting with ``#`` and lines without ``=`` are
    skipped. A leading ``export `` is accepted and ignored. Keys that are not
    in ``OCR_ENV_KEYS`` are dropped. Surrounding whitespace and any leading or
    trailing quote characters are removed from values.

    Args:
        path: Location of the env file, or ``None`` to load nothing.

    Returns:
        Mapping of recognised setting names to their string values; empty
        when ``path`` is ``None``.

    Raises:
        FileNotFoundError: If ``path`` is given but does not exist.
    """
    if path is None:
        return {}
    if not path.exists():
        raise FileNotFoundError(f"OCR env file not found: {path}")

    values: dict[str, str] = {}
    # "utf-8-sig" reads plain UTF-8 as well, but also discards a leading
    # byte-order mark. With plain "utf-8" the mark stays attached to the first
    # key, which then fails the whitelist check and is silently lost.
    for raw_line in path.read_text(encoding="utf-8-sig").splitlines():
        line = raw_line.strip()
        if not line or line.startswith("#"):
            continue
        # Accept shell-style "export KEY=value" lines as well.
        if line.startswith("export "):
            line = line[len("export "):].lstrip()
        key, separator, value = line.partition("=")
        if not separator:
            continue
        key = key.strip()
        if key in OCR_ENV_KEYS:
            values[key] = value.strip().strip('"').strip("'")
    return values
|
|
|
|
@contextmanager
def temporary_ocr_settings(
    ocr_engine: str | None = None,
    ocr_render_zoom: str | None = None,
    easyocr_render_zoom: str | None = None,
    qari_ocr_render_zoom: str | None = None,
    tawkeed_ocr_render_zoom: str | None = None,
    katib_ocr_render_zoom: str | None = None,
    arabic_qwen_ocr_render_zoom: str | None = None,
    arabic_glm_ocr_render_zoom: str | None = None,
    baseer_ocr_render_zoom: str | None = None,
    paddleocr_render_zoom: str | None = None,
    paddleocr_vl_render_zoom: str | None = None,
    surya_render_zoom: str | None = None,
    tesseract_psm: str | None = None,
    from_extraction: str | None = None,
    env_file: Path | None = None,
) -> Iterator[None]:
    """Temporarily apply OCR settings to ``main`` and the process environment.

    Each setting is taken from the first non-empty source, in this order: the
    explicit argument, the ``env`` mapping of the benchmark recommendation
    looked up for ``from_extraction``, then the values read from ``env_file``.
    A resolved engine is normalized and assigned to ``main.OCR_ENGINE``; every
    other resolved setting is written to the environment variable with the
    upper-cased name of its argument. A setting that is empty or missing in
    every source is left untouched.

    On exit, including when the managed body raises, ``main.OCR_ENGINE`` and
    all twelve managed environment variables are put back to the values they
    had on entry; variables that were unset on entry are removed again.

    Raises:
        FileNotFoundError: If ``env_file`` is given but does not exist.
    """
    file_env = load_ocr_env_file(env_file)
    extraction_env: dict[str, str] = {}
    if from_extraction:
        recommendation = recommendation_for_extraction(from_extraction)
        # ``or {}`` also covers a recommendation whose "env" entry is None.
        extraction_env = (recommendation.get("env") or {}) if recommendation else {}

    def resolve(key: str, explicit: str | None) -> str | None:
        # The trailing ``or None`` makes an empty value from the last source
        # count as "not provided", exactly like an empty value from the
        # earlier sources. Without it an empty env-file entry would be applied
        # as an empty engine name or an empty environment variable.
        return explicit or extraction_env.get(key) or file_env.get(key) or None

    engine = resolve("OCR_ENGINE", ocr_engine)
    explicit_env_settings = {
        "OCR_RENDER_ZOOM": ocr_render_zoom,
        "EASYOCR_RENDER_ZOOM": easyocr_render_zoom,
        "QARI_OCR_RENDER_ZOOM": qari_ocr_render_zoom,
        "TAWKEED_OCR_RENDER_ZOOM": tawkeed_ocr_render_zoom,
        "KATIB_OCR_RENDER_ZOOM": katib_ocr_render_zoom,
        "ARABIC_QWEN_OCR_RENDER_ZOOM": arabic_qwen_ocr_render_zoom,
        "ARABIC_GLM_OCR_RENDER_ZOOM": arabic_glm_ocr_render_zoom,
        "BASEER_OCR_RENDER_ZOOM": baseer_ocr_render_zoom,
        "PADDLEOCR_RENDER_ZOOM": paddleocr_render_zoom,
        "PADDLEOCR_VL_RENDER_ZOOM": paddleocr_vl_render_zoom,
        "SURYA_RENDER_ZOOM": surya_render_zoom,
        "TESSERACT_PSM": tesseract_psm,
    }
    overrides = {key: resolve(key, explicit) for key, explicit in explicit_env_settings.items()}

    # Snapshot everything that may be modified so it can be restored on exit.
    previous_engine = main.OCR_ENGINE
    previous_env = {key: main.os.getenv(key) for key in overrides}
    try:
        if engine is not None:
            main.OCR_ENGINE = main.normalize_ocr_engine(engine)
        for key, value in overrides.items():
            if value is not None:
                main.os.environ[key] = value
        yield
    finally:
        main.OCR_ENGINE = previous_engine
        for key, value in previous_env.items():
            if value is None:
                main.os.environ.pop(key, None)
            else:
                main.os.environ[key] = value
|
|
|
|
def dry_run_pdf(
    pdf_path: Path,
    chunk_size: int,
    ocr_engine: str | None = None,
    ocr_render_zoom: str | None = None,
    easyocr_render_zoom: str | None = None,
    qari_ocr_render_zoom: str | None = None,
    tawkeed_ocr_render_zoom: str | None = None,
    katib_ocr_render_zoom: str | None = None,
    arabic_qwen_ocr_render_zoom: str | None = None,
    arabic_glm_ocr_render_zoom: str | None = None,
    baseer_ocr_render_zoom: str | None = None,
    paddleocr_render_zoom: str | None = None,
    paddleocr_vl_render_zoom: str | None = None,
    surya_render_zoom: str | None = None,
    tesseract_psm: str | None = None,
    from_extraction: str | None = None,
    env_file: Path | None = None,
    include_speech_text: bool = False,
    speech_sample_chars: int | None = 1200,
) -> dict[str, object]:
    """Extract a PDF's text and report how it would be chunked for speech.

    The OCR-related arguments, ``from_extraction`` and ``env_file`` are passed
    to ``temporary_ocr_settings`` and are only in effect while this function
    runs. No speech synthesis is performed; the report always carries
    ``"ttsWasCalled": False``.

    Args:
        pdf_path: PDF file to process.
        chunk_size: Chunk size handed to ``main.chunk_text``.
        include_speech_text: When true, add the full cleaned text to the
            report under ``"speechText"``.
        speech_sample_chars: Maximum length of ``"speechSampleText"``.
            ``None``, zero or a negative number means the whole cleaned text.

    Returns:
        A dictionary with page, character, quality and chunk statistics plus
        short previews of the extracted and cleaned text.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        ValueError: If ``pdf_path`` does not have a ``.pdf`` suffix.
    """
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")
    if pdf_path.suffix.lower() != ".pdf":
        raise ValueError("Dry run input must be a PDF file.")

    with temporary_ocr_settings(
        ocr_engine=ocr_engine,
        ocr_render_zoom=ocr_render_zoom,
        easyocr_render_zoom=easyocr_render_zoom,
        qari_ocr_render_zoom=qari_ocr_render_zoom,
        tawkeed_ocr_render_zoom=tawkeed_ocr_render_zoom,
        katib_ocr_render_zoom=katib_ocr_render_zoom,
        arabic_qwen_ocr_render_zoom=arabic_qwen_ocr_render_zoom,
        arabic_glm_ocr_render_zoom=arabic_glm_ocr_render_zoom,
        baseer_ocr_render_zoom=baseer_ocr_render_zoom,
        paddleocr_render_zoom=paddleocr_render_zoom,
        paddleocr_vl_render_zoom=paddleocr_vl_render_zoom,
        surya_render_zoom=surya_render_zoom,
        tesseract_psm=tesseract_psm,
        from_extraction=from_extraction,
        env_file=env_file,
    ):
        # While the context is active ``main.OCR_ENGINE`` already holds the
        # effective engine in normalized form, whichever source it came from.
        # Using it here keeps the job in step with that value instead of
        # recording the caller's raw, un-normalized spelling.
        job = main.Job(id="dry-run", filename=pdf_path.name, ocr_engine=main.OCR_ENGINE)
        text = main.extract_pdf_text(pdf_path, job)
        speech_text = main.prepare_text_for_speech(text)
        chunks = main.chunk_text(speech_text, chunk_size=chunk_size)
        quality = main.assess_text_quality(text, speech_text)

    # ASCII question marks and U+FFFD replacement characters are both counted
    # as placeholder characters.
    placeholder_count = speech_text.count("?") + speech_text.count("\ufffd")

    speech_sample = speech_text
    if speech_sample_chars is not None and speech_sample_chars > 0:
        speech_sample = speech_text[:speech_sample_chars].rstrip()

    metrics = quality["metrics"]
    result: dict[str, object] = {
        "pdf": str(pdf_path),
        "pages": job.pages,
        "characters": len(text),
        "speechCharacters": len(speech_text),
        "arabicWords": quality["arabicWords"],
        "placeholderCharacters": placeholder_count,
        "placeholderRatio": quality["placeholderRatio"],
        "singleArabicWords": int(metrics["singleArabicWords"]),
        "singleArabicWordRatio": metrics["singleArabicWordRatio"],
        "fragmentLines": int(metrics["fragmentLines"]),
        "fragmentLineRatio": metrics["fragmentLineRatio"],
        "quality": quality["quality"],
        "qualityScore": quality["score"],
        "qualityReasons": quality["reasons"],
        "extraction": job.extraction,
        "ocrEngine": job.ocr_engine,
        "chunks": len(chunks),
        "chunkSize": chunk_size,
        "largestChunkCharacters": max((len(chunk) for chunk in chunks), default=0),
        "textPreview": text[:160],
        "speechPreview": speech_text[:160],
        "speechSampleText": speech_sample,
        # Ready only when there is at least one chunk and the quality check
        # agrees.
        "readyForTts": bool(chunks and quality["readyForTts"]),
        "ttsWasCalled": False,
    }
    if include_speech_text:
        result["speechText"] = speech_text
    return result
|
|
|
|
def main_cli() -> None:
    """Run the dry run from the command line and print the report as JSON.

    Parses the command-line options, calls ``dry_run_pdf`` with them and
    prints the resulting dictionary. Exits with status 1 when the report's
    ``readyForTts`` entry is false.
    """
    # Switch the standard streams to UTF-8 (replacing characters that cannot
    # be encoded) on stream objects that support reconfiguration.
    for stream in (sys.stdout, sys.stderr):
        if hasattr(stream, "reconfigure"):
            stream.reconfigure(encoding="utf-8", errors="replace")

    parser = argparse.ArgumentParser(description="Dry-run Arabic PDF extraction without calling TTS.")
    parser.add_argument("pdf", type=Path, help="Path to the PDF to test")
    parser.add_argument(
        "--chunk-size",
        type=int,
        default=main.CLOUD_TTS_MAX_CHARS,
        help="Maximum characters per simulated TTS chunk",
    )
    parser.add_argument("--ocr-engine", choices=sorted(main.OCR_ENGINE_CHOICES), help="OCR engine to test.")

    # Options that take a plain string value and need nothing beyond a help
    # text, registered in the order they should appear in ``--help``.
    plain_options = (
        ("--ocr-render-zoom", "Render zoom for Tesseract or shared OCR fallback."),
        ("--easyocr-render-zoom", "Render zoom for EasyOCR."),
        ("--qari-ocr-render-zoom", "Render zoom for QARI-OCR."),
        ("--tawkeed-ocr-render-zoom", "Render zoom for Tawkeed Arabic OCR."),
        ("--katib-ocr-render-zoom", "Render zoom for KATIB Arabic OCR."),
        ("--arabic-qwen-ocr-render-zoom", "Render zoom for Arabic-Qwen3.5 OCR."),
        ("--arabic-glm-ocr-render-zoom", "Render zoom for Arabic-GLM OCR."),
        ("--baseer-ocr-render-zoom", "Render zoom for Baseer Arabic OCR."),
        ("--paddleocr-render-zoom", "Render zoom for PaddleOCR."),
        ("--paddleocr-vl-render-zoom", "Render zoom for PaddleOCR-VL."),
        ("--surya-render-zoom", "Render zoom for Surya OCR."),
        ("--tesseract-psm", "Tesseract page segmentation mode, for example 4 or 6."),
        (
            "--from-extraction",
            "Apply settings from a benchmark extraction label, for example best:tesseract@2x-psm4.",
        ),
    )
    for flag, help_text in plain_options:
        parser.add_argument(flag, help=help_text)

    parser.add_argument("--env-file", type=Path, help="Load OCR settings from a generated OCR .env snippet.")
    parser.add_argument(
        "--include-speech-text",
        action="store_true",
        help="Include the full cleaned speech text in JSON output.",
    )
    parser.add_argument(
        "--speech-sample-chars",
        type=int,
        default=1200,
        help="Maximum cleaned speech characters to include as speechSampleText. Use 0 for no limit.",
    )
    args = parser.parse_args()

    # Parsed attributes that are handed to ``dry_run_pdf`` as keyword
    # arguments of the same name.
    forwarded_names = (
        "ocr_engine",
        "ocr_render_zoom",
        "easyocr_render_zoom",
        "qari_ocr_render_zoom",
        "tawkeed_ocr_render_zoom",
        "katib_ocr_render_zoom",
        "arabic_qwen_ocr_render_zoom",
        "arabic_glm_ocr_render_zoom",
        "baseer_ocr_render_zoom",
        "paddleocr_render_zoom",
        "paddleocr_vl_render_zoom",
        "surya_render_zoom",
        "tesseract_psm",
        "from_extraction",
        "env_file",
        "include_speech_text",
        "speech_sample_chars",
    )
    keyword_options = {name: getattr(args, name) for name in forwarded_names}
    report = dry_run_pdf(args.pdf, args.chunk_size, **keyword_options)

    print(json.dumps(report, ensure_ascii=False, indent=2))
    if not report["readyForTts"]:
        raise SystemExit(1)
|
|
|
|
# Run the command-line interface only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    main_cli()
|
|