| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import sys |
| import tempfile |
| from pathlib import Path |
| from typing import Any |
|
|
| import fitz |
|
|
# Directory two levels above this file (the parent of the folder containing it).
ROOT_DIR = Path(__file__).resolve().parent.parent
# Put that directory on the import path so the `scripts.*` and `app` imports
# that follow can resolve; this must therefore run before those imports.
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))
|
|
| from scripts.benchmark_ocr import benchmark_engine |
| from scripts.benchmark_voices import DEFAULT_VOICES, benchmark_voices as benchmark_voice_set |
| from scripts.dry_run_pdf import dry_run_pdf |
| from scripts.select_test_pages import build_test_pdf |
| from scripts.verify_pipeline import verify_pipeline |
| from app import main |
|
|
|
|
# Engines that open both the "balanced" and "maximum" presets, in order.
_SHARED_PRESET_ENGINES: list[str] = [
    "arabic-max",
    "arabic",
    "tawkeed-ocr",
    "katib-ocr",
    "arabic-qwen-ocr",
    "arabic-glm-ocr",
    "baseer-ocr",
]

# Preset name -> ordered list of OCR engine ids benchmarked on the sample.
# Each value is a fresh list so presets never share a mutable object.
ENGINE_PRESETS: dict[str, list[str]] = {
    "practical": ["arabic", "paddleocr", "tesseract"],
    "balanced": [*_SHARED_PRESET_ENGINES, "paddleocr", "tesseract"],
    "maximum": [
        *_SHARED_PRESET_ENGINES,
        "qari-ocr",
        "paddleocr-vl",
        "paddleocr",
        "tesseract",
    ],
}
|
|
|
|
| def choose_best_result(results: list[dict[str, Any]]) -> dict[str, Any] | None: |
| successful = [item for item in results if item.get("ok")] |
| if not successful: |
| return None |
| return max(successful, key=lambda item: (item.get("qualityScore", 0), item.get("arabicWords", 0), -item.get("seconds", 0))) |
|
|
|
|
def env_text(env: dict[str, str]) -> str:
    """Render *env* as a single line of space-separated ``KEY=value`` pairs.

    Pairs appear in the dictionary's own iteration order; an empty mapping
    yields an empty string.
    """
    assignments = [f"{name}={setting}" for name, setting in env.items()]
    return " ".join(assignments)
|
|
|
|
| def quote_arg(value: str | Path) -> str: |
| text = str(value) |
| if not text: |
| return '""' |
| if any(char.isspace() for char in text): |
| return f'"{text}"' |
| return text |
|
|
|
|
def build_commands(
    pdf_path: Path,
    extraction: str,
    voice_id: str,
    audio_max_chars: int,
    audio_out: Path | None = None,
    env_file: Path | None = None,
    external_tts_out_dir: Path | None = None,
    external_ocr_out_dir: Path | None = None,
) -> dict[str, str]:
    """Build the follow-up command lines suggested to the user.

    Args:
        pdf_path: PDF the commands operate on.
        extraction: Extraction id passed as ``--from-extraction`` when no
            *env_file* is given.
        voice_id: Voice id passed as ``--voice-id``.
        audio_max_chars: Value passed as ``--max-speech-chars`` for the smoke
            command.
        audio_out: Smoke-test audio path; defaults to
            ``<ROOT_DIR>/outputs/<pdf stem>-sample-smoke.wav``.
        env_file: When given, commands use ``--env-file`` instead of
            ``--from-extraction``.
        external_tts_out_dir: Output directory for the external TTS sample;
            defaults to ``<ROOT_DIR>/outputs/external-tts-sample``.
        external_ocr_out_dir: Output directory for the external OCR sample;
            defaults to ``<ROOT_DIR>/outputs/external-ocr-sample``.

    Returns:
        Mapping of command name to a ready-to-paste command string (the
        script paths use backslash separators).
    """
    pdf = quote_arg(pdf_path)
    # Quote the voice id like every other interpolated argument so an empty
    # or whitespace-containing id cannot corrupt the generated command line.
    voice = quote_arg(voice_id)
    if env_file:
        settings_arg = f"--env-file {quote_arg(env_file)}"
    else:
        settings_arg = f"--from-extraction {quote_arg(extraction)}"

    outputs_dir = ROOT_DIR / "outputs"
    smoke_out = quote_arg(audio_out or (outputs_dir / f"{pdf_path.stem}-sample-smoke.wav"))
    tts_sample_out = quote_arg(external_tts_out_dir or (outputs_dir / "external-tts-sample"))
    ocr_sample_out = quote_arg(external_ocr_out_dir or (outputs_dir / "external-ocr-sample"))

    return {
        "dryRunRecommended": f"python scripts\\dry_run_pdf.py {pdf} {settings_arg}",
        "audioSmokeRecommended": (
            f"python scripts\\verify_pipeline.py --pdf {pdf} {settings_arg} "
            f"--voice-id {voice} --max-speech-chars {audio_max_chars} --out {smoke_out}"
        ),
        "externalTtsSample": f"python scripts\\export_tts_sample.py {pdf} {settings_arg} --out-dir {tts_sample_out}",
        "externalOcrSample": f"python scripts\\export_ocr_sample_images.py {pdf} --out-dir {ocr_sample_out}",
        "fullPipelineRecommended": (
            f"python scripts\\verify_pipeline.py --pdf {pdf} {settings_arg} "
            f"--voice-id {voice} --out outputs\\full-book-smoke.wav"
        ),
    }
|
|
|
|
| def seconds_label(seconds: float | None) -> str: |
| if seconds is None: |
| return "unknown" |
| if seconds < 60: |
| return f"{round(seconds, 1)} seconds" |
| minutes = seconds / 60 |
| if minutes < 60: |
| return f"{round(minutes, 1)} minutes" |
| return f"{round(minutes / 60, 1)} hours" |
|
|
|
|
def estimate_full_book(
    total_pages: int,
    sample_page_count: int,
    selected: dict[str, Any],
    dry_run: dict[str, Any],
    audio_smoke: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Extrapolate sample measurements to the whole book.

    Args:
        total_pages: Page count of the full PDF; a falsy value disables the
            totals (they are reported as ``None`` / "unknown").
        sample_page_count: Number of sampled pages; clamped to at least 1.
        selected: Chosen benchmark entry; its ``"seconds"`` value is read.
        dry_run: Dry-run result; its ``"speechCharacters"`` value is read.
        audio_smoke: Optional audio smoke result; ``"audioSpeechCharacters"``
            and ``"elapsedSeconds"`` are used to derive a per-character TTS
            rate when both are positive.

    Returns:
        Dictionary of per-page rates, totals and human-readable labels. The
        total time is OCR time plus TTS time, with TTS counted as 0 when no
        TTS estimate is available.
    """
    pages_in_sample = max(sample_page_count, 1)
    per_page_ocr = float(selected.get("seconds") or 0) / pages_in_sample
    per_page_chars = float(dry_run.get("speechCharacters") or 0) / pages_in_sample

    if total_pages:
        multiplier = total_pages / pages_in_sample
        ocr_total = round(per_page_ocr * total_pages, 2)
        chars_total = int(round(per_page_chars * total_pages))
    else:
        multiplier = 0
        ocr_total = None
        chars_total = None

    per_char_tts = None
    tts_total = None
    if audio_smoke:
        synthesized = int(audio_smoke.get("audioSpeechCharacters") or 0)
        took = float(audio_smoke.get("elapsedSeconds") or 0)
        if synthesized > 0 and took > 0 and chars_total is not None:
            per_char_tts = took / synthesized
            tts_total = round(per_char_tts * chars_total, 2)

    if ocr_total is None:
        grand_total = None
    else:
        grand_total = ocr_total + (tts_total or 0)

    report: dict[str, Any] = {
        "basis": "sample",
        "totalPages": total_pages,
        "samplePages": pages_in_sample,
        "pagesMultiplier": round(multiplier, 2),
        "ocrSecondsPerPage": round(per_page_ocr, 2),
        "estimatedOcrSeconds": ocr_total,
        "estimatedOcrTime": seconds_label(ocr_total),
        "speechCharactersPerPage": round(per_page_chars, 2),
        "estimatedSpeechCharacters": chars_total,
        "ttsSecondsPerCharacter": None if per_char_tts is None else round(per_char_tts, 5),
        "estimatedTtsSeconds": tts_total,
        "estimatedTtsTime": seconds_label(tts_total),
        "estimatedTotalSeconds": None if grand_total is None else round(grand_total, 2),
        "estimatedTotalTime": seconds_label(grand_total),
        "note": "Estimate is based on selected sample pages; dense or scanned pages can vary a lot.",
    }
    return report
|
|
|
|
def build_next_steps(result: dict[str, Any]) -> list[str]:
    """Turn a workflow result into an ordered list of user-facing advice.

    If the dry run is not ready for TTS, only that single warning is returned.
    Otherwise the list covers OCR quality, recommended settings, the audio
    smoke test, voice benchmarking, follow-up commands and runtime warnings,
    in that order.
    """
    dry = result.get("dryRun") or {}
    estimate = result.get("estimateFullBook") or {}
    env_line = result.get("recommendedEnvText") or ""
    smoke = result.get("audioSmoke")

    if not dry.get("readyForTts"):
        return [
            "OCR text is not ready for TTS. Try more sample pages, another OCR engine, or higher render zoom before creating audio."
        ]

    advice: list[str] = []

    if dry.get("quality") != "warning":
        advice.append("OCR quality is usable for TTS on the selected sample.")
    else:
        reasons = "; ".join(dry.get("qualityReasons") or [])
        detail = f" Warning reasons: {reasons}." if reasons else ""
        advice.append(f"OCR is usable but should be checked before full-book TTS.{detail}")

    if env_line:
        advice.append(f"Apply these OCR settings for the full book: {env_line}.")

    advice.append(
        "Listen to the audio smoke file before processing the full book."
        if smoke
        else "Run again with --verify-audio to check pronunciation before processing the full book."
    )

    voices = result.get("voiceBenchmark") or {}
    if voices.get("ready"):
        advice.append("Listen to the voice benchmark files and choose the most natural Arabic voice before the full-book run.")
    elif result.get("voiceBenchmarkRequested"):
        advice.append("Voice benchmarking did not produce audio. Run scripts/preflight_check.py and install the missing local voice setup.")

    commands = result.get("commands") or {}
    labelled_commands = (
        ("dryRunRecommended", "Recommended dry run command"),
        ("externalTtsSample", "External voice comparison sample"),
        ("externalOcrSample", "External OCR model image sample"),
    )
    for key, label in labelled_commands:
        command = commands.get(key)
        if command:
            advice.append(f"{label}: {command}")

    total = estimate.get("estimatedTotalSeconds")
    tts = estimate.get("estimatedTtsSeconds")
    if isinstance(total, (int, float)):
        if total >= 3600:
            advice.append("Estimated runtime is long. Use the Docker worker or an always-on computer, and process a small sample first.")
        elif total >= 600:
            advice.append("Estimated runtime is more than a few minutes. Keep the browser open or use the worker path for the full book.")

    slow_silma = (
        isinstance(tts, (int, float))
        and result.get("smokeVoiceId") == "silma-local"
        and tts >= 1800
    )
    if slow_silma:
        advice.append("SILMA sounds better but may be slow for the full book. Use --voice-id espeak-ar-clear for a faster fallback smoke test.")
    return advice
|
|
|
|
| def resolve_smoke_voice(voice_id: str | None = None) -> str: |
| if voice_id and voice_id != "auto": |
| return voice_id |
| if main.find_silma_python() is not None or main.importlib.util.find_spec("silma_tts") is not None: |
| return "silma-local" |
| if main.find_habibi_python() is not None: |
| return "habibi-msa" |
| if main.find_supertonic_python() is not None or main.importlib.util.find_spec("supertonic") is not None: |
| return "supertonic-ar" |
| if main.find_espeak_ng() is not None: |
| return "espeak-ar-clear" |
| return "silma-local" |
|
|
|
|
def write_env_snippet(path: Path, result: dict[str, Any]) -> None:
    """Write the recommended OCR settings from *result* to *path*.

    The file starts with a comment header naming the source and sample PDFs,
    followed by one ``KEY=value`` line per setting in sorted key order, or a
    single comment when there are no settings. Missing parent directories are
    created. The file is UTF-8 and ends with a newline.
    """
    settings = result.get("recommendedEnv") or {}
    path.parent.mkdir(parents=True, exist_ok=True)

    header = [
        "# Arabic PDF Reader OCR settings",
        "# Generated by scripts/prepare_book_workflow.py",
        f"# Source PDF: {result.get('pdf', '')}",
        f"# Sample PDF: {result.get('sample', {}).get('output', '')}",
        "",
    ]
    if settings:
        body = [f"{name}={settings[name]}" for name in sorted(settings)]
    else:
        body = ["# No OCR settings were needed for this sample."]

    content = "\n".join([*header, *body]) + "\n"
    path.write_text(content, encoding="utf-8")
|
|
|
|
def markdown_value(value: Any) -> str:
    """Format a value for the Markdown report.

    ``None`` and the empty string become ``"-"``, booleans become
    ``"yes"``/``"no"``, and everything else is converted with ``str()``.
    """
    if value is None:
        return "-"
    if value == "":
        return "-"
    # bool cannot be subclassed, so identity checks cover every boolean.
    if value is True:
        return "yes"
    if value is False:
        return "no"
    return str(value)
|
|
|
|
def fenced_block(language: str, text: str) -> list[str]:
    """Return the three lines of a Markdown fenced code block.

    *text* is stripped of surrounding whitespace; if nothing remains, a
    single ``-`` is used as the block content.
    """
    opening = f"```{language}"
    content = text.strip()
    if not content:
        content = "-"
    return [opening, content, "```"]
|
|
|
|
def _markdown_table_cell(value: Any) -> str:
    """Make *value* safe to place inside one Markdown table cell."""
    # An unescaped pipe starts a new cell and a line break ends the row, so
    # escape the former and flatten the latter to spaces.
    text = str(value).replace("|", "\\|")
    return " ".join(text.splitlines())


def write_markdown_report(path: Path, result: dict[str, Any]) -> None:
    """Write a Markdown report describing a workflow *result* to *path*.

    Sections, in order: book details, OCR benchmark table, selected OCR,
    recommended OCR settings, dry run, audio smoke, voice benchmark (only when
    one was produced or requested), full-book estimate, commands, next steps.
    Missing values are rendered as ``-``. Table cells are sanitized so error
    messages containing ``|`` or line breaks cannot break the table layout.
    Missing parent directories of *path* are created; the file is UTF-8.
    """
    selected = result.get("selected") or {}
    sample = result.get("sample") or {}
    dry_run = result.get("dryRun") or {}
    audio = result.get("audioSmoke") or {}
    voice_benchmark = result.get("voiceBenchmark") or {}
    estimate = result.get("estimateFullBook") or {}
    commands = result.get("commands") or {}
    next_steps = result.get("nextSteps") or []

    selected_pages = ", ".join(str(page) for page in sample.get("pages", [])) or "-"
    command_text = "\n".join(command for command in commands.values() if command)

    benchmark_lines = [
        "| Engine | Quality | Score | Arabic words | Fragment ratio | Extraction | Notes |",
        "| --- | --- | --- | --- | --- | --- | --- |",
    ]
    for item in result.get("benchmark") or []:
        succeeded = item.get("ok")
        if succeeded:
            notes = "; ".join(item.get("qualityReasons") or [])
        else:
            # Failed engines show their error text instead of quality notes.
            notes = item.get("error") or "failed"
        cells = [
            item.get("engine"),
            item.get("quality") if succeeded else "failed",
            item.get("qualityScore"),
            item.get("arabicWords"),
            item.get("fragmentLineRatio"),
            item.get("extraction"),
            notes,
        ]
        rendered = [_markdown_table_cell(markdown_value(cell)) for cell in cells]
        benchmark_lines.append("| " + " | ".join(rendered) + " |")

    lines = [
        "# Arabic Audio Preparation Report",
        "",
        "## Book",
        "",
        f"- PDF: {markdown_value(result.get('pdf'))}",
        f"- Total pages: {markdown_value(result.get('totalPages'))}",
        f"- Sample PDF: {markdown_value(sample.get('output'))}",
        f"- Sample pages: {selected_pages}",
        f"- Engine preset: {markdown_value(result.get('enginePreset'))}",
        "",
        "## Benchmark Results",
        "",
        *benchmark_lines,
        "",
        "## Selected OCR",
        "",
        f"- Extraction: {markdown_value(selected.get('extraction'))}",
        f"- Engine: {markdown_value(selected.get('engine'))}",
        f"- Quality score: {markdown_value(selected.get('qualityScore'))}",
        f"- Arabic words: {markdown_value(selected.get('arabicWords'))}",
        f"- Sample OCR time: {markdown_value(selected.get('seconds'))} seconds",
        "",
        "## Recommended OCR Settings",
        "",
        *fenced_block("text", result.get("recommendedEnvText") or "No OCR settings were needed for this sample."),
        "",
        "## Dry Run",
        "",
        f"- Quality: {markdown_value(dry_run.get('quality'))}",
        f"- Quality reasons: {markdown_value('; '.join(dry_run.get('qualityReasons') or []))}",
        f"- Ready for TTS: {markdown_value(dry_run.get('readyForTts'))}",
        f"- Speech characters: {markdown_value(dry_run.get('speechCharacters'))}",
        f"- One-letter Arabic word ratio: {markdown_value(dry_run.get('singleArabicWordRatio'))}",
        f"- Low-information line ratio: {markdown_value(dry_run.get('fragmentLineRatio'))}",
        f"- Chunks: {markdown_value(dry_run.get('chunks'))}",
        f"- Extraction: {markdown_value(dry_run.get('extraction'))}",
        "",
    ]

    if audio:
        lines.extend(
            [
                "## Audio Smoke",
                "",
                f"- Voice: {markdown_value(result.get('smokeVoiceId') or audio.get('voiceId'))}",
                f"- Engine: {markdown_value(audio.get('engine'))}",
                f"- Speech characters synthesized: {markdown_value(audio.get('audioSpeechCharacters'))}",
                f"- Audio seconds: {markdown_value(audio.get('seconds'))}",
                f"- Output: {markdown_value(audio.get('path'))}",
                "",
            ]
        )
    else:
        lines.extend(["## Audio Smoke", "", "- Not run. Use `--verify-audio` to create a short pronunciation sample.", ""])

    if voice_benchmark:
        lines.extend(
            [
                "## Voice Benchmark",
                "",
                f"- Output directory: {markdown_value(voice_benchmark.get('outputDir'))}",
                f"- Text characters: {markdown_value(voice_benchmark.get('textCharacters'))}",
                f"- Audio format: {markdown_value(voice_benchmark.get('audioFormat'))}",
                f"- Recommended starting voice: {markdown_value((voice_benchmark.get('recommended') or {}).get('voiceId'))}",
                f"- Fastest successful voice: {markdown_value((voice_benchmark.get('fastest') or {}).get('voiceId'))}",
                "",
                "| Voice | Label | Engine | Status | Time | Audio | Notes |",
                "| --- | --- | --- | --- | ---: | --- | --- |",
            ]
        )
        for item in voice_benchmark.get("results", []):
            status = "ok" if item.get("ok") else "failed"
            voice = _markdown_table_cell(markdown_value(item.get("voiceId")))
            label = _markdown_table_cell(markdown_value(item.get("label")))
            engine = _markdown_table_cell(markdown_value(item.get("engine")))
            elapsed = _markdown_table_cell(item.get("elapsedSeconds", "-"))
            audio_path = _markdown_table_cell(item.get("path", "-"))
            notes = _markdown_table_cell(item.get("error", ""))
            lines.append(
                f"| {voice} | {label} | {engine} | {status} | {elapsed} | {audio_path} | {notes} |"
            )
        lines.append("")
    elif result.get("voiceBenchmarkRequested"):
        lines.extend(["## Voice Benchmark", "", "- Not run because no usable cleaned OCR sample was available.", ""])

    lines.extend(
        [
            "## Full Book Estimate",
            "",
            f"- Estimated OCR time: {markdown_value(estimate.get('estimatedOcrTime'))}",
            f"- Estimated TTS time: {markdown_value(estimate.get('estimatedTtsTime'))}",
            f"- Estimated total time: {markdown_value(estimate.get('estimatedTotalTime'))}",
            f"- Estimated speech characters: {markdown_value(estimate.get('estimatedSpeechCharacters'))}",
            f"- Basis: {markdown_value(estimate.get('basis'))}",
            "",
            "> Estimates are based on the selected sample pages. Dense scanned pages, marginal scans, and different fonts can change runtime and quality.",
            "",
            "## Commands",
            "",
            *fenced_block("powershell", command_text),
            "",
            "## Next Steps",
            "",
        ]
    )
    if next_steps:
        lines.extend(f"- {step}" for step in next_steps)
    else:
        lines.append("- No next steps were generated.")
    lines.append("")

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines), encoding="utf-8")
|
|
|
|
def prepare_book_workflow(
    pdf_path: Path,
    sample_pages: int = 1,
    skip_first: int = 0,
    engines: list[str] | None = None,
    engine_preset: str = "balanced",
    chunk_size: int = 900,
    verify_audio: bool = False,
    voice_id: str | None = "auto",
    audio_out: Path | None = None,
    audio_max_chars: int = 1200,
    benchmark_voices: bool = False,
    voice_ids: list[str] | None = None,
    voice_benchmark_out_dir: Path | None = None,
    voice_benchmark_format: str = "wav",
    voice_benchmark_max_chars: int | None = None,
) -> dict[str, Any]:
    """Benchmark OCR on a sample of *pdf_path* and assemble a preparation result.

    Steps: build a sample PDF in the system temp directory, benchmark each
    OCR engine on it, dry-run the best extraction, optionally synthesize an
    audio smoke test and a voice benchmark, then estimate the full-book
    runtime and collect follow-up commands and next steps.

    Args:
        pdf_path: Source PDF; must exist and have a ``.pdf`` suffix.
        sample_pages: Number of pages requested for the sample PDF.
        skip_first: Leading pages to ignore when selecting the sample.
        engines: Explicit engine list; falls back to the *engine_preset* list.
        engine_preset: Key of ``ENGINE_PRESETS``; validated even when
            *engines* is given.
        chunk_size: Chunk size passed to the dry run.
        verify_audio: When true, run the audio smoke test.
        voice_id: Voice for the smoke test; ``"auto"``/``None`` is resolved
            with ``resolve_smoke_voice``.
        audio_out: Smoke-test output path; defaults under
            ``ROOT_DIR/outputs``.
        audio_max_chars: Character limit for the smoke test.
        benchmark_voices: When true and the dry run is ready for TTS, compare
            voices on the cleaned sample text.
        voice_ids: Voices to compare; defaults to ``DEFAULT_VOICES``.
        voice_benchmark_out_dir: Output directory for the voice benchmark.
        voice_benchmark_format: Audio format for the voice benchmark.
        voice_benchmark_max_chars: Sample-text limit for the dry run; defaults
            to *audio_max_chars*.

    Returns:
        Result dictionary. When no engine succeeds it has ``ready=False`` and
        an ``error`` message, plus the book metadata and next steps, so
        reports and JSON output stay informative on failure.

    Raises:
        FileNotFoundError: If *pdf_path* does not exist.
        ValueError: If the file is not a ``.pdf`` or the preset is unknown.
    """
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")
    if pdf_path.suffix.lower() != ".pdf":
        raise ValueError("Input must be a PDF file.")

    with fitz.open(pdf_path) as document:
        total_pages = document.page_count
    if engine_preset not in ENGINE_PRESETS:
        raise ValueError(f"Unknown engine preset: {engine_preset}")
    engines = engines or ENGINE_PRESETS[engine_preset]
    # Report "custom" whenever the engine list differs from the named preset.
    preset_label = engine_preset if engines == ENGINE_PRESETS[engine_preset] else "custom"

    sample_pdf = Path(tempfile.gettempdir()) / f"{pdf_path.stem}-arabic-audio-sample-{sample_pages}.pdf"
    sample_info = build_test_pdf(pdf_path, sample_pdf, count=sample_pages, skip_first=skip_first)

    benchmark_results = [benchmark_engine(sample_pdf, engine) for engine in engines]
    best = choose_best_result(benchmark_results)
    if best is None:
        failure: dict[str, Any] = {
            "pdf": str(pdf_path),
            "totalPages": total_pages,
            "sample": sample_info,
            "benchmark": benchmark_results,
            "enginePreset": preset_label,
            "ready": False,
            "error": "No OCR engine produced usable Arabic text on the sample.",
        }
        # Without a dry run this yields the "OCR text is not ready" guidance,
        # so the failure result carries the same advice as other outcomes.
        failure["nextSteps"] = build_next_steps(failure)
        return failure

    extraction = str(best.get("extraction") or "")
    recommendation = best.get("recommendation")
    speech_sample_chars = voice_benchmark_max_chars if voice_benchmark_max_chars is not None else audio_max_chars
    dry_run = dry_run_pdf(
        sample_pdf,
        chunk_size=chunk_size,
        from_extraction=extraction,
        speech_sample_chars=speech_sample_chars,
    )

    audio_result = None
    voice_benchmark_result = None
    # Resolved even without --verify-audio because the suggested commands
    # embed the voice id.
    resolved_voice_id = resolve_smoke_voice(voice_id)
    if verify_audio:
        output = audio_out or (ROOT_DIR / "outputs" / f"{pdf_path.stem}-sample-smoke.wav")
        audio_result = verify_pipeline(
            sample_pdf,
            resolved_voice_id,
            output,
            from_extraction=extraction,
            max_speech_chars=audio_max_chars,
        )
    if benchmark_voices and dry_run.get("readyForTts"):
        sample_text = str(dry_run.get("speechSampleText") or dry_run.get("speechPreview") or "").strip()
        if sample_text:
            output_dir = voice_benchmark_out_dir or (ROOT_DIR / "outputs" / f"{pdf_path.stem}-voice-benchmark")
            selected_voices = voice_ids or DEFAULT_VOICES
            voice_benchmark_result = benchmark_voice_set(
                voices=selected_voices,
                text=sample_text,
                output_dir=output_dir,
                audio_format=voice_benchmark_format,
            )

    estimate = estimate_full_book(
        total_pages=total_pages,
        sample_page_count=len(sample_info.get("pages", [])) or sample_pages,
        selected=best,
        dry_run=dry_run,
        audio_smoke=audio_result,
    )
    commands = build_commands(
        pdf_path=pdf_path,
        extraction=extraction,
        voice_id=resolved_voice_id,
        audio_max_chars=audio_max_chars,
        audio_out=audio_out,
    )

    result = {
        "pdf": str(pdf_path),
        "totalPages": total_pages,
        "sample": sample_info,
        "benchmark": benchmark_results,
        "enginePreset": preset_label,
        "selected": best,
        "recommendation": recommendation,
        "recommendedEnv": recommendation.get("env", {}) if recommendation else {},
        "recommendedEnvText": env_text(recommendation.get("env", {})) if recommendation else "",
        "dryRun": dry_run,
        "audioSmoke": audio_result,
        "smokeVoiceId": resolved_voice_id,
        "voiceBenchmark": voice_benchmark_result,
        "voiceBenchmarkRequested": benchmark_voices,
        "estimateFullBook": estimate,
        "commands": commands,
        # Ready needs a TTS-ready dry run and, if audio was requested, a result.
        "ready": bool(dry_run.get("readyForTts") and (audio_result is not None if verify_audio else True)),
    }
    result["nextSteps"] = build_next_steps(result)
    return result
|
|
|
|
def print_summary(result: dict[str, Any]) -> None:
    """Print a compact, human-readable summary of a workflow *result*.

    Prints the sample path, selected OCR, recommended settings summary, dry
    run status, optional audio smoke and voice benchmark lines, the runtime
    estimate, the next steps, any error message, and a final ``Ready`` line.
    Missing sections are skipped or shown as ``-``.
    """
    selected = result.get("selected") or {}
    recommendation = result.get("recommendation") or {}
    dry_run = result.get("dryRun") or {}
    # `or {}` also covers a "sample" key that is present but None.
    sample = result.get("sample") or {}

    print("Arabic book preparation")
    print(f"Sample: {sample.get('output', '-')}")
    print(f"Selected OCR: {selected.get('extraction', '-')} score={selected.get('qualityScore', '-')}")
    if recommendation:
        print(f"Full-book settings: {recommendation.get('summary')}")
    print(
        f"Dry run: quality={dry_run.get('quality', '-')} readyForTts={dry_run.get('readyForTts', False)} "
        f"speechChars={dry_run.get('speechCharacters', 0)}"
    )

    audio = result.get("audioSmoke")
    if audio:
        print(f"Audio smoke: {audio.get('path')} {audio.get('seconds')}s {audio.get('bytes')} bytes")

    voice_benchmark = result.get("voiceBenchmark") or {}
    if voice_benchmark:
        voice_results = voice_benchmark.get("results", [])
        successful = [item for item in voice_results if item.get("ok")]
        print(f"Voice benchmark: {len(successful)}/{len(voice_results)} voices wrote to {voice_benchmark.get('outputDir')}")

    estimate = result.get("estimateFullBook") or {}
    if estimate:
        print(
            f"Estimate: OCR {estimate.get('estimatedOcrTime')} "
            f"TTS {estimate.get('estimatedTtsTime')} total {estimate.get('estimatedTotalTime')}"
        )

    steps = result.get("nextSteps") or []
    if steps:
        print("Next steps:")
        for step in steps:
            print(f"- {step}")

    # Show why the run failed; otherwise a failed result would only print
    # "Ready: no" with no explanation.
    error = result.get("error")
    if error:
        print(f"Error: {error}")
    print(f"Ready: {'yes' if result.get('ready') else 'no'}")
|
|
|
|
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser used by ``main_cli``."""
    cli = argparse.ArgumentParser(description="Benchmark, dry-run, and optionally audio-smoke an Arabic PDF sample.")
    cli.add_argument("pdf", type=Path, help="Arabic PDF to prepare")

    # Sampling and OCR engine selection.
    cli.add_argument("--sample-pages", type=int, default=1, help="Number of informative pages to sample.")
    cli.add_argument("--skip-first", type=int, default=0, help="Ignore the first N pages when selecting sample pages.")
    cli.add_argument(
        "--engine-preset",
        choices=sorted(ENGINE_PRESETS),
        default="balanced",
        help="OCR engine preset to use when --engines is not provided. balanced is the recommended free Arabic-trained stack.",
    )
    engine_choices = [
        "arabic",
        "arabic-max",
        "qari-ocr",
        "tawkeed-ocr",
        "katib-ocr",
        "arabic-qwen-ocr",
        "arabic-glm-ocr",
        "baseer-ocr",
        "easyocr",
        "paddleocr",
        "paddleocr-vl",
        "surya",
        "tesseract",
        "auto",
        "best",
    ]
    cli.add_argument(
        "--engines",
        nargs="+",
        default=None,
        choices=engine_choices,
        help="OCR engines to benchmark on the sample.",
    )
    cli.add_argument("--chunk-size", type=int, default=900, help="Dry-run chunk size.")

    # Audio smoke test.
    cli.add_argument("--verify-audio", action="store_true", help="Also create a short audio smoke test from the sample.")
    cli.add_argument("--voice-id", default="auto", help="Local voice id for --verify-audio. Use auto to prefer SILMA when installed.")
    cli.add_argument("--audio-out", type=Path, help="Audio output path for --verify-audio.")
    cli.add_argument(
        "--audio-max-chars",
        type=int,
        default=1200,
        help="Maximum cleaned characters to synthesize for --verify-audio.",
    )

    # Voice benchmark.
    cli.add_argument(
        "--benchmark-voices",
        action="store_true",
        help="Compare local voices using the cleaned OCR sample text.",
    )
    cli.add_argument(
        "--voices",
        nargs="+",
        choices=list(main.LOCAL_VOICES),
        help="Voice ids to compare with --benchmark-voices.",
    )
    cli.add_argument(
        "--voice-benchmark-out-dir",
        type=Path,
        help="Output directory for --benchmark-voices audio files.",
    )
    cli.add_argument(
        "--voice-benchmark-format",
        choices=["wav", "mp3"],
        default="wav",
        help="Audio format for --benchmark-voices.",
    )
    cli.add_argument(
        "--voice-benchmark-max-chars",
        type=int,
        help="Maximum cleaned OCR characters to use for --benchmark-voices. Defaults to --audio-max-chars.",
    )

    # Output options.
    cli.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.")
    cli.add_argument(
        "--write-env",
        type=Path,
        help="Write recommended OCR settings to a small .env snippet. Secrets are never written.",
    )
    cli.add_argument(
        "--write-report",
        type=Path,
        help="Write a readable Markdown report with OCR settings, estimates, commands, and next steps.",
    )
    return cli


def main_cli() -> None:
    """Command-line entry point: run the workflow and report the outcome.

    Parses the arguments, runs ``prepare_book_workflow``, optionally writes
    the env snippet and Markdown report, prints JSON or a summary, and exits
    with status 1 when the result is not marked ready.
    """
    # Switch stdout to UTF-8 where supported; unencodable characters are
    # replaced rather than raising.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

    args = _build_arg_parser().parse_args()

    result = prepare_book_workflow(
        args.pdf,
        sample_pages=args.sample_pages,
        skip_first=args.skip_first,
        engines=args.engines,
        engine_preset=args.engine_preset,
        chunk_size=args.chunk_size,
        verify_audio=args.verify_audio,
        voice_id=args.voice_id,
        audio_out=args.audio_out,
        audio_max_chars=args.audio_max_chars,
        benchmark_voices=args.benchmark_voices,
        voice_ids=args.voices,
        voice_benchmark_out_dir=args.voice_benchmark_out_dir,
        voice_benchmark_format=args.voice_benchmark_format,
        voice_benchmark_max_chars=args.voice_benchmark_max_chars,
    )

    env_target = args.write_env
    if env_target:
        write_env_snippet(env_target, result)
        result["writtenEnv"] = str(env_target)
        # Rebuild the commands so they point at the written env file, then
        # refresh the next steps that quote those commands.
        chosen_extraction = str(result.get("selected", {}).get("extraction") or "")
        chosen_voice = str(result.get("smokeVoiceId") or args.voice_id)
        result["commands"] = build_commands(
            pdf_path=args.pdf,
            extraction=chosen_extraction,
            voice_id=chosen_voice,
            audio_max_chars=args.audio_max_chars,
            audio_out=args.audio_out,
            env_file=env_target,
        )
        result["nextSteps"] = build_next_steps(result)

    report_target = args.write_report
    if report_target:
        write_markdown_report(report_target, result)
        result["writtenReport"] = str(report_target)

    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        print_summary(result)

    if not result.get("ready"):
        raise SystemExit(1)
|
|
|
|
# Run the command-line workflow only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main_cli()
|
|