from __future__ import annotations import argparse import json import sys import tempfile from pathlib import Path from typing import Any import fitz ROOT_DIR = Path(__file__).resolve().parent.parent if str(ROOT_DIR) not in sys.path: sys.path.insert(0, str(ROOT_DIR)) from scripts.benchmark_ocr import benchmark_engine from scripts.benchmark_voices import DEFAULT_VOICES, benchmark_voices as benchmark_voice_set from scripts.dry_run_pdf import dry_run_pdf from scripts.select_test_pages import build_test_pdf from scripts.verify_pipeline import verify_pipeline from app import main ENGINE_PRESETS: dict[str, list[str]] = { "practical": ["arabic", "paddleocr", "tesseract"], "balanced": [ "arabic-max", "arabic", "tawkeed-ocr", "katib-ocr", "arabic-qwen-ocr", "arabic-glm-ocr", "baseer-ocr", "paddleocr", "tesseract", ], "maximum": [ "arabic-max", "arabic", "tawkeed-ocr", "katib-ocr", "arabic-qwen-ocr", "arabic-glm-ocr", "baseer-ocr", "qari-ocr", "paddleocr-vl", "paddleocr", "tesseract", ], } def choose_best_result(results: list[dict[str, Any]]) -> dict[str, Any] | None: successful = [item for item in results if item.get("ok")] if not successful: return None return max(successful, key=lambda item: (item.get("qualityScore", 0), item.get("arabicWords", 0), -item.get("seconds", 0))) def env_text(env: dict[str, str]) -> str: return " ".join(f"{key}={value}" for key, value in env.items()) def quote_arg(value: str | Path) -> str: text = str(value) if not text: return '""' if any(char.isspace() for char in text): return f'"{text}"' return text def build_commands( pdf_path: Path, extraction: str, voice_id: str, audio_max_chars: int, audio_out: Path | None = None, env_file: Path | None = None, external_tts_out_dir: Path | None = None, external_ocr_out_dir: Path | None = None, ) -> dict[str, str]: pdf = quote_arg(pdf_path) extraction_arg = quote_arg(extraction) settings_arg = f"--env-file {quote_arg(env_file)}" if env_file else f"--from-extraction {extraction_arg}" smoke_out = quote_arg(audio_out or (ROOT_DIR / "outputs" / f"{pdf_path.stem}-sample-smoke.wav")) tts_sample_out = quote_arg(external_tts_out_dir or (ROOT_DIR / "outputs" / "external-tts-sample")) ocr_sample_out = quote_arg(external_ocr_out_dir or (ROOT_DIR / "outputs" / "external-ocr-sample")) return { "dryRunRecommended": f"python scripts\\dry_run_pdf.py {pdf} {settings_arg}", "audioSmokeRecommended": ( f"python scripts\\verify_pipeline.py --pdf {pdf} {settings_arg} " f"--voice-id {voice_id} --max-speech-chars {audio_max_chars} --out {smoke_out}" ), "externalTtsSample": f"python scripts\\export_tts_sample.py {pdf} {settings_arg} --out-dir {tts_sample_out}", "externalOcrSample": f"python scripts\\export_ocr_sample_images.py {pdf} --out-dir {ocr_sample_out}", "fullPipelineRecommended": ( f"python scripts\\verify_pipeline.py --pdf {pdf} {settings_arg} " f"--voice-id {voice_id} --out outputs\\full-book-smoke.wav" ), } def seconds_label(seconds: float | None) -> str: if seconds is None: return "unknown" if seconds < 60: return f"{round(seconds, 1)} seconds" minutes = seconds / 60 if minutes < 60: return f"{round(minutes, 1)} minutes" return f"{round(minutes / 60, 1)} hours" def estimate_full_book( total_pages: int, sample_page_count: int, selected: dict[str, Any], dry_run: dict[str, Any], audio_smoke: dict[str, Any] | None = None, ) -> dict[str, Any]: sample_page_count = max(sample_page_count, 1) pages_multiplier = total_pages / sample_page_count if total_pages else 0 ocr_seconds_per_page = float(selected.get("seconds") or 0) / sample_page_count speech_chars_per_page = float(dry_run.get("speechCharacters") or 0) / sample_page_count estimated_ocr_seconds = round(ocr_seconds_per_page * total_pages, 2) if total_pages else None estimated_speech_chars = int(round(speech_chars_per_page * total_pages)) if total_pages else None tts_seconds_per_char = None estimated_tts_seconds = None if audio_smoke: audio_chars = int(audio_smoke.get("audioSpeechCharacters") or 0) elapsed = float(audio_smoke.get("elapsedSeconds") or 0) if audio_chars > 0 and elapsed > 0 and estimated_speech_chars is not None: tts_seconds_per_char = elapsed / audio_chars estimated_tts_seconds = round(tts_seconds_per_char * estimated_speech_chars, 2) estimated_total_seconds = None if estimated_ocr_seconds is not None: estimated_total_seconds = estimated_ocr_seconds + (estimated_tts_seconds or 0) return { "basis": "sample", "totalPages": total_pages, "samplePages": sample_page_count, "pagesMultiplier": round(pages_multiplier, 2), "ocrSecondsPerPage": round(ocr_seconds_per_page, 2), "estimatedOcrSeconds": estimated_ocr_seconds, "estimatedOcrTime": seconds_label(estimated_ocr_seconds), "speechCharactersPerPage": round(speech_chars_per_page, 2), "estimatedSpeechCharacters": estimated_speech_chars, "ttsSecondsPerCharacter": round(tts_seconds_per_char, 5) if tts_seconds_per_char is not None else None, "estimatedTtsSeconds": estimated_tts_seconds, "estimatedTtsTime": seconds_label(estimated_tts_seconds), "estimatedTotalSeconds": round(estimated_total_seconds, 2) if estimated_total_seconds is not None else None, "estimatedTotalTime": seconds_label(estimated_total_seconds), "note": "Estimate is based on selected sample pages; dense or scanned pages can vary a lot.", } def build_next_steps(result: dict[str, Any]) -> list[str]: steps: list[str] = [] dry_run = result.get("dryRun") or {} estimate = result.get("estimateFullBook") or {} env = result.get("recommendedEnvText") or "" audio = result.get("audioSmoke") if not dry_run.get("readyForTts"): steps.append("OCR text is not ready for TTS. Try more sample pages, another OCR engine, or higher render zoom before creating audio.") return steps if dry_run.get("quality") == "warning": reasons = "; ".join(dry_run.get("qualityReasons") or []) detail = f" Warning reasons: {reasons}." if reasons else "" steps.append(f"OCR is usable but should be checked before full-book TTS.{detail}") else: steps.append("OCR quality is usable for TTS on the selected sample.") if env: steps.append(f"Apply these OCR settings for the full book: {env}.") if audio: steps.append("Listen to the audio smoke file before processing the full book.") else: steps.append("Run again with --verify-audio to check pronunciation before processing the full book.") voice_benchmark = result.get("voiceBenchmark") or {} if voice_benchmark.get("ready"): steps.append("Listen to the voice benchmark files and choose the most natural Arabic voice before the full-book run.") elif result.get("voiceBenchmarkRequested"): steps.append("Voice benchmarking did not produce audio. Run scripts/preflight_check.py and install the missing local voice setup.") commands = result.get("commands") or {} if commands.get("dryRunRecommended"): steps.append(f"Recommended dry run command: {commands['dryRunRecommended']}") if commands.get("externalTtsSample"): steps.append(f"External voice comparison sample: {commands['externalTtsSample']}") if commands.get("externalOcrSample"): steps.append(f"External OCR model image sample: {commands['externalOcrSample']}") total_seconds = estimate.get("estimatedTotalSeconds") tts_seconds = estimate.get("estimatedTtsSeconds") if isinstance(total_seconds, (int, float)) and total_seconds >= 3600: steps.append("Estimated runtime is long. Use the Docker worker or an always-on computer, and process a small sample first.") elif isinstance(total_seconds, (int, float)) and total_seconds >= 600: steps.append("Estimated runtime is more than a few minutes. Keep the browser open or use the worker path for the full book.") if isinstance(tts_seconds, (int, float)) and result.get("smokeVoiceId") == "silma-local" and tts_seconds >= 1800: steps.append("SILMA sounds better but may be slow for the full book. Use --voice-id espeak-ar-clear for a faster fallback smoke test.") return steps def resolve_smoke_voice(voice_id: str | None = None) -> str: if voice_id and voice_id != "auto": return voice_id if main.find_silma_python() is not None or main.importlib.util.find_spec("silma_tts") is not None: return "silma-local" if main.find_habibi_python() is not None: return "habibi-msa" if main.find_supertonic_python() is not None or main.importlib.util.find_spec("supertonic") is not None: return "supertonic-ar" if main.find_espeak_ng() is not None: return "espeak-ar-clear" return "silma-local" def write_env_snippet(path: Path, result: dict[str, Any]) -> None: env = result.get("recommendedEnv") or {} path.parent.mkdir(parents=True, exist_ok=True) lines = [ "# Arabic PDF Reader OCR settings", "# Generated by scripts/prepare_book_workflow.py", f"# Source PDF: {result.get('pdf', '')}", f"# Sample PDF: {result.get('sample', {}).get('output', '')}", "", ] if not env: lines.append("# No OCR settings were needed for this sample.") else: for key in sorted(env): lines.append(f"{key}={env[key]}") path.write_text("\n".join(lines) + "\n", encoding="utf-8") def markdown_value(value: Any) -> str: if value is None or value == "": return "-" if isinstance(value, bool): return "yes" if value else "no" return str(value) def fenced_block(language: str, text: str) -> list[str]: return [f"```{language}", text.strip() or "-", "```"] def write_markdown_report(path: Path, result: dict[str, Any]) -> None: selected = result.get("selected") or {} sample = result.get("sample") or {} dry_run = result.get("dryRun") or {} audio = result.get("audioSmoke") or {} voice_benchmark = result.get("voiceBenchmark") or {} estimate = result.get("estimateFullBook") or {} commands = result.get("commands") or {} next_steps = result.get("nextSteps") or [] selected_pages = ", ".join(str(page) for page in sample.get("pages", [])) or "-" command_text = "\n".join(command for command in commands.values() if command) benchmark_lines = [ "| Engine | Quality | Score | Arabic words | Fragment ratio | Extraction | Notes |", "| --- | --- | --- | --- | --- | --- | --- |", ] for item in result.get("benchmark") or []: notes = "; ".join(item.get("qualityReasons") or []) if not item.get("ok"): notes = item.get("error") or "failed" benchmark_lines.append( "| " + " | ".join( [ markdown_value(item.get("engine")), markdown_value(item.get("quality") if item.get("ok") else "failed"), markdown_value(item.get("qualityScore")), markdown_value(item.get("arabicWords")), markdown_value(item.get("fragmentLineRatio")), markdown_value(item.get("extraction")), markdown_value(notes), ] ) + " |" ) lines = [ "# Arabic Audio Preparation Report", "", "## Book", "", f"- PDF: {markdown_value(result.get('pdf'))}", f"- Total pages: {markdown_value(result.get('totalPages'))}", f"- Sample PDF: {markdown_value(sample.get('output'))}", f"- Sample pages: {selected_pages}", f"- Engine preset: {markdown_value(result.get('enginePreset'))}", "", "## Benchmark Results", "", *benchmark_lines, "", "## Selected OCR", "", f"- Extraction: {markdown_value(selected.get('extraction'))}", f"- Engine: {markdown_value(selected.get('engine'))}", f"- Quality score: {markdown_value(selected.get('qualityScore'))}", f"- Arabic words: {markdown_value(selected.get('arabicWords'))}", f"- Sample OCR time: {markdown_value(selected.get('seconds'))} seconds", "", "## Recommended OCR Settings", "", *fenced_block("text", result.get("recommendedEnvText") or "No OCR settings were needed for this sample."), "", "## Dry Run", "", f"- Quality: {markdown_value(dry_run.get('quality'))}", f"- Quality reasons: {markdown_value('; '.join(dry_run.get('qualityReasons') or []))}", f"- Ready for TTS: {markdown_value(dry_run.get('readyForTts'))}", f"- Speech characters: {markdown_value(dry_run.get('speechCharacters'))}", f"- One-letter Arabic word ratio: {markdown_value(dry_run.get('singleArabicWordRatio'))}", f"- Low-information line ratio: {markdown_value(dry_run.get('fragmentLineRatio'))}", f"- Chunks: {markdown_value(dry_run.get('chunks'))}", f"- Extraction: {markdown_value(dry_run.get('extraction'))}", "", ] if audio: lines.extend( [ "## Audio Smoke", "", f"- Voice: {markdown_value(result.get('smokeVoiceId') or audio.get('voiceId'))}", f"- Engine: {markdown_value(audio.get('engine'))}", f"- Speech characters synthesized: {markdown_value(audio.get('audioSpeechCharacters'))}", f"- Audio seconds: {markdown_value(audio.get('seconds'))}", f"- Output: {markdown_value(audio.get('path'))}", "", ] ) else: lines.extend(["## Audio Smoke", "", "- Not run. Use `--verify-audio` to create a short pronunciation sample.", ""]) if voice_benchmark: lines.extend( [ "## Voice Benchmark", "", f"- Output directory: {markdown_value(voice_benchmark.get('outputDir'))}", f"- Text characters: {markdown_value(voice_benchmark.get('textCharacters'))}", f"- Audio format: {markdown_value(voice_benchmark.get('audioFormat'))}", f"- Recommended starting voice: {markdown_value((voice_benchmark.get('recommended') or {}).get('voiceId'))}", f"- Fastest successful voice: {markdown_value((voice_benchmark.get('fastest') or {}).get('voiceId'))}", "", "| Voice | Label | Engine | Status | Time | Audio | Notes |", "| --- | --- | --- | --- | ---: | --- | --- |", ] ) for item in voice_benchmark.get("results", []): status = "ok" if item.get("ok") else "failed" elapsed = item.get("elapsedSeconds", "-") audio_path = item.get("path", "-") notes = item.get("error", "") lines.append( f"| {markdown_value(item.get('voiceId'))} | {markdown_value(item.get('label'))} | " f"{markdown_value(item.get('engine'))} | {status} | {elapsed} | {audio_path} | {notes} |" ) lines.append("") elif result.get("voiceBenchmarkRequested"): lines.extend(["## Voice Benchmark", "", "- Not run because no usable cleaned OCR sample was available.", ""]) lines.extend( [ "## Full Book Estimate", "", f"- Estimated OCR time: {markdown_value(estimate.get('estimatedOcrTime'))}", f"- Estimated TTS time: {markdown_value(estimate.get('estimatedTtsTime'))}", f"- Estimated total time: {markdown_value(estimate.get('estimatedTotalTime'))}", f"- Estimated speech characters: {markdown_value(estimate.get('estimatedSpeechCharacters'))}", f"- Basis: {markdown_value(estimate.get('basis'))}", "", "> Estimates are based on the selected sample pages. Dense scanned pages, marginal scans, and different fonts can change runtime and quality.", "", "## Commands", "", *fenced_block("powershell", command_text), "", "## Next Steps", "", ] ) if next_steps: lines.extend(f"- {step}" for step in next_steps) else: lines.append("- No next steps were generated.") lines.append("") path.parent.mkdir(parents=True, exist_ok=True) path.write_text("\n".join(lines), encoding="utf-8") def prepare_book_workflow( pdf_path: Path, sample_pages: int = 1, skip_first: int = 0, engines: list[str] | None = None, engine_preset: str = "balanced", chunk_size: int = 900, verify_audio: bool = False, voice_id: str | None = "auto", audio_out: Path | None = None, audio_max_chars: int = 1200, benchmark_voices: bool = False, voice_ids: list[str] | None = None, voice_benchmark_out_dir: Path | None = None, voice_benchmark_format: str = "wav", voice_benchmark_max_chars: int | None = None, ) -> dict[str, Any]: if not pdf_path.exists(): raise FileNotFoundError(f"PDF not found: {pdf_path}") if pdf_path.suffix.lower() != ".pdf": raise ValueError("Input must be a PDF file.") with fitz.open(pdf_path) as document: total_pages = document.page_count if engine_preset not in ENGINE_PRESETS: raise ValueError(f"Unknown engine preset: {engine_preset}") engines = engines or ENGINE_PRESETS[engine_preset] sample_pdf = Path(tempfile.gettempdir()) / f"{pdf_path.stem}-arabic-audio-sample-{sample_pages}.pdf" sample_info = build_test_pdf(pdf_path, sample_pdf, count=sample_pages, skip_first=skip_first) benchmark_results = [benchmark_engine(sample_pdf, engine) for engine in engines] best = choose_best_result(benchmark_results) if best is None: return { "pdf": str(pdf_path), "sample": sample_info, "benchmark": benchmark_results, "ready": False, "error": "No OCR engine produced usable Arabic text on the sample.", } extraction = str(best.get("extraction") or "") recommendation = best.get("recommendation") speech_sample_chars = voice_benchmark_max_chars if voice_benchmark_max_chars is not None else audio_max_chars dry_run = dry_run_pdf( sample_pdf, chunk_size=chunk_size, from_extraction=extraction, speech_sample_chars=speech_sample_chars, ) audio_result = None voice_benchmark_result = None resolved_voice_id = resolve_smoke_voice(voice_id) if verify_audio: output = audio_out or (ROOT_DIR / "outputs" / f"{pdf_path.stem}-sample-smoke.wav") audio_result = verify_pipeline( sample_pdf, resolved_voice_id, output, from_extraction=extraction, max_speech_chars=audio_max_chars, ) if benchmark_voices and dry_run.get("readyForTts"): sample_text = str(dry_run.get("speechSampleText") or dry_run.get("speechPreview") or "").strip() if sample_text: output_dir = voice_benchmark_out_dir or (ROOT_DIR / "outputs" / f"{pdf_path.stem}-voice-benchmark") selected_voices = voice_ids or DEFAULT_VOICES voice_benchmark_result = benchmark_voice_set( voices=selected_voices, text=sample_text, output_dir=output_dir, audio_format=voice_benchmark_format, ) estimate = estimate_full_book( total_pages=total_pages, sample_page_count=len(sample_info.get("pages", [])) or sample_pages, selected=best, dry_run=dry_run, audio_smoke=audio_result, ) commands = build_commands( pdf_path=pdf_path, extraction=extraction, voice_id=resolved_voice_id, audio_max_chars=audio_max_chars, audio_out=audio_out, ) result = { "pdf": str(pdf_path), "totalPages": total_pages, "sample": sample_info, "benchmark": benchmark_results, "enginePreset": engine_preset if engines == ENGINE_PRESETS[engine_preset] else "custom", "selected": best, "recommendation": recommendation, "recommendedEnv": recommendation.get("env", {}) if recommendation else {}, "recommendedEnvText": env_text(recommendation.get("env", {})) if recommendation else "", "dryRun": dry_run, "audioSmoke": audio_result, "smokeVoiceId": resolved_voice_id, "voiceBenchmark": voice_benchmark_result, "voiceBenchmarkRequested": benchmark_voices, "estimateFullBook": estimate, "commands": commands, "ready": bool(dry_run.get("readyForTts") and (audio_result is not None if verify_audio else True)), } result["nextSteps"] = build_next_steps(result) return result def print_summary(result: dict[str, Any]) -> None: selected = result.get("selected") or {} recommendation = result.get("recommendation") or {} dry_run = result.get("dryRun") or {} print("Arabic book preparation") print(f"Sample: {result.get('sample', {}).get('output', '-')}") print(f"Selected OCR: {selected.get('extraction', '-')} score={selected.get('qualityScore', '-')}") if recommendation: print(f"Full-book settings: {recommendation.get('summary')}") print( f"Dry run: quality={dry_run.get('quality', '-')} readyForTts={dry_run.get('readyForTts', False)} " f"speechChars={dry_run.get('speechCharacters', 0)}" ) audio = result.get("audioSmoke") if audio: print(f"Audio smoke: {audio.get('path')} {audio.get('seconds')}s {audio.get('bytes')} bytes") voice_benchmark = result.get("voiceBenchmark") or {} if voice_benchmark: successful = [item for item in voice_benchmark.get("results", []) if item.get("ok")] print(f"Voice benchmark: {len(successful)}/{len(voice_benchmark.get('results', []))} voices wrote to {voice_benchmark.get('outputDir')}") estimate = result.get("estimateFullBook") or {} if estimate: print( f"Estimate: OCR {estimate.get('estimatedOcrTime')} " f"TTS {estimate.get('estimatedTtsTime')} total {estimate.get('estimatedTotalTime')}" ) steps = result.get("nextSteps") or [] if steps: print("Next steps:") for step in steps: print(f"- {step}") print(f"Ready: {'yes' if result.get('ready') else 'no'}") def main_cli() -> None: if hasattr(sys.stdout, "reconfigure"): sys.stdout.reconfigure(encoding="utf-8", errors="replace") parser = argparse.ArgumentParser(description="Benchmark, dry-run, and optionally audio-smoke an Arabic PDF sample.") parser.add_argument("pdf", type=Path, help="Arabic PDF to prepare") parser.add_argument("--sample-pages", type=int, default=1, help="Number of informative pages to sample.") parser.add_argument("--skip-first", type=int, default=0, help="Ignore the first N pages when selecting sample pages.") parser.add_argument( "--engine-preset", choices=sorted(ENGINE_PRESETS), default="balanced", help="OCR engine preset to use when --engines is not provided. balanced is the recommended free Arabic-trained stack.", ) parser.add_argument( "--engines", nargs="+", default=None, choices=[ "arabic", "arabic-max", "qari-ocr", "tawkeed-ocr", "katib-ocr", "arabic-qwen-ocr", "arabic-glm-ocr", "baseer-ocr", "easyocr", "paddleocr", "paddleocr-vl", "surya", "tesseract", "auto", "best", ], help="OCR engines to benchmark on the sample.", ) parser.add_argument("--chunk-size", type=int, default=900, help="Dry-run chunk size.") parser.add_argument("--verify-audio", action="store_true", help="Also create a short audio smoke test from the sample.") parser.add_argument("--voice-id", default="auto", help="Local voice id for --verify-audio. Use auto to prefer SILMA when installed.") parser.add_argument("--audio-out", type=Path, help="Audio output path for --verify-audio.") parser.add_argument( "--audio-max-chars", type=int, default=1200, help="Maximum cleaned characters to synthesize for --verify-audio.", ) parser.add_argument( "--benchmark-voices", action="store_true", help="Compare local voices using the cleaned OCR sample text.", ) parser.add_argument( "--voices", nargs="+", choices=list(main.LOCAL_VOICES), help="Voice ids to compare with --benchmark-voices.", ) parser.add_argument( "--voice-benchmark-out-dir", type=Path, help="Output directory for --benchmark-voices audio files.", ) parser.add_argument( "--voice-benchmark-format", choices=["wav", "mp3"], default="wav", help="Audio format for --benchmark-voices.", ) parser.add_argument( "--voice-benchmark-max-chars", type=int, help="Maximum cleaned OCR characters to use for --benchmark-voices. Defaults to --audio-max-chars.", ) parser.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.") parser.add_argument( "--write-env", type=Path, help="Write recommended OCR settings to a small .env snippet. Secrets are never written.", ) parser.add_argument( "--write-report", type=Path, help="Write a readable Markdown report with OCR settings, estimates, commands, and next steps.", ) args = parser.parse_args() result = prepare_book_workflow( args.pdf, sample_pages=args.sample_pages, skip_first=args.skip_first, engines=args.engines, engine_preset=args.engine_preset, chunk_size=args.chunk_size, verify_audio=args.verify_audio, voice_id=args.voice_id, audio_out=args.audio_out, audio_max_chars=args.audio_max_chars, benchmark_voices=args.benchmark_voices, voice_ids=args.voices, voice_benchmark_out_dir=args.voice_benchmark_out_dir, voice_benchmark_format=args.voice_benchmark_format, voice_benchmark_max_chars=args.voice_benchmark_max_chars, ) if args.write_env: write_env_snippet(args.write_env, result) result["writtenEnv"] = str(args.write_env) result["commands"] = build_commands( pdf_path=args.pdf, extraction=str(result.get("selected", {}).get("extraction") or ""), voice_id=str(result.get("smokeVoiceId") or args.voice_id), audio_max_chars=args.audio_max_chars, audio_out=args.audio_out, env_file=args.write_env, ) result["nextSteps"] = build_next_steps(result) if args.write_report: write_markdown_report(args.write_report, result) result["writtenReport"] = str(args.write_report) if args.json: print(json.dumps(result, ensure_ascii=False, indent=2)) else: print_summary(result) if not result.get("ready"): raise SystemExit(1) if __name__ == "__main__": main_cli()