# Source: Hugging Face Space "Syncre/arabic-audio-reader-worker" (status: Running)
# File size: 27,990 bytes
# Commit: 2e1a095

from __future__ import annotations

import argparse
import json
import sys
import tempfile
from pathlib import Path
from typing import Any

import fitz

# Directory two levels above this file. It is placed at the front of sys.path
# (only if not already listed) before the ``scripts.*`` and ``app`` imports
# below -- presumably so they resolve when this file is run directly as a script.
ROOT_DIR = Path(__file__).resolve().parent.parent
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from scripts.benchmark_ocr import benchmark_engine
from scripts.benchmark_voices import DEFAULT_VOICES, benchmark_voices as benchmark_voice_set
from scripts.dry_run_pdf import dry_run_pdf
from scripts.select_test_pages import build_test_pdf
from scripts.verify_pipeline import verify_pipeline
from app import main


# Named OCR engine line-ups, keyed by the value accepted for --engine-preset.
# Each list is the set of engine ids benchmarked on the sample when --engines is
# not given. "maximum" is "balanced" plus "qari-ocr" and "paddleocr-vl".
# prepare_book_workflow compares the engine list in use against these lists by
# equality to decide whether to report the preset name or "custom", so both the
# members and their order matter.
ENGINE_PRESETS: dict[str, list[str]] = {
    "practical": ["arabic", "paddleocr", "tesseract"],
    "balanced": [
        "arabic-max",
        "arabic",
        "tawkeed-ocr",
        "katib-ocr",
        "arabic-qwen-ocr",
        "arabic-glm-ocr",
        "baseer-ocr",
        "paddleocr",
        "tesseract",
    ],
    "maximum": [
        "arabic-max",
        "arabic",
        "tawkeed-ocr",
        "katib-ocr",
        "arabic-qwen-ocr",
        "arabic-glm-ocr",
        "baseer-ocr",
        "qari-ocr",
        "paddleocr-vl",
        "paddleocr",
        "tesseract",
    ],
}


def choose_best_result(results: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Return the best successful benchmark entry, or ``None`` if none succeeded.

    Only entries with a truthy ``"ok"`` value are considered. They are ranked by
    highest ``"qualityScore"``, then highest ``"arabicWords"``, then lowest
    ``"seconds"``. When several entries rank equally, the earliest one wins.

    A metric that is missing *or* explicitly ``None`` counts as 0, so a partially
    filled result cannot make the comparison raise ``TypeError``.
    """
    successful = [item for item in results if item.get("ok")]
    if not successful:
        return None

    def rank(item: dict[str, Any]) -> tuple[Any, Any, Any]:
        # ``or 0`` (rather than a .get default) also covers keys present as None.
        return (
            item.get("qualityScore") or 0,
            item.get("arabicWords") or 0,
            -(item.get("seconds") or 0),
        )

    return max(successful, key=rank)


def env_text(env: dict[str, str]) -> str:
    """Render *env* as one line of space-separated ``KEY=value`` pairs.

    Pairs appear in the mapping's iteration order; an empty mapping gives "".
    """
    pairs: list[str] = []
    for name, setting in env.items():
        pairs.append(f"{name}={setting}")
    return " ".join(pairs)


def quote_arg(value: str | Path) -> str:
    """Return *value* as text, wrapped in double quotes when it needs them.

    An empty value becomes ``""`` (two quote characters) and a value containing
    any whitespace is wrapped in double quotes. Everything else is returned
    unchanged. Embedded quote characters are not escaped.
    """
    rendered = str(value)
    if rendered == "":
        return '""'
    contains_whitespace = any(map(str.isspace, rendered))
    return f'"{rendered}"' if contains_whitespace else rendered


def build_commands(
    pdf_path: Path,
    extraction: str,
    voice_id: str,
    audio_max_chars: int,
    audio_out: Path | None = None,
    env_file: Path | None = None,
    external_tts_out_dir: Path | None = None,
    external_ocr_out_dir: Path | None = None,
) -> dict[str, str]:
    """Build the follow-up command lines suggested to the user.

    The OCR settings are passed either as ``--env-file <env_file>`` (when
    *env_file* is given) or as ``--from-extraction <extraction>``. Output
    locations that are not supplied default to paths under ``ROOT_DIR/outputs``.

    Returns a dict of command strings keyed by ``dryRunRecommended``,
    ``audioSmokeRecommended``, ``externalTtsSample``, ``externalOcrSample`` and
    ``fullPipelineRecommended``, in that order.
    """
    quoted_pdf = quote_arg(pdf_path)

    # Choose how the selected OCR configuration is handed to the scripts.
    if env_file:
        source_flag = f"--env-file {quote_arg(env_file)}"
    else:
        source_flag = f"--from-extraction {quote_arg(extraction)}"

    smoke_target = audio_out if audio_out else ROOT_DIR / "outputs" / f"{pdf_path.stem}-sample-smoke.wav"
    tts_target = external_tts_out_dir if external_tts_out_dir else ROOT_DIR / "outputs" / "external-tts-sample"
    ocr_target = external_ocr_out_dir if external_ocr_out_dir else ROOT_DIR / "outputs" / "external-ocr-sample"

    # Both verify_pipeline commands share everything up to the voice id.
    verify_prefix = f"python scripts\\verify_pipeline.py --pdf {quoted_pdf} {source_flag} --voice-id {voice_id}"

    commands: dict[str, str] = {}
    commands["dryRunRecommended"] = f"python scripts\\dry_run_pdf.py {quoted_pdf} {source_flag}"
    commands["audioSmokeRecommended"] = (
        f"{verify_prefix} --max-speech-chars {audio_max_chars} --out {quote_arg(smoke_target)}"
    )
    commands["externalTtsSample"] = (
        f"python scripts\\export_tts_sample.py {quoted_pdf} {source_flag} --out-dir {quote_arg(tts_target)}"
    )
    commands["externalOcrSample"] = (
        f"python scripts\\export_ocr_sample_images.py {quoted_pdf} --out-dir {quote_arg(ocr_target)}"
    )
    commands["fullPipelineRecommended"] = f"{verify_prefix} --out outputs\\full-book-smoke.wav"
    return commands


def seconds_label(seconds: float | None) -> str:
    """Format a duration as a short label in seconds, minutes or hours.

    ``None`` gives ``"unknown"``. Durations under 60 seconds are shown in
    seconds, durations under 60 minutes in minutes, and anything longer in
    hours. The amount is rounded to one decimal place.
    """
    if seconds is None:
        return "unknown"

    minutes = seconds / 60
    if seconds < 60:
        amount, unit = seconds, "seconds"
    elif minutes < 60:
        amount, unit = minutes, "minutes"
    else:
        amount, unit = minutes / 60, "hours"
    return f"{round(amount, 1)} {unit}"


def estimate_full_book(
    total_pages: int,
    sample_page_count: int,
    selected: dict[str, Any],
    dry_run: dict[str, Any],
    audio_smoke: dict[str, Any] | None = None,
) -> dict[str, Any]:
    """Extrapolate sample measurements to a whole-book OCR/TTS estimate.

    Per-page OCR time comes from ``selected["seconds"]`` and per-page speech
    length from ``dry_run["speechCharacters"]``, each divided by the sample page
    count (treated as at least 1). They are scaled by *total_pages*; when
    *total_pages* is 0 the scaled totals are ``None``.

    A TTS estimate is produced only when *audio_smoke* reports a positive
    ``audioSpeechCharacters`` and ``elapsedSeconds`` and a speech-character total
    exists. The total time is OCR plus TTS, with an unknown TTS time counted as 0.

    Returns a dict of raw numbers and their ``seconds_label`` renderings.
    """
    pages_in_sample = max(sample_page_count, 1)
    ocr_per_page = float(selected.get("seconds") or 0) / pages_in_sample
    chars_per_page = float(dry_run.get("speechCharacters") or 0) / pages_in_sample

    if total_pages:
        multiplier = total_pages / pages_in_sample
        ocr_total = round(ocr_per_page * total_pages, 2)
        speech_total = int(round(chars_per_page * total_pages))
    else:
        multiplier = 0
        ocr_total = None
        speech_total = None

    # Derive a seconds-per-character rate from the audio smoke run, if usable.
    per_char = None
    tts_total = None
    if audio_smoke:
        smoke_chars = int(audio_smoke.get("audioSpeechCharacters") or 0)
        smoke_elapsed = float(audio_smoke.get("elapsedSeconds") or 0)
        if speech_total is not None and smoke_chars > 0 and smoke_elapsed > 0:
            per_char = smoke_elapsed / smoke_chars
            tts_total = round(per_char * speech_total, 2)

    if ocr_total is None:
        grand_total = None
    else:
        grand_total = ocr_total + (tts_total or 0)

    report: dict[str, Any] = {}
    report["basis"] = "sample"
    report["totalPages"] = total_pages
    report["samplePages"] = pages_in_sample
    report["pagesMultiplier"] = round(multiplier, 2)
    report["ocrSecondsPerPage"] = round(ocr_per_page, 2)
    report["estimatedOcrSeconds"] = ocr_total
    report["estimatedOcrTime"] = seconds_label(ocr_total)
    report["speechCharactersPerPage"] = round(chars_per_page, 2)
    report["estimatedSpeechCharacters"] = speech_total
    report["ttsSecondsPerCharacter"] = None if per_char is None else round(per_char, 5)
    report["estimatedTtsSeconds"] = tts_total
    report["estimatedTtsTime"] = seconds_label(tts_total)
    report["estimatedTotalSeconds"] = None if grand_total is None else round(grand_total, 2)
    report["estimatedTotalTime"] = seconds_label(grand_total)
    report["note"] = "Estimate is based on selected sample pages; dense or scanned pages can vary a lot."
    return report


def build_next_steps(result: dict[str, Any]) -> list[str]:
    """Turn a workflow result into an ordered list of human-readable advice.

    If the dry run is not marked ``readyForTts`` the only advice is to improve
    the OCR. Otherwise the list covers, in order: OCR quality, the recommended
    OCR settings, the audio smoke test, the voice benchmark, the suggested
    commands, and runtime warnings derived from the full-book estimate.
    """
    dry_run = result.get("dryRun") or {}
    if not dry_run.get("readyForTts"):
        return [
            "OCR text is not ready for TTS. Try more sample pages, another OCR engine, or higher render zoom before creating audio."
        ]

    advice: list[str] = []

    # OCR quality verdict.
    if dry_run.get("quality") == "warning":
        joined_reasons = "; ".join(dry_run.get("qualityReasons") or [])
        suffix = f" Warning reasons: {joined_reasons}." if joined_reasons else ""
        advice.append(f"OCR is usable but should be checked before full-book TTS.{suffix}")
    else:
        advice.append("OCR quality is usable for TTS on the selected sample.")

    # Recommended settings and audio checks.
    env_line = result.get("recommendedEnvText") or ""
    if env_line:
        advice.append(f"Apply these OCR settings for the full book: {env_line}.")
    advice.append(
        "Listen to the audio smoke file before processing the full book."
        if result.get("audioSmoke")
        else "Run again with --verify-audio to check pronunciation before processing the full book."
    )

    voice_info = result.get("voiceBenchmark") or {}
    if voice_info.get("ready"):
        advice.append("Listen to the voice benchmark files and choose the most natural Arabic voice before the full-book run.")
    elif result.get("voiceBenchmarkRequested"):
        advice.append("Voice benchmarking did not produce audio. Run scripts/preflight_check.py and install the missing local voice setup.")

    # Suggested commands, each with its own lead-in text.
    command_table = result.get("commands") or {}
    lead_ins = (
        ("dryRunRecommended", "Recommended dry run command: "),
        ("externalTtsSample", "External voice comparison sample: "),
        ("externalOcrSample", "External OCR model image sample: "),
    )
    for key, lead_in in lead_ins:
        command = command_table.get(key)
        if command:
            advice.append(f"{lead_in}{command}")

    # Runtime warnings.
    estimate = result.get("estimateFullBook") or {}
    total = estimate.get("estimatedTotalSeconds")
    tts = estimate.get("estimatedTtsSeconds")
    if isinstance(total, (int, float)):
        if total >= 3600:
            advice.append("Estimated runtime is long. Use the Docker worker or an always-on computer, and process a small sample first.")
        elif total >= 600:
            advice.append("Estimated runtime is more than a few minutes. Keep the browser open or use the worker path for the full book.")
    slow_silma = (
        isinstance(tts, (int, float))
        and result.get("smokeVoiceId") == "silma-local"
        and tts >= 1800
    )
    if slow_silma:
        advice.append("SILMA sounds better but may be slow for the full book. Use --voice-id espeak-ar-clear for a faster fallback smoke test.")
    return advice


def resolve_smoke_voice(voice_id: str | None = None) -> str:
    """Resolve the voice id to use for the audio smoke test.

    Any explicit id other than ``"auto"`` is returned unchanged. Otherwise the
    first voice whose availability probe succeeds is chosen, trying
    ``silma-local``, ``habibi-msa``, ``supertonic-ar`` and ``espeak-ar-clear``
    in that order. If no probe succeeds, ``"silma-local"`` is returned.
    """
    if voice_id and voice_id != "auto":
        return voice_id

    # Probes are wrapped in lambdas so each runs only if the earlier ones failed.
    probes = (
        (
            "silma-local",
            lambda: main.find_silma_python() is not None
            or main.importlib.util.find_spec("silma_tts") is not None,
        ),
        ("habibi-msa", lambda: main.find_habibi_python() is not None),
        (
            "supertonic-ar",
            lambda: main.find_supertonic_python() is not None
            or main.importlib.util.find_spec("supertonic") is not None,
        ),
        ("espeak-ar-clear", lambda: main.find_espeak_ng() is not None),
    )
    for candidate, is_available in probes:
        if is_available():
            return candidate
    return "silma-local"


def write_env_snippet(path: Path, result: dict[str, Any]) -> None:
    """Write the recommended OCR settings from *result* to *path* as a .env snippet.

    Missing parent directories are created. The file starts with comment lines
    naming the source and sample PDFs, followed by one ``KEY=value`` line per
    setting in sorted key order, or a single comment when there are no settings.
    It is written as UTF-8 and ends with a newline.
    """
    settings = result.get("recommendedEnv") or {}
    path.parent.mkdir(parents=True, exist_ok=True)

    header = [
        "# Arabic PDF Reader OCR settings",
        "# Generated by scripts/prepare_book_workflow.py",
        f"# Source PDF: {result.get('pdf', '')}",
        f"# Sample PDF: {result.get('sample', {}).get('output', '')}",
        "",
    ]
    if settings:
        body = [f"{name}={settings[name]}" for name in sorted(settings)]
    else:
        body = ["# No OCR settings were needed for this sample."]

    content = "\n".join(header + body) + "\n"
    path.write_text(content, encoding="utf-8")


def markdown_value(value: Any) -> str:
    """Render *value* for the Markdown report.

    Booleans become ``"yes"``/``"no"``, ``None`` and the empty string become
    ``"-"``, and anything else is passed through ``str``.
    """
    # bool is tested first; it can never equal None or "", so order is free.
    if isinstance(value, bool):
        return "yes" if value else "no"
    is_blank = value is None or value == ""
    return "-" if is_blank else str(value)


def fenced_block(language: str, text: str) -> list[str]:
    """Return the lines of a Markdown fenced code block tagged with *language*.

    *text* is stripped of surrounding whitespace; if nothing is left, the block
    body is a single ``-``.
    """
    body = text.strip()
    if not body:
        body = "-"
    opening_fence = f"```{language}"
    return [opening_fence, body, "```"]


def _markdown_table_cell(value: Any) -> str:
    """Render *value* as text that is safe inside one Markdown table cell."""
    text = markdown_value(value)
    # A raw line break would end the table row, so flatten it to a space.
    for line_break in ("\r\n", "\n", "\r"):
        text = text.replace(line_break, " ")
    # An unescaped pipe would be read as a column separator.
    return text.replace("|", "\\|")


def _markdown_table_row(cells: list[Any]) -> str:
    """Join *cells* into a single ``| a | b | c |`` Markdown table row."""
    return "| " + " | ".join(_markdown_table_cell(cell) for cell in cells) + " |"


def write_markdown_report(path: Path, result: dict[str, Any]) -> None:
    """Write a Markdown report describing a workflow *result* to *path*.

    The report has these sections, in order: Book, Benchmark Results, Selected
    OCR, Recommended OCR Settings, Dry Run, Audio Smoke, Voice Benchmark (only
    when one was run or requested), Full Book Estimate, Commands and Next Steps.
    Missing values are shown as ``-``. Parent directories of *path* are created
    and the file is written as UTF-8.

    Every table cell is escaped: line breaks become spaces and ``|`` becomes
    ``\\|``. This keeps a multi-line or pipe-containing engine error from
    breaking the table. Voice-benchmark rows use the same rendering as the OCR
    benchmark rows, so an absent value shows as ``-``.
    """
    selected = result.get("selected") or {}
    sample = result.get("sample") or {}
    dry_run = result.get("dryRun") or {}
    audio = result.get("audioSmoke") or {}
    voice_benchmark = result.get("voiceBenchmark") or {}
    estimate = result.get("estimateFullBook") or {}
    commands = result.get("commands") or {}
    next_steps = result.get("nextSteps") or []

    selected_pages = ", ".join(str(page) for page in sample.get("pages") or []) or "-"
    command_text = "\n".join(command for command in commands.values() if command)

    # OCR benchmark table: one row per engine tried on the sample.
    benchmark_lines = [
        "| Engine | Quality | Score | Arabic words | Fragment ratio | Extraction | Notes |",
        "| --- | --- | --- | --- | --- | --- | --- |",
    ]
    for item in result.get("benchmark") or []:
        if item.get("ok"):
            quality = item.get("quality")
            notes = "; ".join(item.get("qualityReasons") or [])
        else:
            # Failed engines show their error message instead of quality notes.
            quality = "failed"
            notes = item.get("error") or "failed"
        benchmark_lines.append(
            _markdown_table_row(
                [
                    item.get("engine"),
                    quality,
                    item.get("qualityScore"),
                    item.get("arabicWords"),
                    item.get("fragmentLineRatio"),
                    item.get("extraction"),
                    notes,
                ]
            )
        )

    lines = [
        "# Arabic Audio Preparation Report",
        "",
        "## Book",
        "",
        f"- PDF: {markdown_value(result.get('pdf'))}",
        f"- Total pages: {markdown_value(result.get('totalPages'))}",
        f"- Sample PDF: {markdown_value(sample.get('output'))}",
        f"- Sample pages: {selected_pages}",
        f"- Engine preset: {markdown_value(result.get('enginePreset'))}",
        "",
        "## Benchmark Results",
        "",
        *benchmark_lines,
        "",
        "## Selected OCR",
        "",
        f"- Extraction: {markdown_value(selected.get('extraction'))}",
        f"- Engine: {markdown_value(selected.get('engine'))}",
        f"- Quality score: {markdown_value(selected.get('qualityScore'))}",
        f"- Arabic words: {markdown_value(selected.get('arabicWords'))}",
        f"- Sample OCR time: {markdown_value(selected.get('seconds'))} seconds",
        "",
        "## Recommended OCR Settings",
        "",
        *fenced_block("text", result.get("recommendedEnvText") or "No OCR settings were needed for this sample."),
        "",
        "## Dry Run",
        "",
        f"- Quality: {markdown_value(dry_run.get('quality'))}",
        f"- Quality reasons: {markdown_value('; '.join(dry_run.get('qualityReasons') or []))}",
        f"- Ready for TTS: {markdown_value(dry_run.get('readyForTts'))}",
        f"- Speech characters: {markdown_value(dry_run.get('speechCharacters'))}",
        f"- One-letter Arabic word ratio: {markdown_value(dry_run.get('singleArabicWordRatio'))}",
        f"- Low-information line ratio: {markdown_value(dry_run.get('fragmentLineRatio'))}",
        f"- Chunks: {markdown_value(dry_run.get('chunks'))}",
        f"- Extraction: {markdown_value(dry_run.get('extraction'))}",
        "",
    ]

    if audio:
        lines.extend(
            [
                "## Audio Smoke",
                "",
                f"- Voice: {markdown_value(result.get('smokeVoiceId') or audio.get('voiceId'))}",
                f"- Engine: {markdown_value(audio.get('engine'))}",
                f"- Speech characters synthesized: {markdown_value(audio.get('audioSpeechCharacters'))}",
                f"- Audio seconds: {markdown_value(audio.get('seconds'))}",
                f"- Output: {markdown_value(audio.get('path'))}",
                "",
            ]
        )
    else:
        lines.extend(["## Audio Smoke", "", "- Not run. Use `--verify-audio` to create a short pronunciation sample.", ""])

    if voice_benchmark:
        lines.extend(
            [
                "## Voice Benchmark",
                "",
                f"- Output directory: {markdown_value(voice_benchmark.get('outputDir'))}",
                f"- Text characters: {markdown_value(voice_benchmark.get('textCharacters'))}",
                f"- Audio format: {markdown_value(voice_benchmark.get('audioFormat'))}",
                f"- Recommended starting voice: {markdown_value((voice_benchmark.get('recommended') or {}).get('voiceId'))}",
                f"- Fastest successful voice: {markdown_value((voice_benchmark.get('fastest') or {}).get('voiceId'))}",
                "",
                "| Voice | Label | Engine | Status | Time | Audio | Notes |",
                "| --- | --- | --- | --- | ---: | --- | --- |",
            ]
        )
        for item in voice_benchmark.get("results") or []:
            lines.append(
                _markdown_table_row(
                    [
                        item.get("voiceId"),
                        item.get("label"),
                        item.get("engine"),
                        "ok" if item.get("ok") else "failed",
                        item.get("elapsedSeconds"),
                        item.get("path"),
                        item.get("error"),
                    ]
                )
            )
        lines.append("")
    elif result.get("voiceBenchmarkRequested"):
        lines.extend(["## Voice Benchmark", "", "- Not run because no usable cleaned OCR sample was available.", ""])

    lines.extend(
        [
            "## Full Book Estimate",
            "",
            f"- Estimated OCR time: {markdown_value(estimate.get('estimatedOcrTime'))}",
            f"- Estimated TTS time: {markdown_value(estimate.get('estimatedTtsTime'))}",
            f"- Estimated total time: {markdown_value(estimate.get('estimatedTotalTime'))}",
            f"- Estimated speech characters: {markdown_value(estimate.get('estimatedSpeechCharacters'))}",
            f"- Basis: {markdown_value(estimate.get('basis'))}",
            "",
            "> Estimates are based on the selected sample pages. Dense scanned pages, marginal scans, and different fonts can change runtime and quality.",
            "",
            "## Commands",
            "",
            *fenced_block("powershell", command_text),
            "",
            "## Next Steps",
            "",
        ]
    )
    if next_steps:
        lines.extend(f"- {step}" for step in next_steps)
    else:
        lines.append("- No next steps were generated.")
    lines.append("")

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(lines), encoding="utf-8")


def prepare_book_workflow(
    pdf_path: Path,
    sample_pages: int = 1,
    skip_first: int = 0,
    engines: list[str] | None = None,
    engine_preset: str = "balanced",
    chunk_size: int = 900,
    verify_audio: bool = False,
    voice_id: str | None = "auto",
    audio_out: Path | None = None,
    audio_max_chars: int = 1200,
    benchmark_voices: bool = False,
    voice_ids: list[str] | None = None,
    voice_benchmark_out_dir: Path | None = None,
    voice_benchmark_format: str = "wav",
    voice_benchmark_max_chars: int | None = None,
) -> dict[str, Any]:
    """Run the sample-based preparation workflow for one PDF and return a result dict.

    Steps, in order:

    1. Validate the input path and engine preset, and read the page count.
    2. Build a sample PDF in the system temp directory with ``build_test_pdf``.
    3. Benchmark every engine on the sample and pick one with
       ``choose_best_result``.
    4. Dry-run the sample using the chosen extraction.
    5. Optionally synthesize an audio smoke file (``verify_audio``).
    6. Optionally benchmark voices (``benchmark_voices``), only when the dry run
       reports ``readyForTts`` and provides sample text.
    7. Estimate full-book cost and build the suggested commands and next steps.

    Args:
        pdf_path: PDF to prepare; must exist and have a ``.pdf`` suffix.
        sample_pages: Page count passed to ``build_test_pdf`` as ``count``.
        skip_first: Passed through to ``build_test_pdf``.
        engines: Engine ids to benchmark; defaults to the chosen preset's list.
        engine_preset: Key into ``ENGINE_PRESETS``. Validated even when
            ``engines`` is given.
        chunk_size: Passed through to ``dry_run_pdf``.
        verify_audio: When true, run ``verify_pipeline`` on the sample.
        voice_id: Voice for the smoke test; resolved by ``resolve_smoke_voice``.
        audio_out: Smoke-test output path; defaults to a file under
            ``ROOT_DIR/outputs``.
        audio_max_chars: Character limit for the smoke test. Also used as the
            dry run's speech sample size when ``voice_benchmark_max_chars`` is
            ``None``.
        benchmark_voices: When true, compare voices on the dry run's sample text.
        voice_ids: Voices to compare; defaults to ``DEFAULT_VOICES``.
        voice_benchmark_out_dir: Output directory for voice benchmark audio.
        voice_benchmark_format: Audio format passed to the voice benchmark.
        voice_benchmark_max_chars: Overrides the dry run's speech sample size.

    Returns:
        The full result dict. If no engine succeeded, a reduced dict is returned
        instead, holding only ``pdf``, ``sample``, ``benchmark``,
        ``ready=False`` and ``error``.

    Raises:
        FileNotFoundError: If ``pdf_path`` does not exist.
        ValueError: If ``pdf_path`` is not a ``.pdf`` file or ``engine_preset``
            is unknown.
    """
    if not pdf_path.exists():
        raise FileNotFoundError(f"PDF not found: {pdf_path}")
    if pdf_path.suffix.lower() != ".pdf":
        raise ValueError("Input must be a PDF file.")

    # Only the page count is needed; the document is closed straight away.
    with fitz.open(pdf_path) as document:
        total_pages = document.page_count
    if engine_preset not in ENGINE_PRESETS:
        raise ValueError(f"Unknown engine preset: {engine_preset}")
    engines = engines or ENGINE_PRESETS[engine_preset]
    # The sample PDF lives in the system temp directory under a name derived from
    # the source stem and the requested page count. This function does not delete it.
    sample_pdf = Path(tempfile.gettempdir()) / f"{pdf_path.stem}-arabic-audio-sample-{sample_pages}.pdf"
    sample_info = build_test_pdf(pdf_path, sample_pdf, count=sample_pages, skip_first=skip_first)

    # Engines are benchmarked one after another, in the listed order.
    benchmark_results = [benchmark_engine(sample_pdf, engine) for engine in engines]
    best = choose_best_result(benchmark_results)
    if best is None:
        # Every engine failed: return a reduced result without the later sections.
        return {
            "pdf": str(pdf_path),
            "sample": sample_info,
            "benchmark": benchmark_results,
            "ready": False,
            "error": "No OCR engine produced usable Arabic text on the sample.",
        }

    extraction = str(best.get("extraction") or "")
    recommendation = best.get("recommendation")
    # The dry run's speech sample is sized for the voice benchmark when that limit
    # is given, and for the audio smoke test otherwise.
    speech_sample_chars = voice_benchmark_max_chars if voice_benchmark_max_chars is not None else audio_max_chars
    dry_run = dry_run_pdf(
        sample_pdf,
        chunk_size=chunk_size,
        from_extraction=extraction,
        speech_sample_chars=speech_sample_chars,
    )
    audio_result = None
    voice_benchmark_result = None
    # Resolved even when verify_audio is off, because the suggested commands
    # built below embed the voice id.
    resolved_voice_id = resolve_smoke_voice(voice_id)
    if verify_audio:
        output = audio_out or (ROOT_DIR / "outputs" / f"{pdf_path.stem}-sample-smoke.wav")
        audio_result = verify_pipeline(
            sample_pdf,
            resolved_voice_id,
            output,
            from_extraction=extraction,
            max_speech_chars=audio_max_chars,
        )
    # The voice benchmark runs only on OCR text the dry run marked ready for TTS.
    if benchmark_voices and dry_run.get("readyForTts"):
        sample_text = str(dry_run.get("speechSampleText") or dry_run.get("speechPreview") or "").strip()
        if sample_text:
            output_dir = voice_benchmark_out_dir or (ROOT_DIR / "outputs" / f"{pdf_path.stem}-voice-benchmark")
            selected_voices = voice_ids or DEFAULT_VOICES
            voice_benchmark_result = benchmark_voice_set(
                voices=selected_voices,
                text=sample_text,
                output_dir=output_dir,
                audio_format=voice_benchmark_format,
            )
    estimate = estimate_full_book(
        total_pages=total_pages,
        # Use the number of pages actually sampled, falling back to the request
        # when the sample info lists none.
        sample_page_count=len(sample_info.get("pages", [])) or sample_pages,
        selected=best,
        dry_run=dry_run,
        audio_smoke=audio_result,
    )
    commands = build_commands(
        pdf_path=pdf_path,
        extraction=extraction,
        voice_id=resolved_voice_id,
        audio_max_chars=audio_max_chars,
        audio_out=audio_out,
    )

    result = {
        "pdf": str(pdf_path),
        "totalPages": total_pages,
        "sample": sample_info,
        "benchmark": benchmark_results,
        # Report the preset name only when its exact engine list was used.
        "enginePreset": engine_preset if engines == ENGINE_PRESETS[engine_preset] else "custom",
        "selected": best,
        "recommendation": recommendation,
        "recommendedEnv": recommendation.get("env", {}) if recommendation else {},
        "recommendedEnvText": env_text(recommendation.get("env", {})) if recommendation else "",
        "dryRun": dry_run,
        "audioSmoke": audio_result,
        "smokeVoiceId": resolved_voice_id,
        "voiceBenchmark": voice_benchmark_result,
        "voiceBenchmarkRequested": benchmark_voices,
        "estimateFullBook": estimate,
        "commands": commands,
        # Ready means the dry run passed and, if a smoke test was requested,
        # that it returned a result.
        "ready": bool(dry_run.get("readyForTts") and (audio_result is not None if verify_audio else True)),
    }
    # Computed last because build_next_steps reads the fields assembled above.
    result["nextSteps"] = build_next_steps(result)
    return result


def print_summary(result: dict[str, Any]) -> None:
    """Print a compact, human-readable summary of a workflow *result* to stdout.

    Always prints the header, sample, selected OCR, dry-run and ready lines.
    The settings, audio smoke, voice benchmark, estimate and next-steps lines
    are printed only when the matching part of *result* is present.
    """
    chosen = result.get("selected") or {}
    advice = result.get("recommendation") or {}
    dry = result.get("dryRun") or {}

    print("Arabic book preparation")
    sample_output = result.get("sample", {}).get("output", "-")
    print(f"Sample: {sample_output}")
    extraction = chosen.get("extraction", "-")
    score = chosen.get("qualityScore", "-")
    print(f"Selected OCR: {extraction} score={score}")
    if advice:
        print(f"Full-book settings: {advice.get('summary')}")

    quality = dry.get("quality", "-")
    tts_ready = dry.get("readyForTts", False)
    speech_chars = dry.get("speechCharacters", 0)
    print(f"Dry run: quality={quality} readyForTts={tts_ready} speechChars={speech_chars}")

    smoke = result.get("audioSmoke")
    if smoke:
        print(f"Audio smoke: {smoke.get('path')} {smoke.get('seconds')}s {smoke.get('bytes')} bytes")

    voices = result.get("voiceBenchmark") or {}
    if voices:
        ok_count = len([entry for entry in voices.get("results", []) if entry.get("ok")])
        attempted = len(voices.get("results", []))
        print(f"Voice benchmark: {ok_count}/{attempted} voices wrote to {voices.get('outputDir')}")

    forecast = result.get("estimateFullBook") or {}
    if forecast:
        ocr_time = forecast.get("estimatedOcrTime")
        tts_time = forecast.get("estimatedTtsTime")
        total_time = forecast.get("estimatedTotalTime")
        print(f"Estimate: OCR {ocr_time} TTS {tts_time} total {total_time}")

    todo = result.get("nextSteps") or []
    if todo:
        print("Next steps:")
        for entry in todo:
            print(f"- {entry}")

    verdict = "yes" if result.get("ready") else "no"
    print(f"Ready: {verdict}")


def _build_cli_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the preparation workflow command line."""
    parser = argparse.ArgumentParser(description="Benchmark, dry-run, and optionally audio-smoke an Arabic PDF sample.")

    # Input and sampling.
    parser.add_argument("pdf", type=Path, help="Arabic PDF to prepare")
    parser.add_argument("--sample-pages", type=int, default=1, help="Number of informative pages to sample.")
    parser.add_argument("--skip-first", type=int, default=0, help="Ignore the first N pages when selecting sample pages.")

    # OCR engine selection.
    parser.add_argument(
        "--engine-preset",
        choices=sorted(ENGINE_PRESETS),
        default="balanced",
        help="OCR engine preset to use when --engines is not provided. balanced is the recommended free Arabic-trained stack.",
    )
    engine_choices = [
        "arabic",
        "arabic-max",
        "qari-ocr",
        "tawkeed-ocr",
        "katib-ocr",
        "arabic-qwen-ocr",
        "arabic-glm-ocr",
        "baseer-ocr",
        "easyocr",
        "paddleocr",
        "paddleocr-vl",
        "surya",
        "tesseract",
        "auto",
        "best",
    ]
    parser.add_argument(
        "--engines",
        nargs="+",
        default=None,
        choices=engine_choices,
        help="OCR engines to benchmark on the sample.",
    )
    parser.add_argument("--chunk-size", type=int, default=900, help="Dry-run chunk size.")

    # Audio smoke test.
    parser.add_argument("--verify-audio", action="store_true", help="Also create a short audio smoke test from the sample.")
    parser.add_argument("--voice-id", default="auto", help="Local voice id for --verify-audio. Use auto to prefer SILMA when installed.")
    parser.add_argument("--audio-out", type=Path, help="Audio output path for --verify-audio.")
    parser.add_argument(
        "--audio-max-chars",
        type=int,
        default=1200,
        help="Maximum cleaned characters to synthesize for --verify-audio.",
    )

    # Voice benchmark.
    parser.add_argument(
        "--benchmark-voices",
        action="store_true",
        help="Compare local voices using the cleaned OCR sample text.",
    )
    parser.add_argument(
        "--voices",
        nargs="+",
        choices=list(main.LOCAL_VOICES),
        help="Voice ids to compare with --benchmark-voices.",
    )
    parser.add_argument(
        "--voice-benchmark-out-dir",
        type=Path,
        help="Output directory for --benchmark-voices audio files.",
    )
    parser.add_argument(
        "--voice-benchmark-format",
        choices=["wav", "mp3"],
        default="wav",
        help="Audio format for --benchmark-voices.",
    )
    parser.add_argument(
        "--voice-benchmark-max-chars",
        type=int,
        help="Maximum cleaned OCR characters to use for --benchmark-voices. Defaults to --audio-max-chars.",
    )

    # Output options.
    parser.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.")
    parser.add_argument(
        "--write-env",
        type=Path,
        help="Write recommended OCR settings to a small .env snippet. Secrets are never written.",
    )
    parser.add_argument(
        "--write-report",
        type=Path,
        help="Write a readable Markdown report with OCR settings, estimates, commands, and next steps.",
    )
    return parser


def main_cli() -> None:
    """Command-line entry point: run the workflow and report the outcome.

    Parses the arguments, runs ``prepare_book_workflow``, optionally writes the
    .env snippet and the Markdown report, then prints either JSON or the compact
    summary. Exits with status 1 when the result is not marked ready.
    """
    # Force UTF-8 on stdout where the stream supports it; unencodable
    # characters are replaced rather than raising.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

    options = _build_cli_parser().parse_args()

    outcome = prepare_book_workflow(
        options.pdf,
        sample_pages=options.sample_pages,
        skip_first=options.skip_first,
        engines=options.engines,
        engine_preset=options.engine_preset,
        chunk_size=options.chunk_size,
        verify_audio=options.verify_audio,
        voice_id=options.voice_id,
        audio_out=options.audio_out,
        audio_max_chars=options.audio_max_chars,
        benchmark_voices=options.benchmark_voices,
        voice_ids=options.voices,
        voice_benchmark_out_dir=options.voice_benchmark_out_dir,
        voice_benchmark_format=options.voice_benchmark_format,
        voice_benchmark_max_chars=options.voice_benchmark_max_chars,
    )

    env_target = options.write_env
    if env_target:
        write_env_snippet(env_target, outcome)
        outcome["writtenEnv"] = str(env_target)
        # Rebuild the commands so they point at the written env file, then
        # refresh the next steps that quote those commands.
        outcome["commands"] = build_commands(
            pdf_path=options.pdf,
            extraction=str(outcome.get("selected", {}).get("extraction") or ""),
            voice_id=str(outcome.get("smokeVoiceId") or options.voice_id),
            audio_max_chars=options.audio_max_chars,
            audio_out=options.audio_out,
            env_file=env_target,
        )
        outcome["nextSteps"] = build_next_steps(outcome)

    report_target = options.write_report
    if report_target:
        write_markdown_report(report_target, outcome)
        outcome["writtenReport"] = str(report_target)

    if options.json:
        print(json.dumps(outcome, ensure_ascii=False, indent=2))
    else:
        print_summary(outcome)

    if not outcome.get("ready"):
        raise SystemExit(1)


# Run the command-line interface only when this file is executed as a script.
if __name__ == "__main__":
    main_cli()