Spaces:

Syncre
/

arabic-audio-reader-worker

Running

App Files Files Community

arabic-audio-reader-worker / scripts /prepare_book_workflow.py

Syncre

Deploy Arabic Audio Reader worker

2e1a095 verified 1 day ago

raw

history blame contribute delete

28 kB

	from __future__ import annotations

	import argparse
	import json
	import sys
	import tempfile
	from pathlib import Path
	from typing import Any

	import fitz

	# ROOT_DIR is the directory two levels above this file. It is put at the front of
	# sys.path (once) before the ``scripts.*`` and ``app`` imports that follow, and it is
	# also the base for the default ``outputs`` locations used further down.
	ROOT_DIR = Path(__file__).resolve().parent.parent
	if str(ROOT_DIR) not in sys.path:
	sys.path.insert(0, str(ROOT_DIR))

	from scripts.benchmark_ocr import benchmark_engine
	from scripts.benchmark_voices import DEFAULT_VOICES, benchmark_voices as benchmark_voice_set
	from scripts.dry_run_pdf import dry_run_pdf
	from scripts.select_test_pages import build_test_pdf
	from scripts.verify_pipeline import verify_pipeline
	from app import main


	# Engines that open both the "balanced" and the "maximum" preset, in benchmark order.
	_SHARED_PRESET_ENGINES: tuple[str, ...] = (
	    "arabic-max",
	    "arabic",
	    "tawkeed-ocr",
	    "katib-ocr",
	    "arabic-qwen-ocr",
	    "arabic-glm-ocr",
	    "baseer-ocr",
	)

	# Named OCR engine line-ups selectable through --engine-preset. Each value is the
	# ordered list of engines benchmarked on the sample when --engines is not given.
	ENGINE_PRESETS: dict[str, list[str]] = {
	    "practical": ["arabic", "paddleocr", "tesseract"],
	    "balanced": [*_SHARED_PRESET_ENGINES, "paddleocr", "tesseract"],
	    "maximum": [
	        *_SHARED_PRESET_ENGINES,
	        "qari-ocr",
	        "paddleocr-vl",
	        "paddleocr",
	        "tesseract",
	    ],
	}


	def choose_best_result(results: list[dict[str, Any]]) -> dict[str, Any] | None:
	    """Pick the best successful OCR benchmark entry.

	    Entries whose ``"ok"`` value is falsy are ignored. The remaining entries are
	    ranked by highest ``qualityScore``, then highest ``arabicWords``, then lowest
	    ``seconds``. On a full tie the earliest entry wins.

	    Args:
	        results: Benchmark result dictionaries, one per engine.

	    Returns:
	        The winning entry (the same dict object), or ``None`` when no entry
	        succeeded.
	    """
	    successful = [item for item in results if item.get("ok")]
	    if not successful:
	        return None

	    def rank(item: dict[str, Any]) -> tuple[Any, Any, Any]:
	        # ``or 0`` also covers keys that are present but hold None; a plain
	        # ``.get(key, 0)`` would return None there and make the comparison raise.
	        return (
	            item.get("qualityScore") or 0,
	            item.get("arabicWords") or 0,
	            -(item.get("seconds") or 0),
	        )

	    return max(successful, key=rank)


	def env_text(env: dict[str, str]) -> str:
	    """Render settings as a single line of space-separated ``KEY=value`` pairs.

	    Pairs appear in the mapping's iteration order; an empty mapping gives ``""``.
	    """
	    pairs: list[str] = []
	    for name, setting in env.items():
	        pairs.append(f"{name}={setting}")
	    return " ".join(pairs)


	def quote_arg(value: str | Path) -> str:
	    """Format *value* for display inside a suggested command line.

	    An empty value becomes ``""``. A value containing any whitespace is wrapped
	    in double quotes. Anything else is returned unchanged. Embedded quote
	    characters are not escaped.
	    """
	    rendered = str(value)
	    if rendered == "":
	        return '""'
	    has_whitespace = any(map(str.isspace, rendered))
	    return f'"{rendered}"' if has_whitespace else rendered


	def build_commands(
	    pdf_path: Path,
	    extraction: str,
	    voice_id: str,
	    audio_max_chars: int,
	    audio_out: Path | None = None,
	    env_file: Path | None = None,
	    external_tts_out_dir: Path | None = None,
	    external_ocr_out_dir: Path | None = None,
	) -> dict[str, str]:
	    """Build the follow-up command lines suggested to the user.

	    The commands are plain strings for display; nothing is executed here.

	    Args:
	        pdf_path: PDF the commands should operate on.
	        extraction: Extraction name passed as ``--from-extraction`` when no
	            ``env_file`` is given.
	        voice_id: Voice passed as ``--voice-id`` to the verify commands.
	        audio_max_chars: Value for ``--max-speech-chars`` in the smoke command.
	        audio_out: Smoke-test output file. Defaults to
	            ``<ROOT_DIR>/outputs/<pdf stem>-sample-smoke.wav``.
	        env_file: When given, commands use ``--env-file`` instead of
	            ``--from-extraction``.
	        external_tts_out_dir: Output directory for the TTS sample export.
	            Defaults to ``<ROOT_DIR>/outputs/external-tts-sample``.
	        external_ocr_out_dir: Output directory for the OCR image export.
	            Defaults to ``<ROOT_DIR>/outputs/external-ocr-sample``.

	    Returns:
	        A mapping with the keys ``dryRunRecommended``, ``audioSmokeRecommended``,
	        ``externalTtsSample``, ``externalOcrSample`` and
	        ``fullPipelineRecommended``.
	    """
	    outputs_dir = ROOT_DIR / "outputs"
	    pdf = quote_arg(pdf_path)
	    if env_file:
	        settings_arg = f"--env-file {quote_arg(env_file)}"
	    else:
	        settings_arg = f"--from-extraction {quote_arg(extraction)}"
	    # Quote the voice id like every other interpolated argument so an empty or
	    # whitespace-containing id cannot produce a malformed command line.
	    voice = quote_arg(voice_id)
	    smoke_out = quote_arg(audio_out or (outputs_dir / f"{pdf_path.stem}-sample-smoke.wav"))
	    tts_sample_out = quote_arg(external_tts_out_dir or (outputs_dir / "external-tts-sample"))
	    ocr_sample_out = quote_arg(external_ocr_out_dir or (outputs_dir / "external-ocr-sample"))

	    # Both verify_pipeline commands share everything up to the voice argument.
	    verify_prefix = f"python scripts\\verify_pipeline.py --pdf {pdf} {settings_arg} --voice-id {voice}"
	    return {
	        "dryRunRecommended": f"python scripts\\dry_run_pdf.py {pdf} {settings_arg}",
	        "audioSmokeRecommended": f"{verify_prefix} --max-speech-chars {audio_max_chars} --out {smoke_out}",
	        "externalTtsSample": f"python scripts\\export_tts_sample.py {pdf} {settings_arg} --out-dir {tts_sample_out}",
	        "externalOcrSample": f"python scripts\\export_ocr_sample_images.py {pdf} --out-dir {ocr_sample_out}",
	        "fullPipelineRecommended": f"{verify_prefix} --out outputs\\full-book-smoke.wav",
	    }


	def seconds_label(seconds: float | None) -> str:
	    """Format a duration in seconds as a short human-readable label.

	    Args:
	        seconds: Duration in seconds, or ``None`` when it is not known.

	    Returns:
	        ``"unknown"`` for ``None``. Otherwise the value rounded to one decimal,
	        expressed in seconds, minutes or hours.
	    """
	    if seconds is None:
	        return "unknown"
	    # Choose the unit from the rounded value. Testing the raw value lets numbers
	    # just under a boundary print as "60.0 seconds" or "60.0 minutes".
	    rounded_seconds = round(seconds, 1)
	    if rounded_seconds < 60:
	        return f"{rounded_seconds} seconds"
	    minutes = seconds / 60
	    rounded_minutes = round(minutes, 1)
	    if rounded_minutes < 60:
	        return f"{rounded_minutes} minutes"
	    return f"{round(minutes / 60, 1)} hours"


	def estimate_full_book(
	    total_pages: int,
	    sample_page_count: int,
	    selected: dict[str, Any],
	    dry_run: dict[str, Any],
	    audio_smoke: dict[str, Any] | None = None,
	) -> dict[str, Any]:
	    """Extrapolate OCR and TTS effort for the whole book from the sample.

	    Per-page OCR time and speech characters are taken from the sample and
	    scaled linearly to ``total_pages``. A TTS estimate is produced only when
	    ``audio_smoke`` reports both a positive character count and a positive
	    elapsed time.

	    Args:
	        total_pages: Page count of the full PDF; ``0`` leaves the totals ``None``.
	        sample_page_count: Pages in the sample; values below 1 are treated as 1.
	        selected: Chosen benchmark entry; its ``seconds`` value is read.
	        dry_run: Dry-run result; its ``speechCharacters`` value is read.
	        audio_smoke: Optional audio smoke result providing
	            ``audioSpeechCharacters`` and ``elapsedSeconds``.

	    Returns:
	        A dictionary of raw numbers plus human-readable labels for each estimate.
	    """
	    pages_in_sample = max(sample_page_count, 1)
	    per_page_ocr = float(selected.get("seconds") or 0) / pages_in_sample
	    per_page_chars = float(dry_run.get("speechCharacters") or 0) / pages_in_sample

	    if total_pages:
	        multiplier = total_pages / pages_in_sample
	        ocr_total = round(per_page_ocr * total_pages, 2)
	        chars_total = int(round(per_page_chars * total_pages))
	    else:
	        # Without a page count nothing can be scaled up.
	        multiplier = 0
	        ocr_total = None
	        chars_total = None

	    per_char_tts = None
	    tts_total = None
	    if audio_smoke:
	        synthesized = int(audio_smoke.get("audioSpeechCharacters") or 0)
	        elapsed = float(audio_smoke.get("elapsedSeconds") or 0)
	        if synthesized > 0 and elapsed > 0 and chars_total is not None:
	            per_char_tts = elapsed / synthesized
	            tts_total = round(per_char_tts * chars_total, 2)

	    # The total counts a missing TTS estimate as zero, i.e. OCR time only.
	    grand_total = None if ocr_total is None else ocr_total + (tts_total or 0)

	    return {
	        "basis": "sample",
	        "totalPages": total_pages,
	        "samplePages": pages_in_sample,
	        "pagesMultiplier": round(multiplier, 2),
	        "ocrSecondsPerPage": round(per_page_ocr, 2),
	        "estimatedOcrSeconds": ocr_total,
	        "estimatedOcrTime": seconds_label(ocr_total),
	        "speechCharactersPerPage": round(per_page_chars, 2),
	        "estimatedSpeechCharacters": chars_total,
	        "ttsSecondsPerCharacter": None if per_char_tts is None else round(per_char_tts, 5),
	        "estimatedTtsSeconds": tts_total,
	        "estimatedTtsTime": seconds_label(tts_total),
	        "estimatedTotalSeconds": None if grand_total is None else round(grand_total, 2),
	        "estimatedTotalTime": seconds_label(grand_total),
	        "note": "Estimate is based on selected sample pages; dense or scanned pages can vary a lot.",
	    }


	def build_next_steps(result: dict[str, Any]) -> list[str]:
	    """Turn a workflow result into an ordered list of advice for the user.

	    When the dry run is not ready for TTS, a single message about OCR quality
	    is returned. Otherwise the list covers, in order: OCR quality, recommended
	    settings, the audio smoke test, the voice benchmark, follow-up commands and
	    runtime warnings.

	    Args:
	        result: Workflow result dictionary. Missing or empty sections are
	            treated as absent.

	    Returns:
	        The advice strings, in display order.
	    """
	    dry_run = result.get("dryRun") or {}
	    if not dry_run.get("readyForTts"):
	        return [
	            "OCR text is not ready for TTS. Try more sample pages, another OCR engine, or higher render zoom before creating audio."
	        ]

	    steps: list[str] = []

	    # OCR quality verdict.
	    if dry_run.get("quality") != "warning":
	        steps.append("OCR quality is usable for TTS on the selected sample.")
	    else:
	        reasons = "; ".join(dry_run.get("qualityReasons") or [])
	        detail = f" Warning reasons: {reasons}." if reasons else ""
	        steps.append(f"OCR is usable but should be checked before full-book TTS.{detail}")

	    env = result.get("recommendedEnvText") or ""
	    if env:
	        steps.append(f"Apply these OCR settings for the full book: {env}.")

	    # Audio smoke test: either listen to it or suggest running it.
	    steps.append(
	        "Listen to the audio smoke file before processing the full book."
	        if result.get("audioSmoke")
	        else "Run again with --verify-audio to check pronunciation before processing the full book."
	    )

	    voice_benchmark = result.get("voiceBenchmark") or {}
	    if voice_benchmark.get("ready"):
	        steps.append("Listen to the voice benchmark files and choose the most natural Arabic voice before the full-book run.")
	    elif result.get("voiceBenchmarkRequested"):
	        steps.append("Voice benchmarking did not produce audio. Run scripts/preflight_check.py and install the missing local voice setup.")

	    # Only these three commands are repeated in the advice list.
	    commands = result.get("commands") or {}
	    labelled_commands = (
	        ("dryRunRecommended", "Recommended dry run command"),
	        ("externalTtsSample", "External voice comparison sample"),
	        ("externalOcrSample", "External OCR model image sample"),
	    )
	    for key, label in labelled_commands:
	        command = commands.get(key)
	        if command:
	            steps.append(f"{label}: {command}")

	    # Runtime warnings based on the full-book estimate.
	    estimate = result.get("estimateFullBook") or {}
	    total_seconds = estimate.get("estimatedTotalSeconds")
	    tts_seconds = estimate.get("estimatedTtsSeconds")
	    if isinstance(total_seconds, (int, float)):
	        if total_seconds >= 3600:
	            steps.append("Estimated runtime is long. Use the Docker worker or an always-on computer, and process a small sample first.")
	        elif total_seconds >= 600:
	            steps.append("Estimated runtime is more than a few minutes. Keep the browser open or use the worker path for the full book.")
	    slow_silma = (
	        isinstance(tts_seconds, (int, float))
	        and result.get("smokeVoiceId") == "silma-local"
	        and tts_seconds >= 1800
	    )
	    if slow_silma:
	        steps.append("SILMA sounds better but may be slow for the full book. Use --voice-id espeak-ar-clear for a faster fallback smoke test.")
	    return steps


	def resolve_smoke_voice(voice_id: str | None = None) -> str:
	    """Choose the voice id used for the audio smoke test.

	    An explicit id other than ``"auto"`` is returned unchanged. Otherwise the
	    first available voice wins, checked in this order: ``silma-local``,
	    ``habibi-msa``, ``supertonic-ar``, ``espeak-ar-clear``. Availability is
	    decided by the ``find_*`` helpers of ``app.main`` and, for SILMA and
	    Supertonic, by whether their Python module can be found.

	    Args:
	        voice_id: Requested voice id, ``"auto"``, or ``None``.

	    Returns:
	        The resolved voice id. ``"silma-local"`` is also the fallback when
	        nothing is detected.
	    """
	    if voice_id and voice_id != "auto":
	        return voice_id

	    # Import the stdlib module here instead of reaching through ``main.importlib``,
	    # which only works while app.main happens to import ``importlib.util`` itself.
	    import importlib.util

	    def module_available(name: str) -> bool:
	        return importlib.util.find_spec(name) is not None

	    if main.find_silma_python() is not None or module_available("silma_tts"):
	        return "silma-local"
	    if main.find_habibi_python() is not None:
	        return "habibi-msa"
	    if main.find_supertonic_python() is not None or module_available("supertonic"):
	        return "supertonic-ar"
	    if main.find_espeak_ng() is not None:
	        return "espeak-ar-clear"
	    return "silma-local"


	def write_env_snippet(path: Path, result: dict[str, Any]) -> None:
	    """Write the recommended OCR settings to a small ``.env``-style file.

	    The file starts with comment lines naming the source and sample PDFs,
	    followed by one ``KEY=value`` line per setting sorted by key, or a single
	    comment when there are no settings. Parent directories are created.

	    Args:
	        path: Destination file.
	        result: Workflow result; ``recommendedEnv``, ``pdf`` and
	            ``sample["output"]`` are read.
	    """
	    env = result.get("recommendedEnv") or {}
	    path.parent.mkdir(parents=True, exist_ok=True)
	    header = [
	        "# Arabic PDF Reader OCR settings",
	        "# Generated by scripts/prepare_book_workflow.py",
	        f"# Source PDF: {result.get('pdf', '')}",
	        f"# Sample PDF: {result.get('sample', {}).get('output', '')}",
	        "",
	    ]
	    if env:
	        settings = [f"{key}={env[key]}" for key in sorted(env)]
	    else:
	        settings = ["# No OCR settings were needed for this sample."]
	    path.write_text("\n".join(header + settings) + "\n", encoding="utf-8")


	def markdown_value(value: Any) -> str:
	    """Convert a result value to the text shown in the Markdown report.

	    ``None`` and the empty string become ``"-"``, booleans become ``"yes"`` or
	    ``"no"``, and everything else is passed through ``str``.
	    """
	    if isinstance(value, bool):
	        return "yes" if value else "no"
	    if value is None or value == "":
	        return "-"
	    return str(value)


	def fenced_block(language: str, text: str) -> list[str]:
	    """Return the three lines of a fenced Markdown code block.

	    The text is stripped of surrounding whitespace; if nothing remains, the
	    block contains a single ``-``.
	    """
	    content = text.strip()
	    if not content:
	        content = "-"
	    opening_fence = "```" + language
	    return [opening_fence, content, "```"]


	def _table_cell(value: Any) -> str:
	    """Render *value* as text that fits inside a single Markdown table cell."""
	    text = markdown_value(value)
	    # An unescaped "|" would start a new cell and a line break would end the row,
	    # so escape the former and flatten the latter.
	    flattened = " ".join(text.replace("|", "\\|").splitlines())
	    return flattened or "-"


	def _table_row(cells: list[Any]) -> str:
	    """Join already-ordered cell values into one Markdown table row."""
	    return "| " + " | ".join(_table_cell(cell) for cell in cells) + " |"


	def write_markdown_report(path: Path, result: dict[str, Any]) -> None:
	    """Write the workflow result as a readable Markdown report.

	    Sections, in order: Book, Benchmark Results, Selected OCR, Recommended OCR
	    Settings, Dry Run, Audio Smoke, Voice Benchmark (only when run or
	    requested), Full Book Estimate, Commands and Next Steps. Missing sections
	    of *result* are rendered with ``-`` placeholders. Table cells are escaped
	    so that error messages or paths containing ``|`` or line breaks cannot
	    break the table layout.

	    Args:
	        path: Destination file; parent directories are created.
	        result: Workflow result dictionary.
	    """
	    selected = result.get("selected") or {}
	    sample = result.get("sample") or {}
	    dry_run = result.get("dryRun") or {}
	    audio = result.get("audioSmoke") or {}
	    voice_benchmark = result.get("voiceBenchmark") or {}
	    estimate = result.get("estimateFullBook") or {}
	    commands = result.get("commands") or {}
	    next_steps = result.get("nextSteps") or []

	    selected_pages = ", ".join(str(page) for page in sample.get("pages") or []) or "-"
	    command_text = "\n".join(command for command in commands.values() if command)

	    benchmark_lines = [
	        "| Engine | Quality | Score | Arabic words | Fragment ratio | Extraction | Notes |",
	        "| --- | --- | --- | --- | --- | --- | --- |",
	    ]
	    for item in result.get("benchmark") or []:
	        notes = "; ".join(item.get("qualityReasons") or [])
	        if not item.get("ok"):
	            # Failed engines show their error instead of quality reasons.
	            notes = item.get("error") or "failed"
	        benchmark_lines.append(
	            _table_row(
	                [
	                    item.get("engine"),
	                    item.get("quality") if item.get("ok") else "failed",
	                    item.get("qualityScore"),
	                    item.get("arabicWords"),
	                    item.get("fragmentLineRatio"),
	                    item.get("extraction"),
	                    notes,
	                ]
	            )
	        )

	    lines = [
	        "# Arabic Audio Preparation Report",
	        "",
	        "## Book",
	        "",
	        f"- PDF: {markdown_value(result.get('pdf'))}",
	        f"- Total pages: {markdown_value(result.get('totalPages'))}",
	        f"- Sample PDF: {markdown_value(sample.get('output'))}",
	        f"- Sample pages: {selected_pages}",
	        f"- Engine preset: {markdown_value(result.get('enginePreset'))}",
	        "",
	        "## Benchmark Results",
	        "",
	        *benchmark_lines,
	        "",
	        "## Selected OCR",
	        "",
	        f"- Extraction: {markdown_value(selected.get('extraction'))}",
	        f"- Engine: {markdown_value(selected.get('engine'))}",
	        f"- Quality score: {markdown_value(selected.get('qualityScore'))}",
	        f"- Arabic words: {markdown_value(selected.get('arabicWords'))}",
	        f"- Sample OCR time: {markdown_value(selected.get('seconds'))} seconds",
	        "",
	        "## Recommended OCR Settings",
	        "",
	        *fenced_block("text", result.get("recommendedEnvText") or "No OCR settings were needed for this sample."),
	        "",
	        "## Dry Run",
	        "",
	        f"- Quality: {markdown_value(dry_run.get('quality'))}",
	        f"- Quality reasons: {markdown_value('; '.join(dry_run.get('qualityReasons') or []))}",
	        f"- Ready for TTS: {markdown_value(dry_run.get('readyForTts'))}",
	        f"- Speech characters: {markdown_value(dry_run.get('speechCharacters'))}",
	        f"- One-letter Arabic word ratio: {markdown_value(dry_run.get('singleArabicWordRatio'))}",
	        f"- Low-information line ratio: {markdown_value(dry_run.get('fragmentLineRatio'))}",
	        f"- Chunks: {markdown_value(dry_run.get('chunks'))}",
	        f"- Extraction: {markdown_value(dry_run.get('extraction'))}",
	        "",
	    ]

	    if audio:
	        lines.extend(
	            [
	                "## Audio Smoke",
	                "",
	                f"- Voice: {markdown_value(result.get('smokeVoiceId') or audio.get('voiceId'))}",
	                f"- Engine: {markdown_value(audio.get('engine'))}",
	                f"- Speech characters synthesized: {markdown_value(audio.get('audioSpeechCharacters'))}",
	                f"- Audio seconds: {markdown_value(audio.get('seconds'))}",
	                f"- Output: {markdown_value(audio.get('path'))}",
	                "",
	            ]
	        )
	    else:
	        lines.extend(["## Audio Smoke", "", "- Not run. Use `--verify-audio` to create a short pronunciation sample.", ""])

	    if voice_benchmark:
	        lines.extend(
	            [
	                "## Voice Benchmark",
	                "",
	                f"- Output directory: {markdown_value(voice_benchmark.get('outputDir'))}",
	                f"- Text characters: {markdown_value(voice_benchmark.get('textCharacters'))}",
	                f"- Audio format: {markdown_value(voice_benchmark.get('audioFormat'))}",
	                f"- Recommended starting voice: {markdown_value((voice_benchmark.get('recommended') or {}).get('voiceId'))}",
	                f"- Fastest successful voice: {markdown_value((voice_benchmark.get('fastest') or {}).get('voiceId'))}",
	                "",
	                "| Voice | Label | Engine | Status | Time | Audio | Notes |",
	                "| --- | --- | --- | --- | ---: | --- | --- |",
	            ]
	        )
	        for item in voice_benchmark.get("results") or []:
	            # Same placeholder and escaping rules as the OCR benchmark table, so
	            # missing or None values print as "-" rather than "None".
	            lines.append(
	                _table_row(
	                    [
	                        item.get("voiceId"),
	                        item.get("label"),
	                        item.get("engine"),
	                        "ok" if item.get("ok") else "failed",
	                        item.get("elapsedSeconds"),
	                        item.get("path"),
	                        item.get("error"),
	                    ]
	                )
	            )
	        lines.append("")
	    elif result.get("voiceBenchmarkRequested"):
	        lines.extend(["## Voice Benchmark", "", "- Not run because no usable cleaned OCR sample was available.", ""])

	    lines.extend(
	        [
	            "## Full Book Estimate",
	            "",
	            f"- Estimated OCR time: {markdown_value(estimate.get('estimatedOcrTime'))}",
	            f"- Estimated TTS time: {markdown_value(estimate.get('estimatedTtsTime'))}",
	            f"- Estimated total time: {markdown_value(estimate.get('estimatedTotalTime'))}",
	            f"- Estimated speech characters: {markdown_value(estimate.get('estimatedSpeechCharacters'))}",
	            f"- Basis: {markdown_value(estimate.get('basis'))}",
	            "",
	            "> Estimates are based on the selected sample pages. Dense scanned pages, marginal scans, and different fonts can change runtime and quality.",
	            "",
	            "## Commands",
	            "",
	            *fenced_block("powershell", command_text),
	            "",
	            "## Next Steps",
	            "",
	        ]
	    )
	    if next_steps:
	        lines.extend(f"- {step}" for step in next_steps)
	    else:
	        lines.append("- No next steps were generated.")
	    # The trailing empty entry makes the file end with a newline after the join.
	    lines.append("")

	    path.parent.mkdir(parents=True, exist_ok=True)
	    path.write_text("\n".join(lines), encoding="utf-8")


	def prepare_book_workflow(
	    pdf_path: Path,
	    sample_pages: int = 1,
	    skip_first: int = 0,
	    engines: list[str] | None = None,
	    engine_preset: str = "balanced",
	    chunk_size: int = 900,
	    verify_audio: bool = False,
	    voice_id: str | None = "auto",
	    audio_out: Path | None = None,
	    audio_max_chars: int = 1200,
	    benchmark_voices: bool = False,
	    voice_ids: list[str] | None = None,
	    voice_benchmark_out_dir: Path | None = None,
	    voice_benchmark_format: str = "wav",
	    voice_benchmark_max_chars: int | None = None,
	) -> dict[str, Any]:
	    """Run the sample-based preparation workflow for one Arabic PDF.

	    Steps, in order: validate the input, read the page count, build a sample
	    PDF in the system temp directory, benchmark every OCR engine on it, pick
	    the best result, dry-run the sample with that extraction, optionally
	    create an audio smoke file and a voice benchmark, then assemble estimates,
	    suggested commands and next steps.

	    Args:
	        pdf_path: PDF to prepare.
	        sample_pages: Number of pages requested for the sample.
	        skip_first: Leading pages to ignore when the sample is selected.
	        engines: Explicit engine list; when empty the preset's list is used.
	        engine_preset: Key of ``ENGINE_PRESETS``; validated even when
	            ``engines`` is given.
	        chunk_size: Chunk size passed to the dry run.
	        verify_audio: Whether to create the audio smoke file.
	        voice_id: Voice for the smoke test; ``"auto"`` or ``None`` resolves
	            to an available voice.
	        audio_out: Smoke-test output path.
	        audio_max_chars: Character cap for the smoke test.
	        benchmark_voices: Whether to compare voices on the sample text.
	        voice_ids: Voices to compare; defaults to ``DEFAULT_VOICES``.
	        voice_benchmark_out_dir: Output directory for the voice benchmark.
	        voice_benchmark_format: Audio format for the voice benchmark.
	        voice_benchmark_max_chars: Character cap for the benchmark text;
	            falls back to ``audio_max_chars``.

	    Returns:
	        The workflow result dictionary. When no engine succeeded, a short
	        dictionary with ``ready`` set to ``False`` and an ``error`` message.

	    Raises:
	        FileNotFoundError: If ``pdf_path`` does not exist.
	        ValueError: If the file is not a ``.pdf`` or the preset is unknown.
	    """
	    if not pdf_path.exists():
	        raise FileNotFoundError(f"PDF not found: {pdf_path}")
	    if pdf_path.suffix.lower() != ".pdf":
	        raise ValueError("Input must be a PDF file.")

	    with fitz.open(pdf_path) as document:
	        total_pages = document.page_count
	    if engine_preset not in ENGINE_PRESETS:
	        raise ValueError(f"Unknown engine preset: {engine_preset}")
	    preset_engines = ENGINE_PRESETS[engine_preset]
	    engines = engines or preset_engines

	    sample_name = f"{pdf_path.stem}-arabic-audio-sample-{sample_pages}.pdf"
	    sample_pdf = Path(tempfile.gettempdir()) / sample_name
	    sample_info = build_test_pdf(pdf_path, sample_pdf, count=sample_pages, skip_first=skip_first)

	    benchmark_results = []
	    for engine in engines:
	        benchmark_results.append(benchmark_engine(sample_pdf, engine))
	    winner = choose_best_result(benchmark_results)
	    if winner is None:
	        return {
	            "pdf": str(pdf_path),
	            "sample": sample_info,
	            "benchmark": benchmark_results,
	            "ready": False,
	            "error": "No OCR engine produced usable Arabic text on the sample.",
	        }

	    extraction = str(winner.get("extraction") or "")
	    recommendation = winner.get("recommendation")
	    # The dry run keeps a speech sample sized for the voice benchmark when a
	    # dedicated limit was given, otherwise sized like the audio smoke test.
	    if voice_benchmark_max_chars is not None:
	        speech_sample_chars = voice_benchmark_max_chars
	    else:
	        speech_sample_chars = audio_max_chars
	    dry_run = dry_run_pdf(
	        sample_pdf,
	        chunk_size=chunk_size,
	        from_extraction=extraction,
	        speech_sample_chars=speech_sample_chars,
	    )

	    audio_result = None
	    voice_benchmark_result = None
	    resolved_voice_id = resolve_smoke_voice(voice_id)
	    if verify_audio:
	        smoke_path = audio_out or (ROOT_DIR / "outputs" / f"{pdf_path.stem}-sample-smoke.wav")
	        audio_result = verify_pipeline(
	            sample_pdf,
	            resolved_voice_id,
	            smoke_path,
	            from_extraction=extraction,
	            max_speech_chars=audio_max_chars,
	        )
	    if benchmark_voices and dry_run.get("readyForTts"):
	        sample_text = str(dry_run.get("speechSampleText") or dry_run.get("speechPreview") or "").strip()
	        if sample_text:
	            benchmark_dir = voice_benchmark_out_dir or (ROOT_DIR / "outputs" / f"{pdf_path.stem}-voice-benchmark")
	            voice_benchmark_result = benchmark_voice_set(
	                voices=voice_ids or DEFAULT_VOICES,
	                text=sample_text,
	                output_dir=benchmark_dir,
	                audio_format=voice_benchmark_format,
	            )

	    estimate = estimate_full_book(
	        total_pages=total_pages,
	        sample_page_count=len(sample_info.get("pages", [])) or sample_pages,
	        selected=winner,
	        dry_run=dry_run,
	        audio_smoke=audio_result,
	    )
	    commands = build_commands(
	        pdf_path=pdf_path,
	        extraction=extraction,
	        voice_id=resolved_voice_id,
	        audio_max_chars=audio_max_chars,
	        audio_out=audio_out,
	    )

	    recommended_env = recommendation.get("env", {}) if recommendation else {}
	    recommended_env_text = env_text(recommendation.get("env", {})) if recommendation else ""
	    # With --verify-audio the workflow is only ready once audio was produced.
	    audio_ok = audio_result is not None if verify_audio else True

	    result = {
	        "pdf": str(pdf_path),
	        "totalPages": total_pages,
	        "sample": sample_info,
	        "benchmark": benchmark_results,
	        "enginePreset": engine_preset if engines == preset_engines else "custom",
	        "selected": winner,
	        "recommendation": recommendation,
	        "recommendedEnv": recommended_env,
	        "recommendedEnvText": recommended_env_text,
	        "dryRun": dry_run,
	        "audioSmoke": audio_result,
	        "smokeVoiceId": resolved_voice_id,
	        "voiceBenchmark": voice_benchmark_result,
	        "voiceBenchmarkRequested": benchmark_voices,
	        "estimateFullBook": estimate,
	        "commands": commands,
	        "ready": bool(dry_run.get("readyForTts") and audio_ok),
	    }
	    result["nextSteps"] = build_next_steps(result)
	    return result


	def print_summary(result: dict[str, Any]) -> None:
	    """Print a compact, human-readable summary of a workflow result.

	    Optional sections (recommendation, audio smoke, voice benchmark, estimate
	    and next steps) are printed only when the result contains them. The last
	    line always states whether the workflow is ready.
	    """
	    chosen = result.get("selected") or {}
	    advice = result.get("recommendation") or {}
	    dry = result.get("dryRun") or {}

	    print("Arabic book preparation")
	    print(f"Sample: {result.get('sample', {}).get('output', '-')}")
	    print(f"Selected OCR: {chosen.get('extraction', '-')} score={chosen.get('qualityScore', '-')}")
	    if advice:
	        print(f"Full-book settings: {advice.get('summary')}")
	    quality = dry.get("quality", "-")
	    ready_for_tts = dry.get("readyForTts", False)
	    speech_chars = dry.get("speechCharacters", 0)
	    print(f"Dry run: quality={quality} readyForTts={ready_for_tts} speechChars={speech_chars}")

	    smoke = result.get("audioSmoke")
	    if smoke:
	        print(f"Audio smoke: {smoke.get('path')} {smoke.get('seconds')}s {smoke.get('bytes')} bytes")

	    voices = result.get("voiceBenchmark") or {}
	    if voices:
	        entries = voices.get("results", [])
	        ok_count = sum(1 for entry in entries if entry.get("ok"))
	        print(f"Voice benchmark: {ok_count}/{len(entries)} voices wrote to {voices.get('outputDir')}")

	    estimate = result.get("estimateFullBook") or {}
	    if estimate:
	        ocr_time = estimate.get("estimatedOcrTime")
	        tts_time = estimate.get("estimatedTtsTime")
	        total_time = estimate.get("estimatedTotalTime")
	        print(f"Estimate: OCR {ocr_time} TTS {tts_time} total {total_time}")

	    steps = result.get("nextSteps") or []
	    if steps:
	        print("Next steps:")
	        for step in steps:
	            print(f"- {step}")

	    verdict = "yes" if result.get("ready") else "no"
	    print(f"Ready: {verdict}")


	def _build_arg_parser() -> argparse.ArgumentParser:
	    """Create the command-line parser used by ``main_cli``."""
	    ocr_engine_choices = [
	        "arabic",
	        "arabic-max",
	        "qari-ocr",
	        "tawkeed-ocr",
	        "katib-ocr",
	        "arabic-qwen-ocr",
	        "arabic-glm-ocr",
	        "baseer-ocr",
	        "easyocr",
	        "paddleocr",
	        "paddleocr-vl",
	        "surya",
	        "tesseract",
	        "auto",
	        "best",
	    ]
	    parser = argparse.ArgumentParser(description="Benchmark, dry-run, and optionally audio-smoke an Arabic PDF sample.")
	    add = parser.add_argument

	    # Input and sampling.
	    add("pdf", type=Path, help="Arabic PDF to prepare")
	    add("--sample-pages", type=int, default=1, help="Number of informative pages to sample.")
	    add("--skip-first", type=int, default=0, help="Ignore the first N pages when selecting sample pages.")

	    # OCR engine selection.
	    add(
	        "--engine-preset",
	        choices=sorted(ENGINE_PRESETS),
	        default="balanced",
	        help="OCR engine preset to use when --engines is not provided. balanced is the recommended free Arabic-trained stack.",
	    )
	    add(
	        "--engines",
	        nargs="+",
	        default=None,
	        choices=ocr_engine_choices,
	        help="OCR engines to benchmark on the sample.",
	    )
	    add("--chunk-size", type=int, default=900, help="Dry-run chunk size.")

	    # Audio smoke test.
	    add("--verify-audio", action="store_true", help="Also create a short audio smoke test from the sample.")
	    add("--voice-id", default="auto", help="Local voice id for --verify-audio. Use auto to prefer SILMA when installed.")
	    add("--audio-out", type=Path, help="Audio output path for --verify-audio.")
	    add(
	        "--audio-max-chars",
	        type=int,
	        default=1200,
	        help="Maximum cleaned characters to synthesize for --verify-audio.",
	    )

	    # Voice benchmark.
	    add(
	        "--benchmark-voices",
	        action="store_true",
	        help="Compare local voices using the cleaned OCR sample text.",
	    )
	    add(
	        "--voices",
	        nargs="+",
	        choices=list(main.LOCAL_VOICES),
	        help="Voice ids to compare with --benchmark-voices.",
	    )
	    add(
	        "--voice-benchmark-out-dir",
	        type=Path,
	        help="Output directory for --benchmark-voices audio files.",
	    )
	    add(
	        "--voice-benchmark-format",
	        choices=["wav", "mp3"],
	        default="wav",
	        help="Audio format for --benchmark-voices.",
	    )
	    add(
	        "--voice-benchmark-max-chars",
	        type=int,
	        help="Maximum cleaned OCR characters to use for --benchmark-voices. Defaults to --audio-max-chars.",
	    )

	    # Output options.
	    add("--json", action="store_true", help="Print JSON instead of a compact summary.")
	    add(
	        "--write-env",
	        type=Path,
	        help="Write recommended OCR settings to a small .env snippet. Secrets are never written.",
	    )
	    add(
	        "--write-report",
	        type=Path,
	        help="Write a readable Markdown report with OCR settings, estimates, commands, and next steps.",
	    )
	    return parser


	def main_cli() -> None:
	    """Command-line entry point: run the workflow and report the outcome.

	    Parses the arguments, runs ``prepare_book_workflow``, optionally writes
	    the env snippet and the Markdown report, then prints either JSON or the
	    compact summary.

	    Raises:
	        SystemExit: With status 1 when the result is not ready.
	    """
	    # Switch stdout to UTF-8 (replacing unencodable characters) when the stream
	    # supports reconfiguration.
	    if hasattr(sys.stdout, "reconfigure"):
	        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

	    args = _build_arg_parser().parse_args()
	    result = prepare_book_workflow(
	        args.pdf,
	        sample_pages=args.sample_pages,
	        skip_first=args.skip_first,
	        engines=args.engines,
	        engine_preset=args.engine_preset,
	        chunk_size=args.chunk_size,
	        verify_audio=args.verify_audio,
	        voice_id=args.voice_id,
	        audio_out=args.audio_out,
	        audio_max_chars=args.audio_max_chars,
	        benchmark_voices=args.benchmark_voices,
	        voice_ids=args.voices,
	        voice_benchmark_out_dir=args.voice_benchmark_out_dir,
	        voice_benchmark_format=args.voice_benchmark_format,
	        voice_benchmark_max_chars=args.voice_benchmark_max_chars,
	    )

	    env_target = args.write_env
	    if env_target:
	        write_env_snippet(env_target, result)
	        result["writtenEnv"] = str(env_target)
	        # Rebuild the commands so they reference the env file just written, then
	        # refresh the next steps that quote those commands.
	        result["commands"] = build_commands(
	            pdf_path=args.pdf,
	            extraction=str(result.get("selected", {}).get("extraction") or ""),
	            voice_id=str(result.get("smokeVoiceId") or args.voice_id),
	            audio_max_chars=args.audio_max_chars,
	            audio_out=args.audio_out,
	            env_file=env_target,
	        )
	        result["nextSteps"] = build_next_steps(result)

	    report_target = args.write_report
	    if report_target:
	        write_markdown_report(report_target, result)
	        result["writtenReport"] = str(report_target)

	    if args.json:
	        print(json.dumps(result, ensure_ascii=False, indent=2))
	    else:
	        print_summary(result)

	    if not result.get("ready"):
	        raise SystemExit(1)


	# Run the command-line interface only when this file is executed as a script.
	if __name__ == "__main__":
	main_cli()