from __future__ import annotations import argparse import json import sys from pathlib import Path from typing import Any ROOT_DIR = Path(__file__).resolve().parent.parent if str(ROOT_DIR) not in sys.path: sys.path.insert(0, str(ROOT_DIR)) from app import main from scripts.dry_run_pdf import dry_run_pdf def safe_command_path(path: Path) -> str: text = str(path) return f'"{text}"' if " " in text else text def build_external_commands(text_path: Path, output_dir: Path) -> dict[str, str]: text_arg = safe_command_path(text_path) output_arg = safe_command_path(output_dir) return { "localVoiceBenchmark": ( f"python scripts\\benchmark_voices.py --text-file {text_arg} " f"--out-dir {output_arg}\\local-voices --write-report {output_arg}\\local-voices.md" ), "mossTtsNanoOnnx": ( "moss-tts-nano generate --backend onnx --language ar " f"--text-file {text_arg} --prompt-speech C:\\path\\to\\arabic-reference.wav" ), "mossTtsNanoServer": "moss-tts-nano serve --backend onnx", "supertonicLocal": ( f"python scripts\\benchmark_voices.py --voices supertonic-ar --text-file {text_arg} " f"--out-dir {output_arg}\\supertonic --write-report {output_arg}\\supertonic.md" ), "mishkalaTashkeelExternal": ( "Diacritize the same cleaned sample with flokymind/mishkala and save it beside the plain sample, " f"for example {output_arg}\\arabic-tts-sample-mishkala.txt. Then synthesize both files with the same voice." ), "mishkalaVoiceBenchmark": ( f"python scripts\\benchmark_voices.py --text-file {output_arg}\\arabic-tts-sample-mishkala.txt " f"--out-dir {output_arg}\\mishkala-local-voices --write-report {output_arg}\\mishkala-local-voices.md" ), "tashkeel350External": ( "Diacritize the same cleaned sample with Etherll/Tashkeel-350M and save it beside the plain sample, " f"for example {output_arg}\\arabic-tts-sample-tashkeel350.txt. Then synthesize plain, Mishkala, " "and Tashkeel-350M samples with the same voice before choosing a preprocessor." ), "tashkeel350VoiceBenchmark": ( f"python scripts\\benchmark_voices.py --text-file {output_arg}\\arabic-tts-sample-tashkeel350.txt " f"--out-dir {output_arg}\\tashkeel350-local-voices --write-report {output_arg}\\tashkeel350-local-voices.md" ), "preprocessorListeningScore": ( "python scripts\\score_tts_preprocessor.py " "--rating plain=4,5,5,4,4 --rating mishkala=5,4,4,4,4 --rating tashkeel350=5,4,4,4,4 " f"--write-report {output_arg}\\tts-preprocessor-score.md " f"--write-json {output_arg}\\tts-preprocessor-score.json" ), "voiceListeningScore": ( "python scripts\\score_voice_listening.py " "--rating silma-local=5,4,4,5,5 --rating espeak-ar-clear=3,2,4,3,5 " f"--write-report {output_arg}\\voice-listening-score.md " f"--write-json {output_arg}\\voice-listening-score.json" ), "voicePromotionGate": ( "python scripts\\model_promotion_gate.py " "--candidate-name \"External voice winner\" --kind tts --license Apache-2.0 " f"--score-json {output_arg}\\voice-listening-score.json " "--same-sample --runtime-ok --privacy-ok --human-reviewed " f"--write-report {output_arg}\\voice-promotion-gate.md" ), "preprocessorPromotionGate": ( "python scripts\\model_promotion_gate.py " "--candidate-name \"TTS preprocessor winner\" --kind preprocessor --license Apache-2.0 " f"--score-json {output_arg}\\tts-preprocessor-score.json " "--same-sample --runtime-ok --privacy-ok --human-reviewed " f"--write-report {output_arg}\\preprocessor-promotion-gate.md" ), "omniVoiceExternal": ( "python -m omnivoice.cli " f"--model k2-fsa/OmniVoice --language ar --text-file {text_arg} " "--ref-audio C:\\path\\to\\arabic-reference.wav --ref-text \"Arabic reference transcript\"" ), "omniVoiceArabicLoraExternal": ( "Run OmniVoice with the Arabic LoRA adapter vivooglobal/omnivoice-lora-ar on the same text and " "reference audio after the base OmniVoice benchmark works." ), "tadaExternal": ( "Benchmark HumeAI/tada-3b-ml externally with language=\"ar\" on the same cleaned sample " f"{text_arg}. It is designed to reduce off-script speech, but it reports the Llama 3.2 " "license and is a 3B-class strong-worker path, so keep it outside the permissive default." ), "lahgtnaChatterboxExternal": ( "python -m chatterbox.tts " f"--model oddadmix/lahgtna-chatterbox-v1 --text-file {text_arg} " "--audio-prompt-path C:\\path\\to\\arabic-reference.wav --repetition-penalty 1.25" ), "namaaSaudiTtsExternal": ( "Use ChatterboxMultilingualTTS with the NAMAA-Space/NAMAA-Saudi-TTS safetensors on " f"{text_arg}, language_id=\"ar\", and the same optional reference audio. Benchmark only for " "Saudi/Gulf dialect fit, then compare against SILMA, Habibi, Saudi Arabic Qwen3-TTS, and Emirati voices." ), "saudiChatterboxFineTuneExternal": ( "Use ChatterboxMultilingualTTS with FatimahEmadEldin/saudi-tts-chatterbox-finetuned T3 weights on " f"{text_arg}, language_id=\"ar\", and the same optional reference audio. Benchmark only for " "Saudi/Gulf dialect fit, then compare against NAMAA-Saudi-TTS, SILMA, Habibi, Saudi Arabic Qwen3-TTS, " "and Emirati voices." ), "nileTtsExternal": ( "Benchmark KickItLikeShika/NileTTS-XTTS only for Egyptian/dialectal Arabic using this same cleaned " f"sample {text_arg}. It is Apache-2.0, but not an MSA book default." ), "chatterboxMultilingualExternal": ( "Use ChatterboxMultilingualTTS.from_pretrained(...).generate(text, language_id=\"ar\", " "audio_prompt_path=\"C:\\path\\to\\arabic-reference.wav\") against " f"{text_arg}; compare pacing and pronunciation against SILMA and Habibi." ), "chatterboxMultilingualOnnxExternal": ( "Benchmark onnx-community/chatterbox-multilingual-ONNX with language_id=\"ar\" against " f"{text_arg}; compare CPU/ONNX runtime, repetition, pacing, and pronunciation against SILMA, " "Habibi, and the regular Chatterbox-Multilingual path." ), "ttsArabicOnnxExternal": ( "Benchmark nipponjo/tts-arabic-onnx with the same cleaned sample " f"{text_arg}; try FastPitch, MixerTTS, speaker IDs, pace, and vowelizer options, then compare " "CPU runtime and pronunciation against SILMA, Supertonic, MOSS-TTS-Nano, and Chatterbox ONNX. " "Confirm model/repo licensing before production use." ), "sparkTtsArabicExternal": ( "Spark-TTS Arabic requires the Spark-TTS repo plus diacritized Arabic/reference audio; benchmark it " f"externally with {text_arg} only after preparing that reference workflow." ), "sofeliaTtsExternal": ( "Sofelia-TTS is a Palestinian Arabic/MiraTTS voice-cloning path; benchmark it only for dialectal text " f"using the same sample {text_arg}." ), "arabicF5TtsCaution": ( "Arabic-F5-TTS-v2 is non-commercial and requires fully diacritized Arabic; keep it to personal " "experiments unless that license and input requirement are acceptable." ), "threeArabTtsExternal": ( "Benchmark sherif1313/3arab-TTS-500M-v1 and the VoiceDesign variant on this same cleaned Arabic " f"sample: {text_arg}. It is Apache-2.0 and Arabic-only, but new enough that listenability and " "long-form stability need manual checks before app wiring." ), "voxcpm2External": f"Use {text_arg} as the exact Arabic text sample when testing VoxCPM2 externally.", "voxtralTtsCaution": ( "Voxtral TTS supports Arabic on its model card, but it is CC-BY-NC-4.0 and GPU-heavy; " f"use {text_arg} only for personal/non-commercial strong-worker listening comparisons." ), "qwen3TtsCaution": ( "Do not promote Qwen3-TTS for this Arabic reader until an official Arabic-capable checkpoint " "or Arabic fine-tune is verified on this same sample." ), } def write_markdown_report(path: Path, result: dict[str, Any]) -> None: commands = result["commands"] lines = [ "# External Arabic TTS Sample", "", f"PDF: `{result.get('pdf', '-')}`", f"Text file: `{result['textPath']}`", f"Characters: {result['characters']}", f"Arabic words: {result['arabicWords']}", f"OCR extraction: `{result.get('extraction', '-')}`", f"Quality: `{result['quality']}`", "", "Use this same cleaned Arabic text for every voice/model comparison. Do not compare voices with different OCR text.", "", "## Commands", "", "Local installed voices:", "", f"```powershell\n{commands['localVoiceBenchmark']}\n```", "", "MOSS-TTS-Nano ONNX external benchmark:", "", f"```powershell\n{commands['mossTtsNanoOnnx']}\n```", "", "MOSS-TTS-Nano local server:", "", f"```powershell\n{commands['mossTtsNanoServer']}\n```", "", "Supertonic 3 local CPU benchmark:", "", f"```powershell\n{commands['supertonicLocal']}\n```", "", "Mishkala Tashkeel pronunciation preprocessor:", "", f"```text\n{commands['mishkalaTashkeelExternal']}\n```", "", "Mishkala local voice benchmark:", "", f"```powershell\n{commands['mishkalaVoiceBenchmark']}\n```", "", "Tashkeel-350M pronunciation preprocessor:", "", f"```text\n{commands['tashkeel350External']}\n```", "", "Tashkeel-350M local voice benchmark:", "", f"```powershell\n{commands['tashkeel350VoiceBenchmark']}\n```", "", "Plain vs Mishkala vs Tashkeel-350M listening score:", "", f"```powershell\n{commands['preprocessorListeningScore']}\n```", "", "Preprocessor promotion gate:", "", f"```powershell\n{commands['preprocessorPromotionGate']}\n```", "", "Voice listening score:", "", f"```powershell\n{commands['voiceListeningScore']}\n```", "", "Voice promotion gate:", "", f"```powershell\n{commands['voicePromotionGate']}\n```", "", "OmniVoice external benchmark:", "", f"```powershell\n{commands['omniVoiceExternal']}\n```", "", "OmniVoice Arabic LoRA external benchmark:", "", f"```text\n{commands['omniVoiceArabicLoraExternal']}\n```", "", "TADA multilingual external benchmark:", "", f"```text\n{commands['tadaExternal']}\n```", "", "Lahgtna Chatterbox external benchmark:", "", f"```powershell\n{commands['lahgtnaChatterboxExternal']}\n```", "", "NAMAA-Saudi-TTS external benchmark:", "", f"```text\n{commands['namaaSaudiTtsExternal']}\n```", "", "Saudi Chatterbox fine-tune external benchmark:", "", f"```text\n{commands['saudiChatterboxFineTuneExternal']}\n```", "", "NileTTS-XTTS Egyptian Arabic benchmark:", "", f"```text\n{commands['nileTtsExternal']}\n```", "", "Chatterbox-Multilingual external benchmark:", "", f"```text\n{commands['chatterboxMultilingualExternal']}\n```", "", "Chatterbox-Multilingual ONNX external benchmark:", "", f"```text\n{commands['chatterboxMultilingualOnnxExternal']}\n```", "", "tts-arabic-onnx external benchmark:", "", f"```text\n{commands['ttsArabicOnnxExternal']}\n```", "", "Spark-TTS Arabic external benchmark:", "", f"```text\n{commands['sparkTtsArabicExternal']}\n```", "", "Sofelia-TTS external benchmark:", "", f"```text\n{commands['sofeliaTtsExternal']}\n```", "", "Arabic-F5-TTS-v2 caution:", "", f"```text\n{commands['arabicF5TtsCaution']}\n```", "", "3arab-TTS 500M external benchmark:", "", f"```text\n{commands['threeArabTtsExternal']}\n```", "", "VoxCPM2 external benchmark:", "", f"```text\n{commands['voxcpm2External']}\n```", "", "Voxtral TTS caution:", "", f"```text\n{commands['voxtralTtsCaution']}\n```", "", "Qwen3-TTS caution:", "", f"```text\n{commands['qwen3TtsCaution']}\n```", "", "## Listening Checklist", "", "- Arabic pronunciation is clear and not robotic.", "- Pauses are comfortable for long book passages.", "- Numbers, Quranic symbols, and punctuation are not read strangely.", "- Runtime is acceptable before processing a full book.", "- Replace placeholder candidate names and licenses in promotion-gate commands before changing the production default.", ] path.parent.mkdir(parents=True, exist_ok=True) path.write_text("\n".join(lines) + "\n", encoding="utf-8") def export_tts_sample( pdf_path: Path, out_dir: Path = ROOT_DIR / "outputs" / "external-tts-sample", max_chars: int = 1200, chunk_size: int = main.CLOUD_TTS_MAX_CHARS, ocr_engine: str | None = None, from_extraction: str | None = None, env_file: Path | None = None, write_report: bool = True, ) -> dict[str, Any]: dry_run = dry_run_pdf( pdf_path, chunk_size=chunk_size, ocr_engine=ocr_engine, from_extraction=from_extraction, env_file=env_file, include_speech_text=True, speech_sample_chars=max_chars, ) if not dry_run["readyForTts"]: reasons = "; ".join(str(reason) for reason in dry_run.get("qualityReasons", [])) raise RuntimeError(f"OCR text is not ready for TTS. {reasons}".strip()) sample_text = str(dry_run["speechSampleText"]).strip() out_dir.mkdir(parents=True, exist_ok=True) text_path = out_dir / "arabic-tts-sample.txt" text_path.write_text(sample_text + "\n", encoding="utf-8") commands = build_external_commands(text_path, out_dir) result: dict[str, Any] = { "ready": True, "pdf": str(pdf_path), "textPath": str(text_path), "reportPath": str(out_dir / "external-tts-sample.md"), "characters": len(sample_text), "fullSpeechCharacters": dry_run["speechCharacters"], "arabicWords": dry_run["arabicWords"], "quality": dry_run["quality"], "qualityScore": dry_run["qualityScore"], "qualityReasons": dry_run["qualityReasons"], "ocrEngine": dry_run["ocrEngine"], "extraction": dry_run["extraction"], "commands": commands, } if write_report: write_markdown_report(out_dir / "external-tts-sample.md", result) return result def main_cli() -> None: if hasattr(sys.stdout, "reconfigure"): sys.stdout.reconfigure(encoding="utf-8", errors="replace") parser = argparse.ArgumentParser(description="Export the same cleaned Arabic text sample for external TTS benchmarking.") parser.add_argument("pdf", type=Path, help="Arabic PDF to extract a cleaned speech sample from.") parser.add_argument("--out-dir", type=Path, default=ROOT_DIR / "outputs" / "external-tts-sample") parser.add_argument("--max-chars", type=int, default=1200, help="Maximum cleaned characters to export.") parser.add_argument("--chunk-size", type=int, default=main.CLOUD_TTS_MAX_CHARS) parser.add_argument("--ocr-engine", choices=sorted(main.OCR_ENGINE_CHOICES), help="OCR engine to test.") parser.add_argument("--from-extraction", help="Apply settings from a benchmark extraction label.") parser.add_argument("--env-file", type=Path, help="Load OCR settings from a generated OCR .env snippet.") parser.add_argument("--no-report", action="store_true", help="Only write the text file.") parser.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.") args = parser.parse_args() result = export_tts_sample( args.pdf, out_dir=args.out_dir, max_chars=args.max_chars, chunk_size=args.chunk_size, ocr_engine=args.ocr_engine, from_extraction=args.from_extraction, env_file=args.env_file, write_report=not args.no_report, ) if args.json: print(json.dumps(result, ensure_ascii=False, indent=2)) else: print(f"Wrote Arabic TTS sample: {result['textPath']}") if not args.no_report: print(f"Wrote benchmark handoff: {result['reportPath']}") if __name__ == "__main__": main_cli()