| from __future__ import annotations |
|
|
| import argparse |
| import json |
| import sys |
| from pathlib import Path |
| from typing import Any |
|
|
# Repository root: the parent of the directory that contains this script.
# It is placed at the front of sys.path (once) so the project imports that
# follow can be resolved when this file is executed directly.
_SCRIPT_DIR = Path(__file__).resolve().parent
ROOT_DIR = _SCRIPT_DIR.parent
_root_entry = str(ROOT_DIR)
if _root_entry not in sys.path:
    sys.path.insert(0, _root_entry)
|
|
| from app import main |
| from scripts.dry_run_pdf import dry_run_pdf |
|
|
|
|
def _quote_command_text(text: str) -> str:
    """Return *text* wrapped in double quotes when it contains a space."""
    return f'"{text}"' if " " in text else text


def safe_command_path(path: Path) -> str:
    """Render *path* as one command-line argument.

    The path is converted with ``str()`` and wrapped in double quotes only
    when it contains a space; otherwise it is returned unchanged.
    """
    return _quote_command_text(str(path))


def build_external_commands(text_path: Path, output_dir: Path) -> dict[str, str]:
    """Build the copy-paste commands and notes for external TTS benchmarks.

    Args:
        text_path: Cleaned Arabic sample text file that every benchmark reads.
        output_dir: Directory under which benchmark outputs and reports go.

    Returns:
        A mapping from a stable key (for example ``"localVoiceBenchmark"``) to
        either a shell command line or a plain-English instruction. All
        commands use Windows-style backslash separators.
    """
    text_arg = safe_command_path(text_path)

    def out_arg(name: str) -> str:
        # Join first, quote second. Quoting the directory and then appending
        # the child name produced `"C:\my dir"\name`, i.e. a quoted prefix
        # followed by a bare suffix, which shells (PowerShell in particular)
        # may not treat as one argument. Paths without spaces are unchanged.
        return _quote_command_text(f"{output_dir}\\{name}")

    return {
        "localVoiceBenchmark": (
            f"python scripts\\benchmark_voices.py --text-file {text_arg} "
            f"--out-dir {out_arg('local-voices')} --write-report {out_arg('local-voices.md')}"
        ),
        "mossTtsNanoOnnx": (
            "moss-tts-nano generate --backend onnx --language ar "
            f"--text-file {text_arg} --prompt-speech C:\\path\\to\\arabic-reference.wav"
        ),
        "mossTtsNanoServer": "moss-tts-nano serve --backend onnx",
        "supertonicLocal": (
            f"python scripts\\benchmark_voices.py --voices supertonic-ar --text-file {text_arg} "
            f"--out-dir {out_arg('supertonic')} --write-report {out_arg('supertonic.md')}"
        ),
        "mishkalaTashkeelExternal": (
            "Diacritize the same cleaned sample with flokymind/mishkala and save it beside the plain sample, "
            f"for example {out_arg('arabic-tts-sample-mishkala.txt')}. Then synthesize both files with the same voice."
        ),
        "mishkalaVoiceBenchmark": (
            f"python scripts\\benchmark_voices.py --text-file {out_arg('arabic-tts-sample-mishkala.txt')} "
            f"--out-dir {out_arg('mishkala-local-voices')} --write-report {out_arg('mishkala-local-voices.md')}"
        ),
        "tashkeel350External": (
            "Diacritize the same cleaned sample with Etherll/Tashkeel-350M and save it beside the plain sample, "
            f"for example {out_arg('arabic-tts-sample-tashkeel350.txt')}. Then synthesize plain, Mishkala, "
            "and Tashkeel-350M samples with the same voice before choosing a preprocessor."
        ),
        "tashkeel350VoiceBenchmark": (
            f"python scripts\\benchmark_voices.py --text-file {out_arg('arabic-tts-sample-tashkeel350.txt')} "
            f"--out-dir {out_arg('tashkeel350-local-voices')} --write-report {out_arg('tashkeel350-local-voices.md')}"
        ),
        "preprocessorListeningScore": (
            "python scripts\\score_tts_preprocessor.py "
            "--rating plain=4,5,5,4,4 --rating mishkala=5,4,4,4,4 --rating tashkeel350=5,4,4,4,4 "
            f"--write-report {out_arg('tts-preprocessor-score.md')} "
            f"--write-json {out_arg('tts-preprocessor-score.json')}"
        ),
        "voiceListeningScore": (
            "python scripts\\score_voice_listening.py "
            "--rating silma-local=5,4,4,5,5 --rating espeak-ar-clear=3,2,4,3,5 "
            f"--write-report {out_arg('voice-listening-score.md')} "
            f"--write-json {out_arg('voice-listening-score.json')}"
        ),
        "voicePromotionGate": (
            "python scripts\\model_promotion_gate.py "
            "--candidate-name \"External voice winner\" --kind tts --license Apache-2.0 "
            f"--score-json {out_arg('voice-listening-score.json')} "
            "--same-sample --runtime-ok --privacy-ok --human-reviewed "
            f"--write-report {out_arg('voice-promotion-gate.md')}"
        ),
        "preprocessorPromotionGate": (
            "python scripts\\model_promotion_gate.py "
            "--candidate-name \"TTS preprocessor winner\" --kind preprocessor --license Apache-2.0 "
            f"--score-json {out_arg('tts-preprocessor-score.json')} "
            "--same-sample --runtime-ok --privacy-ok --human-reviewed "
            f"--write-report {out_arg('preprocessor-promotion-gate.md')}"
        ),
        "omniVoiceExternal": (
            "python -m omnivoice.cli "
            f"--model k2-fsa/OmniVoice --language ar --text-file {text_arg} "
            "--ref-audio C:\\path\\to\\arabic-reference.wav --ref-text \"Arabic reference transcript\""
        ),
        "omniVoiceArabicLoraExternal": (
            "Run OmniVoice with the Arabic LoRA adapter vivooglobal/omnivoice-lora-ar on the same text and "
            "reference audio after the base OmniVoice benchmark works."
        ),
        "tadaExternal": (
            "Benchmark HumeAI/tada-3b-ml externally with language=\"ar\" on the same cleaned sample "
            f"{text_arg}. It is designed to reduce off-script speech, but it reports the Llama 3.2 "
            "license and is a 3B-class strong-worker path, so keep it outside the permissive default."
        ),
        "lahgtnaChatterboxExternal": (
            "python -m chatterbox.tts "
            f"--model oddadmix/lahgtna-chatterbox-v1 --text-file {text_arg} "
            "--audio-prompt-path C:\\path\\to\\arabic-reference.wav --repetition-penalty 1.25"
        ),
        "namaaSaudiTtsExternal": (
            "Use ChatterboxMultilingualTTS with the NAMAA-Space/NAMAA-Saudi-TTS safetensors on "
            f"{text_arg}, language_id=\"ar\", and the same optional reference audio. Benchmark only for "
            "Saudi/Gulf dialect fit, then compare against SILMA, Habibi, Saudi Arabic Qwen3-TTS, and Emirati voices."
        ),
        "saudiChatterboxFineTuneExternal": (
            "Use ChatterboxMultilingualTTS with FatimahEmadEldin/saudi-tts-chatterbox-finetuned T3 weights on "
            f"{text_arg}, language_id=\"ar\", and the same optional reference audio. Benchmark only for "
            "Saudi/Gulf dialect fit, then compare against NAMAA-Saudi-TTS, SILMA, Habibi, Saudi Arabic Qwen3-TTS, "
            "and Emirati voices."
        ),
        "nileTtsExternal": (
            "Benchmark KickItLikeShika/NileTTS-XTTS only for Egyptian/dialectal Arabic using this same cleaned "
            f"sample {text_arg}. It is Apache-2.0, but not an MSA book default."
        ),
        "chatterboxMultilingualExternal": (
            "Use ChatterboxMultilingualTTS.from_pretrained(...).generate(text, language_id=\"ar\", "
            "audio_prompt_path=\"C:\\path\\to\\arabic-reference.wav\") against "
            f"{text_arg}; compare pacing and pronunciation against SILMA and Habibi."
        ),
        "chatterboxMultilingualOnnxExternal": (
            "Benchmark onnx-community/chatterbox-multilingual-ONNX with language_id=\"ar\" against "
            f"{text_arg}; compare CPU/ONNX runtime, repetition, pacing, and pronunciation against SILMA, "
            "Habibi, and the regular Chatterbox-Multilingual path."
        ),
        "ttsArabicOnnxExternal": (
            "Benchmark nipponjo/tts-arabic-onnx with the same cleaned sample "
            f"{text_arg}; try FastPitch, MixerTTS, speaker IDs, pace, and vowelizer options, then compare "
            "CPU runtime and pronunciation against SILMA, Supertonic, MOSS-TTS-Nano, and Chatterbox ONNX. "
            "Confirm model/repo licensing before production use."
        ),
        "sparkTtsArabicExternal": (
            "Spark-TTS Arabic requires the Spark-TTS repo plus diacritized Arabic/reference audio; benchmark it "
            f"externally with {text_arg} only after preparing that reference workflow."
        ),
        "sofeliaTtsExternal": (
            "Sofelia-TTS is a Palestinian Arabic/MiraTTS voice-cloning path; benchmark it only for dialectal text "
            f"using the same sample {text_arg}."
        ),
        "arabicF5TtsCaution": (
            "Arabic-F5-TTS-v2 is non-commercial and requires fully diacritized Arabic; keep it to personal "
            "experiments unless that license and input requirement are acceptable."
        ),
        "threeArabTtsExternal": (
            "Benchmark sherif1313/3arab-TTS-500M-v1 and the VoiceDesign variant on this same cleaned Arabic "
            f"sample: {text_arg}. It is Apache-2.0 and Arabic-only, but new enough that listenability and "
            "long-form stability need manual checks before app wiring."
        ),
        "voxcpm2External": f"Use {text_arg} as the exact Arabic text sample when testing VoxCPM2 externally.",
        "voxtralTtsCaution": (
            "Voxtral TTS supports Arabic on its model card, but it is CC-BY-NC-4.0 and GPU-heavy; "
            f"use {text_arg} only for personal/non-commercial strong-worker listening comparisons."
        ),
        "qwen3TtsCaution": (
            "Do not promote Qwen3-TTS for this Arabic reader until an official Arabic-capable checkpoint "
            "or Arabic fine-tune is verified on this same sample."
        ),
    }
|
|
|
|
# Report sections in output order: (heading, code-fence language, key into
# result["commands"]). "powershell" marks runnable commands, "text" marks
# plain-English instructions.
_REPORT_SECTIONS = (
    ("Local installed voices:", "powershell", "localVoiceBenchmark"),
    ("MOSS-TTS-Nano ONNX external benchmark:", "powershell", "mossTtsNanoOnnx"),
    ("MOSS-TTS-Nano local server:", "powershell", "mossTtsNanoServer"),
    ("Supertonic 3 local CPU benchmark:", "powershell", "supertonicLocal"),
    ("Mishkala Tashkeel pronunciation preprocessor:", "text", "mishkalaTashkeelExternal"),
    ("Mishkala local voice benchmark:", "powershell", "mishkalaVoiceBenchmark"),
    ("Tashkeel-350M pronunciation preprocessor:", "text", "tashkeel350External"),
    ("Tashkeel-350M local voice benchmark:", "powershell", "tashkeel350VoiceBenchmark"),
    ("Plain vs Mishkala vs Tashkeel-350M listening score:", "powershell", "preprocessorListeningScore"),
    ("Preprocessor promotion gate:", "powershell", "preprocessorPromotionGate"),
    ("Voice listening score:", "powershell", "voiceListeningScore"),
    ("Voice promotion gate:", "powershell", "voicePromotionGate"),
    ("OmniVoice external benchmark:", "powershell", "omniVoiceExternal"),
    ("OmniVoice Arabic LoRA external benchmark:", "text", "omniVoiceArabicLoraExternal"),
    ("TADA multilingual external benchmark:", "text", "tadaExternal"),
    ("Lahgtna Chatterbox external benchmark:", "powershell", "lahgtnaChatterboxExternal"),
    ("NAMAA-Saudi-TTS external benchmark:", "text", "namaaSaudiTtsExternal"),
    ("Saudi Chatterbox fine-tune external benchmark:", "text", "saudiChatterboxFineTuneExternal"),
    ("NileTTS-XTTS Egyptian Arabic benchmark:", "text", "nileTtsExternal"),
    ("Chatterbox-Multilingual external benchmark:", "text", "chatterboxMultilingualExternal"),
    ("Chatterbox-Multilingual ONNX external benchmark:", "text", "chatterboxMultilingualOnnxExternal"),
    ("tts-arabic-onnx external benchmark:", "text", "ttsArabicOnnxExternal"),
    ("Spark-TTS Arabic external benchmark:", "text", "sparkTtsArabicExternal"),
    ("Sofelia-TTS external benchmark:", "text", "sofeliaTtsExternal"),
    ("Arabic-F5-TTS-v2 caution:", "text", "arabicF5TtsCaution"),
    ("3arab-TTS 500M external benchmark:", "text", "threeArabTtsExternal"),
    ("VoxCPM2 external benchmark:", "text", "voxcpm2External"),
    ("Voxtral TTS caution:", "text", "voxtralTtsCaution"),
    ("Qwen3-TTS caution:", "text", "qwen3TtsCaution"),
)


def write_markdown_report(path: Path, result: dict[str, Any]) -> None:
    """Write the Markdown benchmark hand-off report to *path*.

    Args:
        path: Destination file; missing parent directories are created.
        result: Export summary. ``commands``, ``textPath``, ``characters``,
            ``arabicWords`` and ``quality`` are required; ``pdf`` and
            ``extraction`` fall back to ``-`` when absent.

    Raises:
        KeyError: If a required key or an expected command key is missing.
    """
    commands = result["commands"]
    report = [
        "# External Arabic TTS Sample",
        "",
        f"PDF: `{result.get('pdf', '-')}`",
        f"Text file: `{result['textPath']}`",
        f"Characters: {result['characters']}",
        f"Arabic words: {result['arabicWords']}",
        f"OCR extraction: `{result.get('extraction', '-')}`",
        f"Quality: `{result['quality']}`",
        "",
        "Use this same cleaned Arabic text for every voice/model comparison. Do not compare voices with different OCR text.",
        "",
        "## Commands",
        "",
    ]
    # Each section is: heading, blank line, fenced block, blank line.
    for heading, fence, key in _REPORT_SECTIONS:
        report += [heading, "", f"```{fence}\n{commands[key]}\n```", ""]
    report += [
        "## Listening Checklist",
        "",
        "- Arabic pronunciation is clear and not robotic.",
        "- Pauses are comfortable for long book passages.",
        "- Numbers, Quranic symbols, and punctuation are not read strangely.",
        "- Runtime is acceptable before processing a full book.",
        "- Replace placeholder candidate names and licenses in promotion-gate commands before changing the production default.",
    ]
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("".join(f"{line}\n" for line in report), encoding="utf-8")
|
|
|
|
def export_tts_sample(
    pdf_path: Path,
    out_dir: Path = ROOT_DIR / "outputs" / "external-tts-sample",
    max_chars: int = 1200,
    chunk_size: int = main.CLOUD_TTS_MAX_CHARS,
    ocr_engine: str | None = None,
    from_extraction: str | None = None,
    env_file: Path | None = None,
    write_report: bool = True,
) -> dict[str, Any]:
    """Export one cleaned speech sample from a PDF plus benchmark commands.

    Runs ``dry_run_pdf`` on *pdf_path* with speech text enabled, writes the
    stripped sample to ``arabic-tts-sample.txt`` inside *out_dir*, and
    optionally writes ``external-tts-sample.md`` next to it.

    Args:
        pdf_path: PDF handed to ``dry_run_pdf``.
        out_dir: Output directory; created if missing.
        max_chars: Forwarded as ``speech_sample_chars``.
        chunk_size: Forwarded as ``chunk_size``.
        ocr_engine: Forwarded as ``ocr_engine``.
        from_extraction: Forwarded as ``from_extraction``.
        env_file: Forwarded as ``env_file``.
        write_report: When true, also write the Markdown report.

    Returns:
        A summary dict with the written paths, sample size, quality fields
        copied from the dry run, and the generated commands. ``reportPath``
        is always present, even when *write_report* is false.

    Raises:
        RuntimeError: If the dry run does not report ``readyForTts``.
    """
    analysis = dry_run_pdf(
        pdf_path,
        chunk_size=chunk_size,
        ocr_engine=ocr_engine,
        from_extraction=from_extraction,
        env_file=env_file,
        include_speech_text=True,
        speech_sample_chars=max_chars,
    )
    if not analysis["readyForTts"]:
        details = "; ".join(map(str, analysis.get("qualityReasons", [])))
        raise RuntimeError(f"OCR text is not ready for TTS. {details}".strip())

    sample = str(analysis["speechSampleText"]).strip()
    out_dir.mkdir(parents=True, exist_ok=True)
    sample_file = out_dir / "arabic-tts-sample.txt"
    sample_file.write_text(f"{sample}\n", encoding="utf-8")
    report_file = out_dir / "external-tts-sample.md"
    commands = build_external_commands(sample_file, out_dir)

    # Fields copied verbatim from the dry run, in the order they are emitted.
    copied_keys = (
        "arabicWords",
        "quality",
        "qualityScore",
        "qualityReasons",
        "ocrEngine",
        "extraction",
    )
    summary: dict[str, Any] = {
        "ready": True,
        "pdf": str(pdf_path),
        "textPath": str(sample_file),
        "reportPath": str(report_file),
        "characters": len(sample),
        "fullSpeechCharacters": analysis["speechCharacters"],
        **{key: analysis[key] for key in copied_keys},
        "commands": commands,
    }
    if write_report:
        write_markdown_report(report_file, summary)
    return summary
|
|
|
|
def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the command-line parser used by ``main_cli``."""
    parser = argparse.ArgumentParser(
        description="Export the same cleaned Arabic text sample for external TTS benchmarking."
    )
    parser.add_argument("pdf", type=Path, help="Arabic PDF to extract a cleaned speech sample from.")
    parser.add_argument("--out-dir", type=Path, default=ROOT_DIR / "outputs" / "external-tts-sample")
    parser.add_argument("--max-chars", type=int, default=1200, help="Maximum cleaned characters to export.")
    parser.add_argument("--chunk-size", type=int, default=main.CLOUD_TTS_MAX_CHARS)
    parser.add_argument("--ocr-engine", choices=sorted(main.OCR_ENGINE_CHOICES), help="OCR engine to test.")
    parser.add_argument("--from-extraction", help="Apply settings from a benchmark extraction label.")
    parser.add_argument("--env-file", type=Path, help="Load OCR settings from a generated OCR .env snippet.")
    parser.add_argument("--no-report", action="store_true", help="Only write the text file.")
    parser.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.")
    return parser


def main_cli() -> None:
    """Parse the command line, export the sample, and print the outcome.

    With ``--json`` the full result dict is printed as JSON. Otherwise the
    sample path is printed, followed by the report path unless
    ``--no-report`` was given.
    """
    # Switch stdout to UTF-8 where the stream supports it; unencodable
    # characters are replaced rather than raising.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")
    options = _build_arg_parser().parse_args()

    outcome = export_tts_sample(
        options.pdf,
        out_dir=options.out_dir,
        max_chars=options.max_chars,
        chunk_size=options.chunk_size,
        ocr_engine=options.ocr_engine,
        from_extraction=options.from_extraction,
        env_file=options.env_file,
        write_report=not options.no_report,
    )

    if options.json:
        print(json.dumps(outcome, ensure_ascii=False, indent=2))
        return
    print(f"Wrote Arabic TTS sample: {outcome['textPath']}")
    if not options.no_report:
        print(f"Wrote benchmark handoff: {outcome['reportPath']}")


if __name__ == "__main__":
    main_cli()
|
|