arabic-audio-reader-worker / scripts /export_tts_sample.py
Syncre's picture
Deploy Arabic Audio Reader worker
2e1a095 verified
from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
from typing import Any
# Directory two levels above this file's resolved location.
ROOT_DIR = Path(__file__).resolve().parent.parent
# Make that directory importable exactly once; presumably this is what lets
# the `app` and `scripts.dry_run_pdf` imports below resolve when the file is
# run directly as a script -- confirm against the repository layout.
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))
from app import main
from scripts.dry_run_pdf import dry_run_pdf
def safe_command_path(path: Path | str) -> str:
    """Render *path* as one command-line argument.

    The text is wrapped in double quotes when it contains a space and is
    returned unchanged otherwise.  A plain string is accepted as well as a
    ``Path`` so that an already-joined path can be quoted as a whole.
    """
    text = str(path)
    return f'"{text}"' if " " in text else text


def build_external_commands(text_path: Path, output_dir: Path) -> dict[str, str]:
    """Build the benchmark commands and notes for one exported text sample.

    Args:
        text_path: Cleaned Arabic sample that the commands and notes refer to.
        output_dir: Directory under which the suggested output files live.

    Returns:
        A mapping from a command key to either a command line or a free-text
        instruction.  Every path placed in a value is quoted as a whole when
        it contains a space.
    """
    text_arg = safe_command_path(text_path)

    def out(name: str) -> str:
        # Join first, quote second.  Quoting the directory and appending the
        # child afterwards produced `"C:\my dir"\child`, where the quotes end
        # in the middle of the path.  The backslash separator is kept from the
        # original so output for space-free directories is unchanged.
        return safe_command_path(f"{output_dir}\\{name}")

    return {
        "localVoiceBenchmark": (
            f"python scripts\\benchmark_voices.py --text-file {text_arg} "
            f"--out-dir {out('local-voices')} --write-report {out('local-voices.md')}"
        ),
        "mossTtsNanoOnnx": (
            "moss-tts-nano generate --backend onnx --language ar "
            f"--text-file {text_arg} --prompt-speech C:\\path\\to\\arabic-reference.wav"
        ),
        "mossTtsNanoServer": "moss-tts-nano serve --backend onnx",
        "supertonicLocal": (
            f"python scripts\\benchmark_voices.py --voices supertonic-ar --text-file {text_arg} "
            f"--out-dir {out('supertonic')} --write-report {out('supertonic.md')}"
        ),
        "mishkalaTashkeelExternal": (
            "Diacritize the same cleaned sample with flokymind/mishkala and save it beside the plain sample, "
            f"for example {out('arabic-tts-sample-mishkala.txt')}. Then synthesize both files with the same voice."
        ),
        "mishkalaVoiceBenchmark": (
            f"python scripts\\benchmark_voices.py --text-file {out('arabic-tts-sample-mishkala.txt')} "
            f"--out-dir {out('mishkala-local-voices')} --write-report {out('mishkala-local-voices.md')}"
        ),
        "tashkeel350External": (
            "Diacritize the same cleaned sample with Etherll/Tashkeel-350M and save it beside the plain sample, "
            f"for example {out('arabic-tts-sample-tashkeel350.txt')}. Then synthesize plain, Mishkala, "
            "and Tashkeel-350M samples with the same voice before choosing a preprocessor."
        ),
        "tashkeel350VoiceBenchmark": (
            f"python scripts\\benchmark_voices.py --text-file {out('arabic-tts-sample-tashkeel350.txt')} "
            f"--out-dir {out('tashkeel350-local-voices')} --write-report {out('tashkeel350-local-voices.md')}"
        ),
        "preprocessorListeningScore": (
            "python scripts\\score_tts_preprocessor.py "
            "--rating plain=4,5,5,4,4 --rating mishkala=5,4,4,4,4 --rating tashkeel350=5,4,4,4,4 "
            f"--write-report {out('tts-preprocessor-score.md')} "
            f"--write-json {out('tts-preprocessor-score.json')}"
        ),
        "voiceListeningScore": (
            "python scripts\\score_voice_listening.py "
            "--rating silma-local=5,4,4,5,5 --rating espeak-ar-clear=3,2,4,3,5 "
            f"--write-report {out('voice-listening-score.md')} "
            f"--write-json {out('voice-listening-score.json')}"
        ),
        "voicePromotionGate": (
            "python scripts\\model_promotion_gate.py "
            "--candidate-name \"External voice winner\" --kind tts --license Apache-2.0 "
            f"--score-json {out('voice-listening-score.json')} "
            "--same-sample --runtime-ok --privacy-ok --human-reviewed "
            f"--write-report {out('voice-promotion-gate.md')}"
        ),
        "preprocessorPromotionGate": (
            "python scripts\\model_promotion_gate.py "
            "--candidate-name \"TTS preprocessor winner\" --kind preprocessor --license Apache-2.0 "
            f"--score-json {out('tts-preprocessor-score.json')} "
            "--same-sample --runtime-ok --privacy-ok --human-reviewed "
            f"--write-report {out('preprocessor-promotion-gate.md')}"
        ),
        "omniVoiceExternal": (
            "python -m omnivoice.cli "
            f"--model k2-fsa/OmniVoice --language ar --text-file {text_arg} "
            "--ref-audio C:\\path\\to\\arabic-reference.wav --ref-text \"Arabic reference transcript\""
        ),
        "omniVoiceArabicLoraExternal": (
            "Run OmniVoice with the Arabic LoRA adapter vivooglobal/omnivoice-lora-ar on the same text and "
            "reference audio after the base OmniVoice benchmark works."
        ),
        "tadaExternal": (
            "Benchmark HumeAI/tada-3b-ml externally with language=\"ar\" on the same cleaned sample "
            f"{text_arg}. It is designed to reduce off-script speech, but it reports the Llama 3.2 "
            "license and is a 3B-class strong-worker path, so keep it outside the permissive default."
        ),
        "lahgtnaChatterboxExternal": (
            "python -m chatterbox.tts "
            f"--model oddadmix/lahgtna-chatterbox-v1 --text-file {text_arg} "
            "--audio-prompt-path C:\\path\\to\\arabic-reference.wav --repetition-penalty 1.25"
        ),
        "namaaSaudiTtsExternal": (
            "Use ChatterboxMultilingualTTS with the NAMAA-Space/NAMAA-Saudi-TTS safetensors on "
            f"{text_arg}, language_id=\"ar\", and the same optional reference audio. Benchmark only for "
            "Saudi/Gulf dialect fit, then compare against SILMA, Habibi, Saudi Arabic Qwen3-TTS, and Emirati voices."
        ),
        "saudiChatterboxFineTuneExternal": (
            "Use ChatterboxMultilingualTTS with FatimahEmadEldin/saudi-tts-chatterbox-finetuned T3 weights on "
            f"{text_arg}, language_id=\"ar\", and the same optional reference audio. Benchmark only for "
            "Saudi/Gulf dialect fit, then compare against NAMAA-Saudi-TTS, SILMA, Habibi, Saudi Arabic Qwen3-TTS, "
            "and Emirati voices."
        ),
        "nileTtsExternal": (
            "Benchmark KickItLikeShika/NileTTS-XTTS only for Egyptian/dialectal Arabic using this same cleaned "
            f"sample {text_arg}. It is Apache-2.0, but not an MSA book default."
        ),
        "chatterboxMultilingualExternal": (
            "Use ChatterboxMultilingualTTS.from_pretrained(...).generate(text, language_id=\"ar\", "
            "audio_prompt_path=\"C:\\path\\to\\arabic-reference.wav\") against "
            f"{text_arg}; compare pacing and pronunciation against SILMA and Habibi."
        ),
        "chatterboxMultilingualOnnxExternal": (
            "Benchmark onnx-community/chatterbox-multilingual-ONNX with language_id=\"ar\" against "
            f"{text_arg}; compare CPU/ONNX runtime, repetition, pacing, and pronunciation against SILMA, "
            "Habibi, and the regular Chatterbox-Multilingual path."
        ),
        "ttsArabicOnnxExternal": (
            "Benchmark nipponjo/tts-arabic-onnx with the same cleaned sample "
            f"{text_arg}; try FastPitch, MixerTTS, speaker IDs, pace, and vowelizer options, then compare "
            "CPU runtime and pronunciation against SILMA, Supertonic, MOSS-TTS-Nano, and Chatterbox ONNX. "
            "Confirm model/repo licensing before production use."
        ),
        "sparkTtsArabicExternal": (
            "Spark-TTS Arabic requires the Spark-TTS repo plus diacritized Arabic/reference audio; benchmark it "
            f"externally with {text_arg} only after preparing that reference workflow."
        ),
        "sofeliaTtsExternal": (
            "Sofelia-TTS is a Palestinian Arabic/MiraTTS voice-cloning path; benchmark it only for dialectal text "
            f"using the same sample {text_arg}."
        ),
        "arabicF5TtsCaution": (
            "Arabic-F5-TTS-v2 is non-commercial and requires fully diacritized Arabic; keep it to personal "
            "experiments unless that license and input requirement are acceptable."
        ),
        "threeArabTtsExternal": (
            "Benchmark sherif1313/3arab-TTS-500M-v1 and the VoiceDesign variant on this same cleaned Arabic "
            f"sample: {text_arg}. It is Apache-2.0 and Arabic-only, but new enough that listenability and "
            "long-form stability need manual checks before app wiring."
        ),
        "voxcpm2External": f"Use {text_arg} as the exact Arabic text sample when testing VoxCPM2 externally.",
        "voxtralTtsCaution": (
            "Voxtral TTS supports Arabic on its model card, but it is CC-BY-NC-4.0 and GPU-heavy; "
            f"use {text_arg} only for personal/non-commercial strong-worker listening comparisons."
        ),
        "qwen3TtsCaution": (
            "Do not promote Qwen3-TTS for this Arabic reader until an official Arabic-capable checkpoint "
            "or Arabic fine-tune is verified on this same sample."
        ),
    }
def write_markdown_report(path: Path, result: dict[str, Any]) -> None:
    """Write the Markdown hand-off report for an exported TTS sample.

    Args:
        path: Destination file; missing parent directories are created.
        result: Export summary.  ``commands``, ``textPath``, ``characters``,
            ``arabicWords`` and ``quality`` are required; ``pdf`` and
            ``extraction`` fall back to ``-`` when absent.

    The report is a fixed header, one fenced block per entry of
    ``result["commands"]`` in a fixed order, and a listening checklist.  It is
    written as UTF-8 with a trailing newline.
    """
    commands = result["commands"]

    # (heading, fence language, key in ``commands``) in the order they appear.
    sections = (
        ("Local installed voices:", "powershell", "localVoiceBenchmark"),
        ("MOSS-TTS-Nano ONNX external benchmark:", "powershell", "mossTtsNanoOnnx"),
        ("MOSS-TTS-Nano local server:", "powershell", "mossTtsNanoServer"),
        ("Supertonic 3 local CPU benchmark:", "powershell", "supertonicLocal"),
        ("Mishkala Tashkeel pronunciation preprocessor:", "text", "mishkalaTashkeelExternal"),
        ("Mishkala local voice benchmark:", "powershell", "mishkalaVoiceBenchmark"),
        ("Tashkeel-350M pronunciation preprocessor:", "text", "tashkeel350External"),
        ("Tashkeel-350M local voice benchmark:", "powershell", "tashkeel350VoiceBenchmark"),
        ("Plain vs Mishkala vs Tashkeel-350M listening score:", "powershell", "preprocessorListeningScore"),
        ("Preprocessor promotion gate:", "powershell", "preprocessorPromotionGate"),
        ("Voice listening score:", "powershell", "voiceListeningScore"),
        ("Voice promotion gate:", "powershell", "voicePromotionGate"),
        ("OmniVoice external benchmark:", "powershell", "omniVoiceExternal"),
        ("OmniVoice Arabic LoRA external benchmark:", "text", "omniVoiceArabicLoraExternal"),
        ("TADA multilingual external benchmark:", "text", "tadaExternal"),
        ("Lahgtna Chatterbox external benchmark:", "powershell", "lahgtnaChatterboxExternal"),
        ("NAMAA-Saudi-TTS external benchmark:", "text", "namaaSaudiTtsExternal"),
        ("Saudi Chatterbox fine-tune external benchmark:", "text", "saudiChatterboxFineTuneExternal"),
        ("NileTTS-XTTS Egyptian Arabic benchmark:", "text", "nileTtsExternal"),
        ("Chatterbox-Multilingual external benchmark:", "text", "chatterboxMultilingualExternal"),
        ("Chatterbox-Multilingual ONNX external benchmark:", "text", "chatterboxMultilingualOnnxExternal"),
        ("tts-arabic-onnx external benchmark:", "text", "ttsArabicOnnxExternal"),
        ("Spark-TTS Arabic external benchmark:", "text", "sparkTtsArabicExternal"),
        ("Sofelia-TTS external benchmark:", "text", "sofeliaTtsExternal"),
        ("Arabic-F5-TTS-v2 caution:", "text", "arabicF5TtsCaution"),
        ("3arab-TTS 500M external benchmark:", "text", "threeArabTtsExternal"),
        ("VoxCPM2 external benchmark:", "text", "voxcpm2External"),
        ("Voxtral TTS caution:", "text", "voxtralTtsCaution"),
        ("Qwen3-TTS caution:", "text", "qwen3TtsCaution"),
    )

    report = [
        "# External Arabic TTS Sample",
        "",
        f"PDF: `{result.get('pdf', '-')}`",
        f"Text file: `{result['textPath']}`",
        f"Characters: {result['characters']}",
        f"Arabic words: {result['arabicWords']}",
        f"OCR extraction: `{result.get('extraction', '-')}`",
        f"Quality: `{result['quality']}`",
        "",
        "Use this same cleaned Arabic text for every voice/model comparison. Do not compare voices with different OCR text.",
        "",
        "## Commands",
        "",
    ]

    # Each section is a heading, a blank line, the fenced command, a blank line.
    for heading, fence, key in sections:
        report.extend((heading, "", f"```{fence}\n{commands[key]}\n```", ""))

    report.extend(
        (
            "## Listening Checklist",
            "",
            "- Arabic pronunciation is clear and not robotic.",
            "- Pauses are comfortable for long book passages.",
            "- Numbers, Quranic symbols, and punctuation are not read strangely.",
            "- Runtime is acceptable before processing a full book.",
            "- Replace placeholder candidate names and licenses in promotion-gate commands before changing the production default.",
        )
    )

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(report) + "\n", encoding="utf-8")
def export_tts_sample(
    pdf_path: Path,
    out_dir: Path = ROOT_DIR / "outputs" / "external-tts-sample",
    max_chars: int = 1200,
    chunk_size: int = main.CLOUD_TTS_MAX_CHARS,
    ocr_engine: str | None = None,
    from_extraction: str | None = None,
    env_file: Path | None = None,
    write_report: bool = True,
) -> dict[str, Any]:
    """Export a cleaned speech sample from *pdf_path* plus benchmark commands.

    Runs ``dry_run_pdf`` on the PDF, writes the stripped sample text to
    ``arabic-tts-sample.txt`` inside *out_dir* and, unless *write_report* is
    false, writes ``external-tts-sample.md`` next to it.

    Args:
        pdf_path: PDF handed to ``dry_run_pdf``.
        out_dir: Directory for the sample and report; created if missing.
        max_chars: Forwarded to ``dry_run_pdf`` as ``speech_sample_chars``.
        chunk_size: Forwarded to ``dry_run_pdf``.
        ocr_engine: Forwarded to ``dry_run_pdf``.
        from_extraction: Forwarded to ``dry_run_pdf``.
        env_file: Forwarded to ``dry_run_pdf``.
        write_report: Whether to write the Markdown report.

    Returns:
        A summary dict with the written paths, size and quality figures taken
        from the dry run, and the generated ``commands`` mapping.  The
        ``reportPath`` entry is present even when no report is written.

    Raises:
        RuntimeError: If the dry run does not report ``readyForTts``.
    """
    analysis = dry_run_pdf(
        pdf_path,
        chunk_size=chunk_size,
        ocr_engine=ocr_engine,
        from_extraction=from_extraction,
        env_file=env_file,
        include_speech_text=True,
        speech_sample_chars=max_chars,
    )

    if not analysis["readyForTts"]:
        why = "; ".join(map(str, analysis.get("qualityReasons", [])))
        message = f"OCR text is not ready for TTS. {why}"
        # strip() drops the trailing space left behind when no reasons exist.
        raise RuntimeError(message.strip())

    excerpt = str(analysis["speechSampleText"]).strip()
    out_dir.mkdir(parents=True, exist_ok=True)
    sample_file = out_dir / "arabic-tts-sample.txt"
    sample_file.write_text(excerpt + "\n", encoding="utf-8")
    report_file = out_dir / "external-tts-sample.md"
    commands = build_external_commands(sample_file, out_dir)

    summary: dict[str, Any] = {
        "ready": True,
        "pdf": str(pdf_path),
        "textPath": str(sample_file),
        "reportPath": str(report_file),
        "characters": len(excerpt),
    }
    # Copy the dry-run figures across; insertion order fixes the JSON key order.
    passthrough = (
        ("fullSpeechCharacters", "speechCharacters"),
        ("arabicWords", "arabicWords"),
        ("quality", "quality"),
        ("qualityScore", "qualityScore"),
        ("qualityReasons", "qualityReasons"),
        ("ocrEngine", "ocrEngine"),
        ("extraction", "extraction"),
    )
    for target_key, source_key in passthrough:
        summary[target_key] = analysis[source_key]
    summary["commands"] = commands

    if write_report:
        write_markdown_report(report_file, summary)
    return summary
def main_cli() -> None:
    """Parse the command line, export the sample, and print the outcome.

    Prints the full result as JSON when ``--json`` is given; otherwise prints
    the sample path and, unless ``--no-report`` was passed, the report path.
    """
    # Re-encode stdout as UTF-8 where the stream supports it.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

    cli = argparse.ArgumentParser(
        description="Export the same cleaned Arabic text sample for external TTS benchmarking."
    )
    cli.add_argument("pdf", type=Path, help="Arabic PDF to extract a cleaned speech sample from.")
    cli.add_argument("--out-dir", type=Path, default=ROOT_DIR / "outputs" / "external-tts-sample")
    cli.add_argument("--max-chars", type=int, default=1200, help="Maximum cleaned characters to export.")
    cli.add_argument("--chunk-size", type=int, default=main.CLOUD_TTS_MAX_CHARS)
    cli.add_argument("--ocr-engine", choices=sorted(main.OCR_ENGINE_CHOICES), help="OCR engine to test.")
    cli.add_argument("--from-extraction", help="Apply settings from a benchmark extraction label.")
    cli.add_argument("--env-file", type=Path, help="Load OCR settings from a generated OCR .env snippet.")
    cli.add_argument("--no-report", action="store_true", help="Only write the text file.")
    cli.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.")
    options = cli.parse_args()

    wants_report = not options.no_report
    summary = export_tts_sample(
        options.pdf,
        out_dir=options.out_dir,
        max_chars=options.max_chars,
        chunk_size=options.chunk_size,
        ocr_engine=options.ocr_engine,
        from_extraction=options.from_extraction,
        env_file=options.env_file,
        write_report=wants_report,
    )

    if options.json:
        print(json.dumps(summary, ensure_ascii=False, indent=2))
        return

    print(f"Wrote Arabic TTS sample: {summary['textPath']}")
    if wants_report:
        print(f"Wrote benchmark handoff: {summary['reportPath']}")
# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main_cli()