from __future__ import annotations
import argparse
import json
import sys
from pathlib import Path
from typing import Any
# Directory two levels above this file's resolved location (its parent's parent).
ROOT_DIR = Path(__file__).resolve().parent.parent
# Prepend that directory to sys.path once; presumably this lets the `app` and
# `scripts` imports that follow resolve when the file is run directly as a
# script — confirm against the repository layout.
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))
from app import main
from scripts.dry_run_pdf import dry_run_pdf
def _quote_if_spaced(text: str) -> str:
    """Wrap *text* in double quotes when it contains a space, else return it as-is."""
    return f'"{text}"' if " " in text else text


def safe_command_path(path: Path) -> str:
    """Return *path* as command-line text, double-quoted when it contains a space."""
    return _quote_if_spaced(str(path))


def build_external_commands(text_path: Path, output_dir: Path) -> dict[str, str]:
    """Build the benchmark commands and instructions for one exported text sample.

    Args:
        text_path: The cleaned text sample every command should read.
        output_dir: Directory under which the commands place their outputs.

    Returns:
        A dict keyed by camelCase identifiers. Values are either command lines
        (written with Windows ``\\`` separators) or free-text instructions that
        mention the sample path. Reference-audio paths are literal placeholders.

    Paths below ``output_dir`` are quoted as a whole. Quoting only the directory
    and appending ``\\child`` afterwards would place the closing quote in the
    middle of the path (``"C:\\my dir"\\child``), which a shell may not treat as
    a single argument.
    """
    text_arg = safe_command_path(text_path)

    def out(name: str) -> str:
        # Join first, quote second, so the quotes always enclose the full path.
        return _quote_if_spaced(f"{output_dir}\\{name}")

    mishkala_sample = out("arabic-tts-sample-mishkala.txt")
    tashkeel_sample = out("arabic-tts-sample-tashkeel350.txt")
    preprocessor_score_json = out("tts-preprocessor-score.json")
    voice_score_json = out("voice-listening-score.json")

    return {
        "localVoiceBenchmark": (
            f"python scripts\\benchmark_voices.py --text-file {text_arg} "
            f"--out-dir {out('local-voices')} --write-report {out('local-voices.md')}"
        ),
        "mossTtsNanoOnnx": (
            "moss-tts-nano generate --backend onnx --language ar "
            f"--text-file {text_arg} --prompt-speech C:\\path\\to\\arabic-reference.wav"
        ),
        "mossTtsNanoServer": "moss-tts-nano serve --backend onnx",
        "supertonicLocal": (
            f"python scripts\\benchmark_voices.py --voices supertonic-ar --text-file {text_arg} "
            f"--out-dir {out('supertonic')} --write-report {out('supertonic.md')}"
        ),
        "mishkalaTashkeelExternal": (
            "Diacritize the same cleaned sample with flokymind/mishkala and save it beside the plain sample, "
            f"for example {mishkala_sample}. Then synthesize both files with the same voice."
        ),
        "mishkalaVoiceBenchmark": (
            f"python scripts\\benchmark_voices.py --text-file {mishkala_sample} "
            f"--out-dir {out('mishkala-local-voices')} --write-report {out('mishkala-local-voices.md')}"
        ),
        "tashkeel350External": (
            "Diacritize the same cleaned sample with Etherll/Tashkeel-350M and save it beside the plain sample, "
            f"for example {tashkeel_sample}. Then synthesize plain, Mishkala, "
            "and Tashkeel-350M samples with the same voice before choosing a preprocessor."
        ),
        "tashkeel350VoiceBenchmark": (
            f"python scripts\\benchmark_voices.py --text-file {tashkeel_sample} "
            f"--out-dir {out('tashkeel350-local-voices')} --write-report {out('tashkeel350-local-voices.md')}"
        ),
        "preprocessorListeningScore": (
            "python scripts\\score_tts_preprocessor.py "
            "--rating plain=4,5,5,4,4 --rating mishkala=5,4,4,4,4 --rating tashkeel350=5,4,4,4,4 "
            f"--write-report {out('tts-preprocessor-score.md')} "
            f"--write-json {preprocessor_score_json}"
        ),
        "voiceListeningScore": (
            "python scripts\\score_voice_listening.py "
            "--rating silma-local=5,4,4,5,5 --rating espeak-ar-clear=3,2,4,3,5 "
            f"--write-report {out('voice-listening-score.md')} "
            f"--write-json {voice_score_json}"
        ),
        "voicePromotionGate": (
            "python scripts\\model_promotion_gate.py "
            "--candidate-name \"External voice winner\" --kind tts --license Apache-2.0 "
            f"--score-json {voice_score_json} "
            "--same-sample --runtime-ok --privacy-ok --human-reviewed "
            f"--write-report {out('voice-promotion-gate.md')}"
        ),
        "preprocessorPromotionGate": (
            "python scripts\\model_promotion_gate.py "
            "--candidate-name \"TTS preprocessor winner\" --kind preprocessor --license Apache-2.0 "
            f"--score-json {preprocessor_score_json} "
            "--same-sample --runtime-ok --privacy-ok --human-reviewed "
            f"--write-report {out('preprocessor-promotion-gate.md')}"
        ),
        "omniVoiceExternal": (
            "python -m omnivoice.cli "
            f"--model k2-fsa/OmniVoice --language ar --text-file {text_arg} "
            "--ref-audio C:\\path\\to\\arabic-reference.wav --ref-text \"Arabic reference transcript\""
        ),
        "omniVoiceArabicLoraExternal": (
            "Run OmniVoice with the Arabic LoRA adapter vivooglobal/omnivoice-lora-ar on the same text and "
            "reference audio after the base OmniVoice benchmark works."
        ),
        "tadaExternal": (
            "Benchmark HumeAI/tada-3b-ml externally with language=\"ar\" on the same cleaned sample "
            f"{text_arg}. It is designed to reduce off-script speech, but it reports the Llama 3.2 "
            "license and is a 3B-class strong-worker path, so keep it outside the permissive default."
        ),
        "lahgtnaChatterboxExternal": (
            "python -m chatterbox.tts "
            f"--model oddadmix/lahgtna-chatterbox-v1 --text-file {text_arg} "
            "--audio-prompt-path C:\\path\\to\\arabic-reference.wav --repetition-penalty 1.25"
        ),
        "namaaSaudiTtsExternal": (
            "Use ChatterboxMultilingualTTS with the NAMAA-Space/NAMAA-Saudi-TTS safetensors on "
            f"{text_arg}, language_id=\"ar\", and the same optional reference audio. Benchmark only for "
            "Saudi/Gulf dialect fit, then compare against SILMA, Habibi, Saudi Arabic Qwen3-TTS, and Emirati voices."
        ),
        "saudiChatterboxFineTuneExternal": (
            "Use ChatterboxMultilingualTTS with FatimahEmadEldin/saudi-tts-chatterbox-finetuned T3 weights on "
            f"{text_arg}, language_id=\"ar\", and the same optional reference audio. Benchmark only for "
            "Saudi/Gulf dialect fit, then compare against NAMAA-Saudi-TTS, SILMA, Habibi, Saudi Arabic Qwen3-TTS, "
            "and Emirati voices."
        ),
        "nileTtsExternal": (
            "Benchmark KickItLikeShika/NileTTS-XTTS only for Egyptian/dialectal Arabic using this same cleaned "
            f"sample {text_arg}. It is Apache-2.0, but not an MSA book default."
        ),
        "chatterboxMultilingualExternal": (
            "Use ChatterboxMultilingualTTS.from_pretrained(...).generate(text, language_id=\"ar\", "
            "audio_prompt_path=\"C:\\path\\to\\arabic-reference.wav\") against "
            f"{text_arg}; compare pacing and pronunciation against SILMA and Habibi."
        ),
        "chatterboxMultilingualOnnxExternal": (
            "Benchmark onnx-community/chatterbox-multilingual-ONNX with language_id=\"ar\" against "
            f"{text_arg}; compare CPU/ONNX runtime, repetition, pacing, and pronunciation against SILMA, "
            "Habibi, and the regular Chatterbox-Multilingual path."
        ),
        "ttsArabicOnnxExternal": (
            "Benchmark nipponjo/tts-arabic-onnx with the same cleaned sample "
            f"{text_arg}; try FastPitch, MixerTTS, speaker IDs, pace, and vowelizer options, then compare "
            "CPU runtime and pronunciation against SILMA, Supertonic, MOSS-TTS-Nano, and Chatterbox ONNX. "
            "Confirm model/repo licensing before production use."
        ),
        "sparkTtsArabicExternal": (
            "Spark-TTS Arabic requires the Spark-TTS repo plus diacritized Arabic/reference audio; benchmark it "
            f"externally with {text_arg} only after preparing that reference workflow."
        ),
        "sofeliaTtsExternal": (
            "Sofelia-TTS is a Palestinian Arabic/MiraTTS voice-cloning path; benchmark it only for dialectal text "
            f"using the same sample {text_arg}."
        ),
        "arabicF5TtsCaution": (
            "Arabic-F5-TTS-v2 is non-commercial and requires fully diacritized Arabic; keep it to personal "
            "experiments unless that license and input requirement are acceptable."
        ),
        "threeArabTtsExternal": (
            "Benchmark sherif1313/3arab-TTS-500M-v1 and the VoiceDesign variant on this same cleaned Arabic "
            f"sample: {text_arg}. It is Apache-2.0 and Arabic-only, but new enough that listenability and "
            "long-form stability need manual checks before app wiring."
        ),
        "voxcpm2External": f"Use {text_arg} as the exact Arabic text sample when testing VoxCPM2 externally.",
        "voxtralTtsCaution": (
            "Voxtral TTS supports Arabic on its model card, but it is CC-BY-NC-4.0 and GPU-heavy; "
            f"use {text_arg} only for personal/non-commercial strong-worker listening comparisons."
        ),
        "qwen3TtsCaution": (
            "Do not promote Qwen3-TTS for this Arabic reader until an official Arabic-capable checkpoint "
            "or Arabic fine-tune is verified on this same sample."
        ),
    }
def write_markdown_report(path: Path, result: dict[str, Any]) -> None:
    """Render the benchmark handoff as Markdown and write it to *path*.

    Args:
        path: Destination file; missing parent directories are created.
        result: Export summary. ``commands``, ``textPath``, ``characters``,
            ``arabicWords`` and ``quality`` are required; ``pdf`` and
            ``extraction`` fall back to ``-`` when absent.

    Raises:
        KeyError: If a required key, or a command used by a section, is missing.
    """
    commands = result["commands"]

    # (heading, code-fence language, key in ``commands``), in report order.
    sections = (
        ("Local installed voices:", "powershell", "localVoiceBenchmark"),
        ("MOSS-TTS-Nano ONNX external benchmark:", "powershell", "mossTtsNanoOnnx"),
        ("MOSS-TTS-Nano local server:", "powershell", "mossTtsNanoServer"),
        ("Supertonic 3 local CPU benchmark:", "powershell", "supertonicLocal"),
        ("Mishkala Tashkeel pronunciation preprocessor:", "text", "mishkalaTashkeelExternal"),
        ("Mishkala local voice benchmark:", "powershell", "mishkalaVoiceBenchmark"),
        ("Tashkeel-350M pronunciation preprocessor:", "text", "tashkeel350External"),
        ("Tashkeel-350M local voice benchmark:", "powershell", "tashkeel350VoiceBenchmark"),
        ("Plain vs Mishkala vs Tashkeel-350M listening score:", "powershell", "preprocessorListeningScore"),
        ("Preprocessor promotion gate:", "powershell", "preprocessorPromotionGate"),
        ("Voice listening score:", "powershell", "voiceListeningScore"),
        ("Voice promotion gate:", "powershell", "voicePromotionGate"),
        ("OmniVoice external benchmark:", "powershell", "omniVoiceExternal"),
        ("OmniVoice Arabic LoRA external benchmark:", "text", "omniVoiceArabicLoraExternal"),
        ("TADA multilingual external benchmark:", "text", "tadaExternal"),
        ("Lahgtna Chatterbox external benchmark:", "powershell", "lahgtnaChatterboxExternal"),
        ("NAMAA-Saudi-TTS external benchmark:", "text", "namaaSaudiTtsExternal"),
        ("Saudi Chatterbox fine-tune external benchmark:", "text", "saudiChatterboxFineTuneExternal"),
        ("NileTTS-XTTS Egyptian Arabic benchmark:", "text", "nileTtsExternal"),
        ("Chatterbox-Multilingual external benchmark:", "text", "chatterboxMultilingualExternal"),
        ("Chatterbox-Multilingual ONNX external benchmark:", "text", "chatterboxMultilingualOnnxExternal"),
        ("tts-arabic-onnx external benchmark:", "text", "ttsArabicOnnxExternal"),
        ("Spark-TTS Arabic external benchmark:", "text", "sparkTtsArabicExternal"),
        ("Sofelia-TTS external benchmark:", "text", "sofeliaTtsExternal"),
        ("Arabic-F5-TTS-v2 caution:", "text", "arabicF5TtsCaution"),
        ("3arab-TTS 500M external benchmark:", "text", "threeArabTtsExternal"),
        ("VoxCPM2 external benchmark:", "text", "voxcpm2External"),
        ("Voxtral TTS caution:", "text", "voxtralTtsCaution"),
        ("Qwen3-TTS caution:", "text", "qwen3TtsCaution"),
    )

    report = [
        "# External Arabic TTS Sample",
        "",
        f"PDF: `{result.get('pdf', '-')}`",
        f"Text file: `{result['textPath']}`",
        f"Characters: {result['characters']}",
        f"Arabic words: {result['arabicWords']}",
        f"OCR extraction: `{result.get('extraction', '-')}`",
        f"Quality: `{result['quality']}`",
        "",
        "Use this same cleaned Arabic text for every voice/model comparison. Do not compare voices with different OCR text.",
        "",
        "## Commands",
        "",
    ]

    # Each section is: heading, blank line, fenced command, blank line.
    for heading, fence, key in sections:
        report.extend((heading, "", f"```{fence}\n{commands[key]}\n```", ""))

    report.extend(
        (
            "## Listening Checklist",
            "",
            "- Arabic pronunciation is clear and not robotic.",
            "- Pauses are comfortable for long book passages.",
            "- Numbers, Quranic symbols, and punctuation are not read strangely.",
            "- Runtime is acceptable before processing a full book.",
            "- Replace placeholder candidate names and licenses in promotion-gate commands before changing the production default.",
        )
    )

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(report) + "\n", encoding="utf-8")
def export_tts_sample(
    pdf_path: Path,
    out_dir: Path = ROOT_DIR / "outputs" / "external-tts-sample",
    max_chars: int = 1200,
    chunk_size: int = main.CLOUD_TTS_MAX_CHARS,
    ocr_engine: str | None = None,
    from_extraction: str | None = None,
    env_file: Path | None = None,
    write_report: bool = True,
) -> dict[str, Any]:
    """Export a speech sample from *pdf_path* together with benchmark commands.

    Calls ``dry_run_pdf`` with the given settings, writes the stripped sample
    text to ``arabic-tts-sample.txt`` inside *out_dir*, and optionally writes
    the Markdown handoff ``external-tts-sample.md`` beside it.

    Args:
        pdf_path: PDF handed to ``dry_run_pdf``.
        out_dir: Directory for the sample and report; created if missing.
        max_chars: Passed to ``dry_run_pdf`` as ``speech_sample_chars``.
        chunk_size: Passed through to ``dry_run_pdf``.
        ocr_engine: Passed through to ``dry_run_pdf``.
        from_extraction: Passed through to ``dry_run_pdf``.
        env_file: Passed through to ``dry_run_pdf``.
        write_report: When false, only the text file is written.

    Returns:
        A summary dict with the written paths, sample statistics, the quality
        fields copied from the dry run, and the generated commands. It always
        includes ``reportPath``, even when no report was written.

    Raises:
        RuntimeError: If the dry run does not report ``readyForTts``.
    """
    analysis = dry_run_pdf(
        pdf_path,
        chunk_size=chunk_size,
        ocr_engine=ocr_engine,
        from_extraction=from_extraction,
        env_file=env_file,
        include_speech_text=True,
        speech_sample_chars=max_chars,
    )
    if not analysis["readyForTts"]:
        reason_text = "; ".join(map(str, analysis.get("qualityReasons", [])))
        # strip() removes the trailing space left when there are no reasons.
        raise RuntimeError(f"OCR text is not ready for TTS. {reason_text}".strip())

    sample = str(analysis["speechSampleText"]).strip()
    out_dir.mkdir(parents=True, exist_ok=True)
    sample_file = out_dir / "arabic-tts-sample.txt"
    sample_file.write_text(sample + "\n", encoding="utf-8")
    report_file = out_dir / "external-tts-sample.md"

    commands = build_external_commands(sample_file, out_dir)

    summary: dict[str, Any] = {
        "ready": True,
        "pdf": str(pdf_path),
        "textPath": str(sample_file),
        "reportPath": str(report_file),
        "characters": len(sample),
        "fullSpeechCharacters": analysis["speechCharacters"],
    }
    # These fields are copied from the dry run under the same key names.
    for key in ("arabicWords", "quality", "qualityScore", "qualityReasons", "ocrEngine", "extraction"):
        summary[key] = analysis[key]
    summary["commands"] = commands

    if write_report:
        write_markdown_report(report_file, summary)
    return summary
def main_cli() -> None:
    """Parse command-line arguments, run the export, and print the outcome.

    Prints the full result as JSON when ``--json`` is given. Otherwise prints
    the sample path, plus the report path unless ``--no-report`` was passed.
    """
    stdout = sys.stdout
    if hasattr(stdout, "reconfigure"):
        # Emit UTF-8 and substitute characters that cannot be encoded instead of raising.
        stdout.reconfigure(encoding="utf-8", errors="replace")

    default_out_dir = ROOT_DIR / "outputs" / "external-tts-sample"

    cli = argparse.ArgumentParser(
        description="Export the same cleaned Arabic text sample for external TTS benchmarking."
    )
    cli.add_argument("pdf", type=Path, help="Arabic PDF to extract a cleaned speech sample from.")
    cli.add_argument("--out-dir", type=Path, default=default_out_dir)
    cli.add_argument("--max-chars", type=int, default=1200, help="Maximum cleaned characters to export.")
    cli.add_argument("--chunk-size", type=int, default=main.CLOUD_TTS_MAX_CHARS)
    cli.add_argument("--ocr-engine", choices=sorted(main.OCR_ENGINE_CHOICES), help="OCR engine to test.")
    cli.add_argument("--from-extraction", help="Apply settings from a benchmark extraction label.")
    cli.add_argument("--env-file", type=Path, help="Load OCR settings from a generated OCR .env snippet.")
    cli.add_argument("--no-report", action="store_true", help="Only write the text file.")
    cli.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.")
    options = cli.parse_args()

    wants_report = not options.no_report
    outcome = export_tts_sample(
        options.pdf,
        out_dir=options.out_dir,
        max_chars=options.max_chars,
        chunk_size=options.chunk_size,
        ocr_engine=options.ocr_engine,
        from_extraction=options.from_extraction,
        env_file=options.env_file,
        write_report=wants_report,
    )

    if options.json:
        print(json.dumps(outcome, ensure_ascii=False, indent=2))
        return

    print(f"Wrote Arabic TTS sample: {outcome['textPath']}")
    if wants_report:
        print(f"Wrote benchmark handoff: {outcome['reportPath']}")
# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main_cli()
|