File size: 17,512 Bytes
2e1a095
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Any

# Directory two levels above this file's resolved location (presumably the project root).
ROOT_DIR = Path(__file__).resolve().parent.parent
# Prepend that directory to sys.path, once, before the `app` / `scripts` imports that follow
# (presumably so they resolve when this file is executed directly as a script).
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from app import main
from scripts.dry_run_pdf import dry_run_pdf


def safe_command_path(path: Path) -> str:
    """Render *path* for pasting into a command line.

    The path is converted with ``str()``. It is returned unchanged when it
    contains no space character; otherwise it is wrapped in double quotes.
    """
    rendered = str(path)
    if " " not in rendered:
        return rendered
    return '"' + rendered + '"'


def build_external_commands(text_path: Path, output_dir: Path) -> dict[str, str]:
    """Build the benchmark hand-off commands and notes for one exported sample.

    Args:
        text_path: The cleaned Arabic text sample every command should read.
        output_dir: Directory under which the suggested output files are placed.

    Returns:
        A dict keyed by a camelCase label. Each value is either a ready-to-paste
        command line or a free-text instruction. Paths below ``output_dir`` are
        written with literal backslashes, and the reference-audio paths are
        ``C:\\path\\to\\...`` placeholders the reader is expected to replace.

    Paths below ``output_dir`` are joined first and quoted as a whole. Quoting
    only the directory and appending the child name outside the quotes produced
    ``"C:\\my dir"\\local-voices``, which PowerShell does not reliably parse as a
    single argument. When ``output_dir`` contains no space the generated text is
    unchanged.
    """

    def quoted(text: str) -> str:
        # Same rule as safe_command_path: add double quotes only when a space is present.
        return f'"{text}"' if " " in text else text

    def out(name: str) -> str:
        # Join with a literal backslash (the commands are written Windows-style),
        # then quote the complete path so the quotes enclose the child name too.
        return quoted(f"{output_dir}\\{name}")

    text_arg = quoted(str(text_path))
    return {
        "localVoiceBenchmark": (
            f"python scripts\\benchmark_voices.py --text-file {text_arg} "
            f"--out-dir {out('local-voices')} --write-report {out('local-voices.md')}"
        ),
        "mossTtsNanoOnnx": (
            "moss-tts-nano generate --backend onnx --language ar "
            f"--text-file {text_arg} --prompt-speech C:\\path\\to\\arabic-reference.wav"
        ),
        "mossTtsNanoServer": "moss-tts-nano serve --backend onnx",
        "supertonicLocal": (
            f"python scripts\\benchmark_voices.py --voices supertonic-ar --text-file {text_arg} "
            f"--out-dir {out('supertonic')} --write-report {out('supertonic.md')}"
        ),
        "mishkalaTashkeelExternal": (
            "Diacritize the same cleaned sample with flokymind/mishkala and save it beside the plain sample, "
            f"for example {out('arabic-tts-sample-mishkala.txt')}. Then synthesize both files with the same voice."
        ),
        "mishkalaVoiceBenchmark": (
            f"python scripts\\benchmark_voices.py --text-file {out('arabic-tts-sample-mishkala.txt')} "
            f"--out-dir {out('mishkala-local-voices')} --write-report {out('mishkala-local-voices.md')}"
        ),
        "tashkeel350External": (
            "Diacritize the same cleaned sample with Etherll/Tashkeel-350M and save it beside the plain sample, "
            f"for example {out('arabic-tts-sample-tashkeel350.txt')}. Then synthesize plain, Mishkala, "
            "and Tashkeel-350M samples with the same voice before choosing a preprocessor."
        ),
        "tashkeel350VoiceBenchmark": (
            f"python scripts\\benchmark_voices.py --text-file {out('arabic-tts-sample-tashkeel350.txt')} "
            f"--out-dir {out('tashkeel350-local-voices')} --write-report {out('tashkeel350-local-voices.md')}"
        ),
        "preprocessorListeningScore": (
            "python scripts\\score_tts_preprocessor.py "
            "--rating plain=4,5,5,4,4 --rating mishkala=5,4,4,4,4 --rating tashkeel350=5,4,4,4,4 "
            f"--write-report {out('tts-preprocessor-score.md')} "
            f"--write-json {out('tts-preprocessor-score.json')}"
        ),
        "voiceListeningScore": (
            "python scripts\\score_voice_listening.py "
            "--rating silma-local=5,4,4,5,5 --rating espeak-ar-clear=3,2,4,3,5 "
            f"--write-report {out('voice-listening-score.md')} "
            f"--write-json {out('voice-listening-score.json')}"
        ),
        "voicePromotionGate": (
            "python scripts\\model_promotion_gate.py "
            "--candidate-name \"External voice winner\" --kind tts --license Apache-2.0 "
            f"--score-json {out('voice-listening-score.json')} "
            "--same-sample --runtime-ok --privacy-ok --human-reviewed "
            f"--write-report {out('voice-promotion-gate.md')}"
        ),
        "preprocessorPromotionGate": (
            "python scripts\\model_promotion_gate.py "
            "--candidate-name \"TTS preprocessor winner\" --kind preprocessor --license Apache-2.0 "
            f"--score-json {out('tts-preprocessor-score.json')} "
            "--same-sample --runtime-ok --privacy-ok --human-reviewed "
            f"--write-report {out('preprocessor-promotion-gate.md')}"
        ),
        "omniVoiceExternal": (
            "python -m omnivoice.cli "
            f"--model k2-fsa/OmniVoice --language ar --text-file {text_arg} "
            "--ref-audio C:\\path\\to\\arabic-reference.wav --ref-text \"Arabic reference transcript\""
        ),
        "omniVoiceArabicLoraExternal": (
            "Run OmniVoice with the Arabic LoRA adapter vivooglobal/omnivoice-lora-ar on the same text and "
            "reference audio after the base OmniVoice benchmark works."
        ),
        "tadaExternal": (
            "Benchmark HumeAI/tada-3b-ml externally with language=\"ar\" on the same cleaned sample "
            f"{text_arg}. It is designed to reduce off-script speech, but it reports the Llama 3.2 "
            "license and is a 3B-class strong-worker path, so keep it outside the permissive default."
        ),
        "lahgtnaChatterboxExternal": (
            "python -m chatterbox.tts "
            f"--model oddadmix/lahgtna-chatterbox-v1 --text-file {text_arg} "
            "--audio-prompt-path C:\\path\\to\\arabic-reference.wav --repetition-penalty 1.25"
        ),
        "namaaSaudiTtsExternal": (
            "Use ChatterboxMultilingualTTS with the NAMAA-Space/NAMAA-Saudi-TTS safetensors on "
            f"{text_arg}, language_id=\"ar\", and the same optional reference audio. Benchmark only for "
            "Saudi/Gulf dialect fit, then compare against SILMA, Habibi, Saudi Arabic Qwen3-TTS, and Emirati voices."
        ),
        "saudiChatterboxFineTuneExternal": (
            "Use ChatterboxMultilingualTTS with FatimahEmadEldin/saudi-tts-chatterbox-finetuned T3 weights on "
            f"{text_arg}, language_id=\"ar\", and the same optional reference audio. Benchmark only for "
            "Saudi/Gulf dialect fit, then compare against NAMAA-Saudi-TTS, SILMA, Habibi, Saudi Arabic Qwen3-TTS, "
            "and Emirati voices."
        ),
        "nileTtsExternal": (
            "Benchmark KickItLikeShika/NileTTS-XTTS only for Egyptian/dialectal Arabic using this same cleaned "
            f"sample {text_arg}. It is Apache-2.0, but not an MSA book default."
        ),
        "chatterboxMultilingualExternal": (
            "Use ChatterboxMultilingualTTS.from_pretrained(...).generate(text, language_id=\"ar\", "
            "audio_prompt_path=\"C:\\path\\to\\arabic-reference.wav\") against "
            f"{text_arg}; compare pacing and pronunciation against SILMA and Habibi."
        ),
        "chatterboxMultilingualOnnxExternal": (
            "Benchmark onnx-community/chatterbox-multilingual-ONNX with language_id=\"ar\" against "
            f"{text_arg}; compare CPU/ONNX runtime, repetition, pacing, and pronunciation against SILMA, "
            "Habibi, and the regular Chatterbox-Multilingual path."
        ),
        "ttsArabicOnnxExternal": (
            "Benchmark nipponjo/tts-arabic-onnx with the same cleaned sample "
            f"{text_arg}; try FastPitch, MixerTTS, speaker IDs, pace, and vowelizer options, then compare "
            "CPU runtime and pronunciation against SILMA, Supertonic, MOSS-TTS-Nano, and Chatterbox ONNX. "
            "Confirm model/repo licensing before production use."
        ),
        "sparkTtsArabicExternal": (
            "Spark-TTS Arabic requires the Spark-TTS repo plus diacritized Arabic/reference audio; benchmark it "
            f"externally with {text_arg} only after preparing that reference workflow."
        ),
        "sofeliaTtsExternal": (
            "Sofelia-TTS is a Palestinian Arabic/MiraTTS voice-cloning path; benchmark it only for dialectal text "
            f"using the same sample {text_arg}."
        ),
        "arabicF5TtsCaution": (
            "Arabic-F5-TTS-v2 is non-commercial and requires fully diacritized Arabic; keep it to personal "
            "experiments unless that license and input requirement are acceptable."
        ),
        "threeArabTtsExternal": (
            "Benchmark sherif1313/3arab-TTS-500M-v1 and the VoiceDesign variant on this same cleaned Arabic "
            f"sample: {text_arg}. It is Apache-2.0 and Arabic-only, but new enough that listenability and "
            "long-form stability need manual checks before app wiring."
        ),
        "voxcpm2External": f"Use {text_arg} as the exact Arabic text sample when testing VoxCPM2 externally.",
        "voxtralTtsCaution": (
            "Voxtral TTS supports Arabic on its model card, but it is CC-BY-NC-4.0 and GPU-heavy; "
            f"use {text_arg} only for personal/non-commercial strong-worker listening comparisons."
        ),
        "qwen3TtsCaution": (
            "Do not promote Qwen3-TTS for this Arabic reader until an official Arabic-capable checkpoint "
            "or Arabic fine-tune is verified on this same sample."
        ),
    }


def write_markdown_report(path: Path, result: dict[str, Any]) -> None:
    """Write the Markdown benchmark hand-off report to *path*.

    Args:
        path: Destination file. Missing parent directories are created.
        result: Export summary. ``commands``, ``textPath``, ``characters``,
            ``arabicWords`` and ``quality`` are required; ``pdf`` and
            ``extraction`` fall back to ``-`` when absent.

    Raises:
        KeyError: If a required key is missing from ``result`` or a referenced
            label is missing from ``result["commands"]``.
    """
    commands = result["commands"]

    # (heading, code-fence language, label in ``commands``), in report order.
    sections = (
        ("Local installed voices:", "powershell", "localVoiceBenchmark"),
        ("MOSS-TTS-Nano ONNX external benchmark:", "powershell", "mossTtsNanoOnnx"),
        ("MOSS-TTS-Nano local server:", "powershell", "mossTtsNanoServer"),
        ("Supertonic 3 local CPU benchmark:", "powershell", "supertonicLocal"),
        ("Mishkala Tashkeel pronunciation preprocessor:", "text", "mishkalaTashkeelExternal"),
        ("Mishkala local voice benchmark:", "powershell", "mishkalaVoiceBenchmark"),
        ("Tashkeel-350M pronunciation preprocessor:", "text", "tashkeel350External"),
        ("Tashkeel-350M local voice benchmark:", "powershell", "tashkeel350VoiceBenchmark"),
        ("Plain vs Mishkala vs Tashkeel-350M listening score:", "powershell", "preprocessorListeningScore"),
        ("Preprocessor promotion gate:", "powershell", "preprocessorPromotionGate"),
        ("Voice listening score:", "powershell", "voiceListeningScore"),
        ("Voice promotion gate:", "powershell", "voicePromotionGate"),
        ("OmniVoice external benchmark:", "powershell", "omniVoiceExternal"),
        ("OmniVoice Arabic LoRA external benchmark:", "text", "omniVoiceArabicLoraExternal"),
        ("TADA multilingual external benchmark:", "text", "tadaExternal"),
        ("Lahgtna Chatterbox external benchmark:", "powershell", "lahgtnaChatterboxExternal"),
        ("NAMAA-Saudi-TTS external benchmark:", "text", "namaaSaudiTtsExternal"),
        ("Saudi Chatterbox fine-tune external benchmark:", "text", "saudiChatterboxFineTuneExternal"),
        ("NileTTS-XTTS Egyptian Arabic benchmark:", "text", "nileTtsExternal"),
        ("Chatterbox-Multilingual external benchmark:", "text", "chatterboxMultilingualExternal"),
        ("Chatterbox-Multilingual ONNX external benchmark:", "text", "chatterboxMultilingualOnnxExternal"),
        ("tts-arabic-onnx external benchmark:", "text", "ttsArabicOnnxExternal"),
        ("Spark-TTS Arabic external benchmark:", "text", "sparkTtsArabicExternal"),
        ("Sofelia-TTS external benchmark:", "text", "sofeliaTtsExternal"),
        ("Arabic-F5-TTS-v2 caution:", "text", "arabicF5TtsCaution"),
        ("3arab-TTS 500M external benchmark:", "text", "threeArabTtsExternal"),
        ("VoxCPM2 external benchmark:", "text", "voxcpm2External"),
        ("Voxtral TTS caution:", "text", "voxtralTtsCaution"),
        ("Qwen3-TTS caution:", "text", "qwen3TtsCaution"),
    )

    report = [
        "# External Arabic TTS Sample",
        "",
        f"PDF: `{result.get('pdf', '-')}`",
        f"Text file: `{result['textPath']}`",
        f"Characters: {result['characters']}",
        f"Arabic words: {result['arabicWords']}",
        f"OCR extraction: `{result.get('extraction', '-')}`",
        f"Quality: `{result['quality']}`",
        "",
        "Use this same cleaned Arabic text for every voice/model comparison. Do not compare voices with different OCR text.",
        "",
        "## Commands",
        "",
    ]

    # Every section has the same shape: heading, blank, fenced body, blank.
    for heading, fence, label in sections:
        report += [heading, "", f"```{fence}\n{commands[label]}\n```", ""]

    report += [
        "## Listening Checklist",
        "",
        "- Arabic pronunciation is clear and not robotic.",
        "- Pauses are comfortable for long book passages.",
        "- Numbers, Quranic symbols, and punctuation are not read strangely.",
        "- Runtime is acceptable before processing a full book.",
        "- Replace placeholder candidate names and licenses in promotion-gate commands before changing the production default.",
    ]

    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(report) + "\n", encoding="utf-8")


def export_tts_sample(
    pdf_path: Path,
    out_dir: Path = ROOT_DIR / "outputs" / "external-tts-sample",
    max_chars: int = 1200,
    chunk_size: int = main.CLOUD_TTS_MAX_CHARS,
    ocr_engine: str | None = None,
    from_extraction: str | None = None,
    env_file: Path | None = None,
    write_report: bool = True,
) -> dict[str, Any]:
    """Export one cleaned speech sample from a PDF plus its benchmark hand-off.

    Runs ``dry_run_pdf`` on *pdf_path* with speech text included, writes the
    stripped sample to ``arabic-tts-sample.txt`` inside *out_dir*, and
    optionally writes ``external-tts-sample.md`` next to it.

    Args:
        pdf_path: PDF to run the dry run on.
        out_dir: Directory for the sample and report; created if missing.
        max_chars: Forwarded to the dry run as ``speech_sample_chars``.
        chunk_size: Forwarded to the dry run as ``chunk_size``.
        ocr_engine: Forwarded to the dry run unchanged.
        from_extraction: Forwarded to the dry run unchanged.
        env_file: Forwarded to the dry run unchanged.
        write_report: When false, only the text file is written.

    Returns:
        A summary dict holding the written paths, the sample size, selected
        dry-run fields and the generated ``commands``. ``reportPath`` is
        always present, even when the report was not written.

    Raises:
        RuntimeError: If the dry run reports ``readyForTts`` as falsy.
    """
    preview = dry_run_pdf(
        pdf_path,
        chunk_size=chunk_size,
        ocr_engine=ocr_engine,
        from_extraction=from_extraction,
        env_file=env_file,
        include_speech_text=True,
        speech_sample_chars=max_chars,
    )
    if not preview["readyForTts"]:
        why = "; ".join(map(str, preview.get("qualityReasons", [])))
        raise RuntimeError(f"OCR text is not ready for TTS. {why}".strip())

    sample = str(preview["speechSampleText"]).strip()

    out_dir.mkdir(parents=True, exist_ok=True)
    sample_file = out_dir / "arabic-tts-sample.txt"
    sample_file.write_text(sample + "\n", encoding="utf-8")
    report_file = out_dir / "external-tts-sample.md"
    commands = build_external_commands(sample_file, out_dir)

    summary: dict[str, Any] = {
        "ready": True,
        "pdf": str(pdf_path),
        "textPath": str(sample_file),
        "reportPath": str(report_file),
        "characters": len(sample),
        "fullSpeechCharacters": preview["speechCharacters"],
    }
    # Fields copied from the dry run under their own names, in output order.
    for field in ("arabicWords", "quality", "qualityScore", "qualityReasons", "ocrEngine", "extraction"):
        summary[field] = preview[field]
    summary["commands"] = commands

    if write_report:
        write_markdown_report(report_file, summary)
    return summary


def main_cli() -> None:
    """Parse command-line arguments, export the sample, and print the outcome.

    With ``--json`` the full result dict is printed as indented JSON.
    Otherwise the text-file path is printed, followed by the report path
    unless ``--no-report`` was given.
    """
    stdout = sys.stdout
    # Switch stdout to UTF-8 where the stream supports it; undecodable characters are replaced.
    if hasattr(stdout, "reconfigure"):
        stdout.reconfigure(encoding="utf-8", errors="replace")

    cli = argparse.ArgumentParser(
        description="Export the same cleaned Arabic text sample for external TTS benchmarking."
    )
    cli.add_argument("pdf", type=Path, help="Arabic PDF to extract a cleaned speech sample from.")
    cli.add_argument("--out-dir", type=Path, default=ROOT_DIR / "outputs" / "external-tts-sample")
    cli.add_argument("--max-chars", type=int, default=1200, help="Maximum cleaned characters to export.")
    cli.add_argument("--chunk-size", type=int, default=main.CLOUD_TTS_MAX_CHARS)
    cli.add_argument("--ocr-engine", choices=sorted(main.OCR_ENGINE_CHOICES), help="OCR engine to test.")
    cli.add_argument("--from-extraction", help="Apply settings from a benchmark extraction label.")
    cli.add_argument("--env-file", type=Path, help="Load OCR settings from a generated OCR .env snippet.")
    cli.add_argument("--no-report", action="store_true", help="Only write the text file.")
    cli.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.")
    options = cli.parse_args()

    summary = export_tts_sample(
        options.pdf,
        out_dir=options.out_dir,
        max_chars=options.max_chars,
        chunk_size=options.chunk_size,
        ocr_engine=options.ocr_engine,
        from_extraction=options.from_extraction,
        env_file=options.env_file,
        write_report=not options.no_report,
    )

    if options.json:
        print(json.dumps(summary, ensure_ascii=False, indent=2))
        return

    print(f"Wrote Arabic TTS sample: {summary['textPath']}")
    if options.no_report:
        return
    print(f"Wrote benchmark handoff: {summary['reportPath']}")

# Run the CLI only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main_cli()