Spaces:

Syncre
/

arabic-audio-reader-worker

Running

App Files Files Community

arabic-audio-reader-worker / scripts /export_tts_sample.py

Syncre

Deploy Arabic Audio Reader worker

2e1a095 verified 1 day ago

raw

history blame contribute delete

17.5 kB

	from __future__ import annotations

	import argparse
	import json
	import sys
	from pathlib import Path
	from typing import Any

	# ROOT_DIR is the parent of the directory that contains this file. It is put at
	# the front of sys.path (only if not already present), presumably so that the
	# `app` and `scripts` imports that follow resolve when this file is run directly.
	ROOT_DIR = Path(__file__).resolve().parent.parent
	if str(ROOT_DIR) not in sys.path:
	    sys.path.insert(0, str(ROOT_DIR))

	from app import main
	from scripts.dry_run_pdf import dry_run_pdf


	def safe_command_path(path: Path) -> str:
	    """Render *path* for pasting into a command line.

	    The path is converted with ``str()``. If the result contains at least one
	    space character it is wrapped in double quotes; otherwise it is returned
	    unchanged. No other characters trigger quoting.
	    """
	    rendered = str(path)
	    if " " not in rendered:
	        return rendered
	    return '"' + rendered + '"'


	def build_external_commands(text_path: Path, output_dir: Path) -> dict[str, str]:
	    """Build the copy-paste commands and notes for benchmarking the exported sample.

	    Args:
	        text_path: Location of the exported Arabic text sample; it is rendered
	            with ``safe_command_path`` and embedded in every entry that needs
	            the sample text.
	        output_dir: Directory under which the generated commands place their
	            audio output, reports and score files.

	    Returns:
	        A mapping from a camelCase label to either a Windows-style command line
	        or a plain-English instruction for models that have no ready command.
	    """
	    text_arg = safe_command_path(text_path)

	    def out_path(name: str) -> str:
	        # Join first, quote second. Quoting only the directory and appending the
	        # child afterwards produced `"C:\my dir"\child`, where the closing quote
	        # sits in the middle of the path; shells such as PowerShell do not
	        # reliably treat that as a single argument. The backslash separator is
	        # kept because the generated commands are Windows-style throughout.
	        joined = f"{str(output_dir)}\\{name}"
	        return f'"{joined}"' if " " in joined else joined

	    # Paths that appear in more than one entry.
	    mishkala_sample = out_path("arabic-tts-sample-mishkala.txt")
	    tashkeel350_sample = out_path("arabic-tts-sample-tashkeel350.txt")
	    preprocessor_score_json = out_path("tts-preprocessor-score.json")
	    voice_score_json = out_path("voice-listening-score.json")

	    return {
	        "localVoiceBenchmark": (
	            f"python scripts\\benchmark_voices.py --text-file {text_arg} "
	            f"--out-dir {out_path('local-voices')} --write-report {out_path('local-voices.md')}"
	        ),
	        "mossTtsNanoOnnx": (
	            "moss-tts-nano generate --backend onnx --language ar "
	            f"--text-file {text_arg} --prompt-speech C:\\path\\to\\arabic-reference.wav"
	        ),
	        "mossTtsNanoServer": "moss-tts-nano serve --backend onnx",
	        "supertonicLocal": (
	            f"python scripts\\benchmark_voices.py --voices supertonic-ar --text-file {text_arg} "
	            f"--out-dir {out_path('supertonic')} --write-report {out_path('supertonic.md')}"
	        ),
	        "mishkalaTashkeelExternal": (
	            "Diacritize the same cleaned sample with flokymind/mishkala and save it beside the plain sample, "
	            f"for example {mishkala_sample}. Then synthesize both files with the same voice."
	        ),
	        "mishkalaVoiceBenchmark": (
	            f"python scripts\\benchmark_voices.py --text-file {mishkala_sample} "
	            f"--out-dir {out_path('mishkala-local-voices')} "
	            f"--write-report {out_path('mishkala-local-voices.md')}"
	        ),
	        "tashkeel350External": (
	            "Diacritize the same cleaned sample with Etherll/Tashkeel-350M and save it beside the plain sample, "
	            f"for example {tashkeel350_sample}. Then synthesize plain, Mishkala, "
	            "and Tashkeel-350M samples with the same voice before choosing a preprocessor."
	        ),
	        "tashkeel350VoiceBenchmark": (
	            f"python scripts\\benchmark_voices.py --text-file {tashkeel350_sample} "
	            f"--out-dir {out_path('tashkeel350-local-voices')} "
	            f"--write-report {out_path('tashkeel350-local-voices.md')}"
	        ),
	        "preprocessorListeningScore": (
	            "python scripts\\score_tts_preprocessor.py "
	            "--rating plain=4,5,5,4,4 --rating mishkala=5,4,4,4,4 --rating tashkeel350=5,4,4,4,4 "
	            f"--write-report {out_path('tts-preprocessor-score.md')} "
	            f"--write-json {preprocessor_score_json}"
	        ),
	        "voiceListeningScore": (
	            "python scripts\\score_voice_listening.py "
	            "--rating silma-local=5,4,4,5,5 --rating espeak-ar-clear=3,2,4,3,5 "
	            f"--write-report {out_path('voice-listening-score.md')} "
	            f"--write-json {voice_score_json}"
	        ),
	        "voicePromotionGate": (
	            "python scripts\\model_promotion_gate.py "
	            "--candidate-name \"External voice winner\" --kind tts --license Apache-2.0 "
	            f"--score-json {voice_score_json} "
	            "--same-sample --runtime-ok --privacy-ok --human-reviewed "
	            f"--write-report {out_path('voice-promotion-gate.md')}"
	        ),
	        "preprocessorPromotionGate": (
	            "python scripts\\model_promotion_gate.py "
	            "--candidate-name \"TTS preprocessor winner\" --kind preprocessor --license Apache-2.0 "
	            f"--score-json {preprocessor_score_json} "
	            "--same-sample --runtime-ok --privacy-ok --human-reviewed "
	            f"--write-report {out_path('preprocessor-promotion-gate.md')}"
	        ),
	        "omniVoiceExternal": (
	            "python -m omnivoice.cli "
	            f"--model k2-fsa/OmniVoice --language ar --text-file {text_arg} "
	            "--ref-audio C:\\path\\to\\arabic-reference.wav --ref-text \"Arabic reference transcript\""
	        ),
	        "omniVoiceArabicLoraExternal": (
	            "Run OmniVoice with the Arabic LoRA adapter vivooglobal/omnivoice-lora-ar on the same text and "
	            "reference audio after the base OmniVoice benchmark works."
	        ),
	        "tadaExternal": (
	            "Benchmark HumeAI/tada-3b-ml externally with language=\"ar\" on the same cleaned sample "
	            f"{text_arg}. It is designed to reduce off-script speech, but it reports the Llama 3.2 "
	            "license and is a 3B-class strong-worker path, so keep it outside the permissive default."
	        ),
	        "lahgtnaChatterboxExternal": (
	            "python -m chatterbox.tts "
	            f"--model oddadmix/lahgtna-chatterbox-v1 --text-file {text_arg} "
	            "--audio-prompt-path C:\\path\\to\\arabic-reference.wav --repetition-penalty 1.25"
	        ),
	        "namaaSaudiTtsExternal": (
	            "Use ChatterboxMultilingualTTS with the NAMAA-Space/NAMAA-Saudi-TTS safetensors on "
	            f"{text_arg}, language_id=\"ar\", and the same optional reference audio. Benchmark only for "
	            "Saudi/Gulf dialect fit, then compare against SILMA, Habibi, Saudi Arabic Qwen3-TTS, and Emirati voices."
	        ),
	        "saudiChatterboxFineTuneExternal": (
	            "Use ChatterboxMultilingualTTS with FatimahEmadEldin/saudi-tts-chatterbox-finetuned T3 weights on "
	            f"{text_arg}, language_id=\"ar\", and the same optional reference audio. Benchmark only for "
	            "Saudi/Gulf dialect fit, then compare against NAMAA-Saudi-TTS, SILMA, Habibi, Saudi Arabic Qwen3-TTS, "
	            "and Emirati voices."
	        ),
	        "nileTtsExternal": (
	            "Benchmark KickItLikeShika/NileTTS-XTTS only for Egyptian/dialectal Arabic using this same cleaned "
	            f"sample {text_arg}. It is Apache-2.0, but not an MSA book default."
	        ),
	        "chatterboxMultilingualExternal": (
	            "Use ChatterboxMultilingualTTS.from_pretrained(...).generate(text, language_id=\"ar\", "
	            "audio_prompt_path=\"C:\\path\\to\\arabic-reference.wav\") against "
	            f"{text_arg}; compare pacing and pronunciation against SILMA and Habibi."
	        ),
	        "chatterboxMultilingualOnnxExternal": (
	            "Benchmark onnx-community/chatterbox-multilingual-ONNX with language_id=\"ar\" against "
	            f"{text_arg}; compare CPU/ONNX runtime, repetition, pacing, and pronunciation against SILMA, "
	            "Habibi, and the regular Chatterbox-Multilingual path."
	        ),
	        "ttsArabicOnnxExternal": (
	            "Benchmark nipponjo/tts-arabic-onnx with the same cleaned sample "
	            f"{text_arg}; try FastPitch, MixerTTS, speaker IDs, pace, and vowelizer options, then compare "
	            "CPU runtime and pronunciation against SILMA, Supertonic, MOSS-TTS-Nano, and Chatterbox ONNX. "
	            "Confirm model/repo licensing before production use."
	        ),
	        "sparkTtsArabicExternal": (
	            "Spark-TTS Arabic requires the Spark-TTS repo plus diacritized Arabic/reference audio; benchmark it "
	            f"externally with {text_arg} only after preparing that reference workflow."
	        ),
	        "sofeliaTtsExternal": (
	            "Sofelia-TTS is a Palestinian Arabic/MiraTTS voice-cloning path; benchmark it only for dialectal text "
	            f"using the same sample {text_arg}."
	        ),
	        "arabicF5TtsCaution": (
	            "Arabic-F5-TTS-v2 is non-commercial and requires fully diacritized Arabic; keep it to personal "
	            "experiments unless that license and input requirement are acceptable."
	        ),
	        "threeArabTtsExternal": (
	            "Benchmark sherif1313/3arab-TTS-500M-v1 and the VoiceDesign variant on this same cleaned Arabic "
	            f"sample: {text_arg}. It is Apache-2.0 and Arabic-only, but new enough that listenability and "
	            "long-form stability need manual checks before app wiring."
	        ),
	        "voxcpm2External": f"Use {text_arg} as the exact Arabic text sample when testing VoxCPM2 externally.",
	        "voxtralTtsCaution": (
	            "Voxtral TTS supports Arabic on its model card, but it is CC-BY-NC-4.0 and GPU-heavy; "
	            f"use {text_arg} only for personal/non-commercial strong-worker listening comparisons."
	        ),
	        "qwen3TtsCaution": (
	            "Do not promote Qwen3-TTS for this Arabic reader until an official Arabic-capable checkpoint "
	            "or Arabic fine-tune is verified on this same sample."
	        ),
	    }


	def write_markdown_report(path: Path, result: dict[str, Any]) -> None:
	    """Write the Markdown handoff report for an exported TTS sample.

	    Args:
	        path: Destination file. Missing parent directories are created and the
	            file is written as UTF-8, replacing any existing content.
	        result: Export summary. ``commands``, ``textPath``, ``characters``,
	            ``arabicWords`` and ``quality`` are required; ``pdf`` and
	            ``extraction`` fall back to ``-`` when absent.

	    Raises:
	        KeyError: If a required key, or a command referenced by the report,
	            is missing.
	    """
	    commands = result["commands"]
	    shell, note = "powershell", "text"

	    # (heading, code-fence language, key into ``commands``) in report order.
	    sections = (
	        ("Local installed voices:", shell, "localVoiceBenchmark"),
	        ("MOSS-TTS-Nano ONNX external benchmark:", shell, "mossTtsNanoOnnx"),
	        ("MOSS-TTS-Nano local server:", shell, "mossTtsNanoServer"),
	        ("Supertonic 3 local CPU benchmark:", shell, "supertonicLocal"),
	        ("Mishkala Tashkeel pronunciation preprocessor:", note, "mishkalaTashkeelExternal"),
	        ("Mishkala local voice benchmark:", shell, "mishkalaVoiceBenchmark"),
	        ("Tashkeel-350M pronunciation preprocessor:", note, "tashkeel350External"),
	        ("Tashkeel-350M local voice benchmark:", shell, "tashkeel350VoiceBenchmark"),
	        ("Plain vs Mishkala vs Tashkeel-350M listening score:", shell, "preprocessorListeningScore"),
	        ("Preprocessor promotion gate:", shell, "preprocessorPromotionGate"),
	        ("Voice listening score:", shell, "voiceListeningScore"),
	        ("Voice promotion gate:", shell, "voicePromotionGate"),
	        ("OmniVoice external benchmark:", shell, "omniVoiceExternal"),
	        ("OmniVoice Arabic LoRA external benchmark:", note, "omniVoiceArabicLoraExternal"),
	        ("TADA multilingual external benchmark:", note, "tadaExternal"),
	        ("Lahgtna Chatterbox external benchmark:", shell, "lahgtnaChatterboxExternal"),
	        ("NAMAA-Saudi-TTS external benchmark:", note, "namaaSaudiTtsExternal"),
	        ("Saudi Chatterbox fine-tune external benchmark:", note, "saudiChatterboxFineTuneExternal"),
	        ("NileTTS-XTTS Egyptian Arabic benchmark:", note, "nileTtsExternal"),
	        ("Chatterbox-Multilingual external benchmark:", note, "chatterboxMultilingualExternal"),
	        ("Chatterbox-Multilingual ONNX external benchmark:", note, "chatterboxMultilingualOnnxExternal"),
	        ("tts-arabic-onnx external benchmark:", note, "ttsArabicOnnxExternal"),
	        ("Spark-TTS Arabic external benchmark:", note, "sparkTtsArabicExternal"),
	        ("Sofelia-TTS external benchmark:", note, "sofeliaTtsExternal"),
	        ("Arabic-F5-TTS-v2 caution:", note, "arabicF5TtsCaution"),
	        ("3arab-TTS 500M external benchmark:", note, "threeArabTtsExternal"),
	        ("VoxCPM2 external benchmark:", note, "voxcpm2External"),
	        ("Voxtral TTS caution:", note, "voxtralTtsCaution"),
	        ("Qwen3-TTS caution:", note, "qwen3TtsCaution"),
	    )

	    report = [
	        "# External Arabic TTS Sample",
	        "",
	        f"PDF: `{result.get('pdf', '-')}`",
	        f"Text file: `{result['textPath']}`",
	        f"Characters: {result['characters']}",
	        f"Arabic words: {result['arabicWords']}",
	        f"OCR extraction: `{result.get('extraction', '-')}`",
	        f"Quality: `{result['quality']}`",
	        "",
	        "Use this same cleaned Arabic text for every voice/model comparison. Do not compare voices with different OCR text.",
	        "",
	        "## Commands",
	        "",
	    ]

	    # Every section is: heading, blank line, fenced command, blank line.
	    for heading, fence, key in sections:
	        report += [heading, "", f"```{fence}\n{commands[key]}\n```", ""]

	    report += [
	        "## Listening Checklist",
	        "",
	        "- Arabic pronunciation is clear and not robotic.",
	        "- Pauses are comfortable for long book passages.",
	        "- Numbers, Quranic symbols, and punctuation are not read strangely.",
	        "- Runtime is acceptable before processing a full book.",
	        "- Replace placeholder candidate names and licenses in promotion-gate commands before changing the production default.",
	    ]

	    # All content is assembled before the filesystem is touched, so a missing
	    # key never leaves a directory or partial file behind.
	    path.parent.mkdir(parents=True, exist_ok=True)
	    path.write_text("\n".join(report) + "\n", encoding="utf-8")


	def export_tts_sample(
	    pdf_path: Path,
	    out_dir: Path = ROOT_DIR / "outputs" / "external-tts-sample",
	    max_chars: int = 1200,
	    chunk_size: int = main.CLOUD_TTS_MAX_CHARS,
	    ocr_engine: str | None = None,
	    from_extraction: str | None = None,
	    env_file: Path | None = None,
	    write_report: bool = True,
	) -> dict[str, Any]:
	    """Export a cleaned Arabic speech sample from a PDF for external TTS benchmarking.

	    Runs ``dry_run_pdf`` on the PDF, writes the returned speech sample to
	    ``arabic-tts-sample.txt`` inside ``out_dir`` and, unless disabled, writes a
	    Markdown report with the benchmark commands next to it.

	    Args:
	        pdf_path: PDF passed to ``dry_run_pdf``.
	        out_dir: Directory for the sample and report; created if missing.
	        max_chars: Passed to ``dry_run_pdf`` as ``speech_sample_chars``.
	        chunk_size: Passed through to ``dry_run_pdf``.
	        ocr_engine: Passed through to ``dry_run_pdf``.
	        from_extraction: Passed through to ``dry_run_pdf``.
	        env_file: Passed through to ``dry_run_pdf``.
	        write_report: When true, also write ``external-tts-sample.md``.

	    Returns:
	        A summary dict with the written paths, sample statistics, quality
	        fields copied from the dry run, and the generated ``commands``.
	        ``reportPath`` is present even when ``write_report`` is false.

	    Raises:
	        RuntimeError: If the dry run reports the text is not ready for TTS.
	    """
	    dry_run = dry_run_pdf(
	        pdf_path,
	        chunk_size=chunk_size,
	        ocr_engine=ocr_engine,
	        from_extraction=from_extraction,
	        env_file=env_file,
	        include_speech_text=True,
	        speech_sample_chars=max_chars,
	    )
	    if not dry_run["readyForTts"]:
	        reasons = "; ".join(str(reason) for reason in dry_run.get("qualityReasons", []))
	        # strip() drops the trailing space left behind when no reasons were given.
	        raise RuntimeError(f"OCR text is not ready for TTS. {reasons}".strip())

	    sample_text = str(dry_run["speechSampleText"]).strip()
	    out_dir.mkdir(parents=True, exist_ok=True)
	    text_path = out_dir / "arabic-tts-sample.txt"
	    # One location for the report, shared by the summary and the writer.
	    report_path = out_dir / "external-tts-sample.md"
	    text_path.write_text(sample_text + "\n", encoding="utf-8")
	    commands = build_external_commands(text_path, out_dir)
	    result: dict[str, Any] = {
	        "ready": True,
	        "pdf": str(pdf_path),
	        "textPath": str(text_path),
	        "reportPath": str(report_path),
	        "characters": len(sample_text),
	        "fullSpeechCharacters": dry_run["speechCharacters"],
	        "arabicWords": dry_run["arabicWords"],
	        "quality": dry_run["quality"],
	        "qualityScore": dry_run["qualityScore"],
	        "qualityReasons": dry_run["qualityReasons"],
	        "ocrEngine": dry_run["ocrEngine"],
	        "extraction": dry_run["extraction"],
	        "commands": commands,
	    }
	    if write_report:
	        write_markdown_report(report_path, result)
	    return result


	def main_cli() -> None:
	    """Command-line entry point: parse arguments, export the sample, print the outcome.

	    Prints the full result as JSON when ``--json`` is given; otherwise prints
	    the sample path and, unless ``--no-report`` is given, the report path.
	    """
	    # Switch stdout to UTF-8 where the stream supports it; characters that
	    # still cannot be encoded are replaced instead of raising.
	    if hasattr(sys.stdout, "reconfigure"):
	        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

	    cli = argparse.ArgumentParser(
	        description="Export the same cleaned Arabic text sample for external TTS benchmarking."
	    )
	    cli.add_argument("pdf", type=Path, help="Arabic PDF to extract a cleaned speech sample from.")
	    cli.add_argument("--out-dir", type=Path, default=ROOT_DIR / "outputs" / "external-tts-sample")
	    cli.add_argument("--max-chars", type=int, default=1200, help="Maximum cleaned characters to export.")
	    cli.add_argument("--chunk-size", type=int, default=main.CLOUD_TTS_MAX_CHARS)
	    cli.add_argument("--ocr-engine", choices=sorted(main.OCR_ENGINE_CHOICES), help="OCR engine to test.")
	    cli.add_argument("--from-extraction", help="Apply settings from a benchmark extraction label.")
	    cli.add_argument("--env-file", type=Path, help="Load OCR settings from a generated OCR .env snippet.")
	    cli.add_argument("--no-report", action="store_true", help="Only write the text file.")
	    cli.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.")
	    options = cli.parse_args()

	    summary = export_tts_sample(
	        options.pdf,
	        out_dir=options.out_dir,
	        max_chars=options.max_chars,
	        chunk_size=options.chunk_size,
	        ocr_engine=options.ocr_engine,
	        from_extraction=options.from_extraction,
	        env_file=options.env_file,
	        write_report=not options.no_report,
	    )

	    if options.json:
	        print(json.dumps(summary, ensure_ascii=False, indent=2))
	        return

	    print(f"Wrote Arabic TTS sample: {summary['textPath']}")
	    if options.no_report:
	        return
	    print(f"Wrote benchmark handoff: {summary['reportPath']}")


	# Run the command-line interface only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    main_cli()