"""Compare local Arabic TTS voices on the same text and summarize the results.

The module synthesizes one sample text with every selected voice, records a
per-voice result dictionary, picks a recommended and a fastest voice, and can
render the outcome as a console table, JSON, or a Markdown report.
"""

from __future__ import annotations

import argparse
import json
import sys
import tempfile
from pathlib import Path
from typing import Any

# Directory two levels above this file; put on ``sys.path`` so that the
# ``app`` and ``scripts`` packages resolve when the file is run as a script.
ROOT_DIR = Path(__file__).resolve().parent.parent
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from app import main
from scripts.verify_voice import DEFAULT_SAMPLE_TEXT, synthesize_voice

# Voices benchmarked when the caller does not select any.
DEFAULT_VOICES = ["silma-local", "habibi-msa", "supertonic-ar", "espeak-ar-clear"]

# Preference order used to pick the recommended voice among successful ones.
RECOMMENDED_VOICE_ORDER = [
    "silma-local",
    "habibi-msa",
    "supertonic-ar",
    "espeak-ar-clear",
    "espeak-ar",
    "espeak-ar-male",
]


def available_voice_ids() -> list[str]:
    """Return the voice ids registered in ``main.LOCAL_VOICES``."""
    return list(main.LOCAL_VOICES)


def _voice_metadata(voice_id: str) -> tuple[Any, Any]:
    """Return ``(label, engine)`` for *voice_id*, falling back to defaults.

    The lookup is guarded because this helper runs while a synthesis error is
    already being reported; a failing lookup must not replace that error.
    """
    try:
        voice = main.get_local_voice(voice_id)
        return voice.get("label", voice_id), voice.get("engine", "")
    except Exception:
        # NOTE(review): get_local_voice may raise for unknown ids — confirm.
        return voice_id, ""


def _fastest_voice(successful: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Return the entry with the smallest ``elapsedSeconds``, or ``None`` if empty.

    Entries without ``elapsedSeconds`` are treated as infinitely slow.
    """
    if not successful:
        return None
    return min(successful, key=lambda item: item.get("elapsedSeconds", float("inf")))


def benchmark_voice(
    voice_id: str,
    text: str,
    output_dir: Path,
    audio_format: str = "wav",
) -> dict[str, Any]:
    """Synthesize *text* with one voice and return a result record.

    Args:
        voice_id: Identifier of the voice to synthesize with.
        text: Text passed to ``synthesize_voice``.
        output_dir: Directory for the audio file; created if missing.
        audio_format: File extension / format passed to ``synthesize_voice``.

    Returns:
        ``{"ok": True, **result}`` on success, where ``result`` is whatever
        ``synthesize_voice`` returned (presumably a mapping with keys such as
        ``voiceId``, ``label``, ``engine``, ``elapsedSeconds`` and ``path`` —
        confirm against ``scripts.verify_voice``). On any exception a record
        with ``ok=False``, ``voiceId``, ``label``, ``engine`` and ``error``.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    destination = output_dir / f"{voice_id}.{audio_format}"
    try:
        result = synthesize_voice(text, voice_id, destination, audio_format)
        return {"ok": True, **result}
    except Exception as exc:
        # Benchmark boundary: one broken voice must not abort the whole run.
        label, engine = _voice_metadata(voice_id)
        return {
            "ok": False,
            "voiceId": voice_id,
            "label": label,
            "engine": engine,
            "error": str(exc),
        }


def benchmark_voices(
    voices: list[str] | None = None,
    text: str = DEFAULT_SAMPLE_TEXT,
    output_dir: Path | None = None,
    audio_format: str = "wav",
) -> dict[str, Any]:
    """Benchmark several voices on the same text and summarize the run.

    Args:
        voices: Voice ids to test; ``None`` or empty selects ``DEFAULT_VOICES``.
        text: Text synthesized with every voice.
        output_dir: Target directory; defaults to
            ``ROOT_DIR / "outputs" / "voice-benchmark"``.
        audio_format: Format passed through to ``benchmark_voice``.

    Returns:
        A dictionary with the keys ``voices``, ``textCharacters``,
        ``outputDir``, ``audioFormat``, ``results``, ``fastest``,
        ``recommended``, ``ready`` and ``nextSteps``.
    """
    selected_voices = voices or DEFAULT_VOICES
    output = output_dir or (ROOT_DIR / "outputs" / "voice-benchmark")
    results = [benchmark_voice(voice_id, text, output, audio_format) for voice_id in selected_voices]
    successful = [item for item in results if item.get("ok")]
    fastest = _fastest_voice(successful)
    recommended = choose_recommended_voice(successful)
    return {
        "voices": selected_voices,
        "textCharacters": len(main.prepare_text_for_speech(text)),
        "outputDir": str(output),
        "audioFormat": audio_format,
        "results": results,
        "fastest": fastest,
        "recommended": recommended,
        "ready": bool(successful),
        "nextSteps": build_next_steps(results),
    }


def choose_recommended_voice(successful: list[dict[str, Any]]) -> dict[str, Any] | None:
    """Pick the preferred voice among successful results.

    The first id in ``RECOMMENDED_VOICE_ORDER`` that succeeded wins; if none
    of them is present the fastest successful voice is returned. Returns
    ``None`` when *successful* is empty.
    """
    if not successful:
        return None
    by_voice_id = {str(item.get("voiceId")): item for item in successful}
    for voice_id in RECOMMENDED_VOICE_ORDER:
        if voice_id in by_voice_id:
            return by_voice_id[voice_id]
    return _fastest_voice(successful)


def build_next_steps(results: list[dict[str, Any]]) -> list[str]:
    """Build human-readable follow-up hints from the benchmark results.

    Args:
        results: Per-voice records as produced by ``benchmark_voice``.

    Returns:
        Hints in a fixed order: recommendation / listening / fastest lines
        (or a "nothing worked" line), followed by install hints for each of
        Habibi, SILMA and Supertonic that failed.
    """
    steps: list[str] = []
    successful = [item for item in results if item.get("ok")]
    failed = [item for item in results if not item.get("ok")]

    if successful:
        recommended = choose_recommended_voice(successful)
        if recommended:
            steps.append(
                f"Recommended starting voice: {recommended.get('voiceId')} ({recommended.get('label')}). "
                "Listen before processing a full book."
            )
        steps.append("Listen to the generated files and choose the most natural Arabic voice before processing a full book.")
        fastest = _fastest_voice(successful)
        steps.append(f"Fastest successful voice in this run: {fastest.get('voiceId')} ({fastest.get('elapsedSeconds')}s).")
    else:
        steps.append("No tested voice produced audio. Run scripts/preflight_check.py and install SILMA, Habibi, Supertonic, or eSpeak NG.")

    # (voice id, hint) pairs; the order here is the order of the emitted hints.
    install_hints = (
        (
            "habibi-msa",
            "Habibi MSA is optional. Install it with scripts/setup_habibi.ps1 if you want to compare the newer MSA voice.",
        ),
        (
            "silma-local",
            "SILMA is the preferred default voice. Install it with scripts/setup_silma.ps1 for the best current local baseline.",
        ),
        (
            "supertonic-ar",
            "Supertonic 3 is optional. Install it with scripts/setup_supertonic.ps1 when you want a fast CPU Arabic-capable comparison voice.",
        ),
    )
    for voice_id, hint in install_hints:
        if any(item.get("voiceId") == voice_id for item in failed):
            steps.append(hint)
    return steps


def _markdown_cell(value: Any) -> str:
    """Render *value* so it cannot break a Markdown table row.

    Line breaks are flattened to spaces and ``|`` is backslash-escaped;
    values without those characters are rendered exactly as ``str(value)``.
    """
    flattened = " ".join(str(value).splitlines())
    return flattened.replace("|", "\\|")


def write_markdown_report(path: Path, result: dict[str, Any]) -> None:
    """Write the benchmark *result* as a Markdown report to *path*.

    Args:
        path: Destination file; parent directories are created if missing.
        result: Summary dictionary as returned by ``benchmark_voices``.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    rating_command = build_rating_command(result)
    lines = [
        "# Arabic Voice Benchmark",
        "",
        f"Output directory: `{result.get('outputDir', '')}`",
        f"Text characters: {result.get('textCharacters', 0)}",
        f"Audio format: {result.get('audioFormat', 'wav')}",
    ]
    recommended = result.get("recommended")
    fastest = result.get("fastest")
    if recommended:
        lines.append(f"Recommended starting voice: `{recommended.get('voiceId')}` ({recommended.get('label', '-')})")
    if fastest:
        lines.append(f"Fastest successful voice: `{fastest.get('voiceId')}` ({fastest.get('elapsedSeconds')}s)")
    lines.extend(
        [
            "",
            "## Results",
            "",
            "| Voice | Label | Engine | Status | Time | Audio | Notes |",
            "| --- | --- | --- | --- | ---: | --- | --- |",
        ]
    )
    for item in result.get("results", []):
        status = "ok" if item.get("ok") else "failed"
        # Error messages are free text and may contain "|" or line breaks,
        # so every cell goes through _markdown_cell before being joined.
        cells = [
            item.get("voiceId", "-"),
            item.get("label", "-"),
            item.get("engine", "-"),
            status,
            item.get("elapsedSeconds", "-"),
            item.get("path", "-"),
            item.get("error", ""),
        ]
        lines.append("| " + " | ".join(_markdown_cell(cell) for cell in cells) + " |")
    lines.extend(["", "## Next Steps", ""])
    for step in result.get("nextSteps", []):
        lines.append(f"- {step}")
    lines.extend(
        [
            "",
            "## Listening Score",
            "",
            "After listening, score each successful voice from 1 to 5 for pronunciation, naturalness, pacing, long-listen comfort, and artifact-free audio.",
            "",
            "```powershell",
            rating_command,
            "```",
        ]
    )
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")


def build_rating_command(result: dict[str, Any]) -> str:
    """Return a sample ``score_voice_listening.py`` command line.

    One ``--rating <voiceId>=5,5,5,5,5`` argument is emitted per successful
    voice; if no voice succeeded a single ``silma-local`` placeholder is used.
    """
    ratings = [
        f"--rating {item.get('voiceId')}=5,5,5,5,5"
        for item in result.get("results", [])
        if item.get("ok")
    ]
    if not ratings:
        ratings.append("--rating silma-local=5,5,5,5,5")
    return (
        "python scripts\\score_voice_listening.py "
        + " ".join(ratings)
        + " --write-report outputs\\voice-listening-score.md"
    )


def print_table(result: dict[str, Any]) -> None:
    """Print a compact per-voice table followed by the next-step hints.

    Args:
        result: Summary dictionary with ``results`` and ``nextSteps`` keys.
    """
    # Header uses the same widths as the rows so labels sit above their columns.
    print(f"{'voice':<16} {'ok':<4} {'sec':>5} {'engine':<10} file")
    print("---------------- ---- ----- ---------- ----")
    for item in result["results"]:
        # ``!s`` converts first, so a None value is padded instead of raising
        # TypeError on the width spec.
        print(
            f"{item.get('voiceId', '-')!s:<16} "
            f"{str(item.get('ok')):<4} "
            f"{item.get('elapsedSeconds', '-')!s:>5} "
            f"{item.get('engine', '-')!s:<10} "
            f"{item.get('path', item.get('error', '-'))}"
        )
    print()
    for step in result["nextSteps"]:
        print(f"- {step}")


def main_cli() -> None:
    """Parse command-line arguments, run the benchmark and print the outcome.

    ``--text-file`` takes precedence over ``--text`` when both are given.
    """
    # Force UTF-8 output where the stream supports it so Arabic text prints.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")
    parser = argparse.ArgumentParser(description="Compare free local Arabic TTS voices on the same text.")
    parser.add_argument("--voices", nargs="+", default=DEFAULT_VOICES, choices=available_voice_ids())
    parser.add_argument("--text", default=DEFAULT_SAMPLE_TEXT, help="Arabic text to synthesize for every voice.")
    parser.add_argument("--text-file", type=Path, help="Read benchmark text from a UTF-8 file.")
    parser.add_argument("--out-dir", type=Path, default=ROOT_DIR / "outputs" / "voice-benchmark")
    parser.add_argument("--format", choices=["wav", "mp3"], default="wav")
    parser.add_argument("--write-report", type=Path, help="Write a Markdown benchmark report.")
    parser.add_argument("--json", action="store_true", help="Print JSON instead of a compact table.")
    args = parser.parse_args()

    text = args.text_file.read_text(encoding="utf-8") if args.text_file else args.text
    result = benchmark_voices(args.voices, text, args.out_dir, args.format)
    if args.write_report:
        write_markdown_report(args.write_report, result)
    if args.json:
        print(json.dumps(result, ensure_ascii=False, indent=2))
    else:
        print_table(result)


if __name__ == "__main__":
    main_cli()