# Spaces:
# Syncre / arabic-audio-reader-worker
# Running
# File size: 8,806 Bytes
# 2e1a095


from __future__ import annotations

import argparse
import json
import sys
import tempfile
from pathlib import Path
from typing import Any

# ROOT_DIR is the directory two levels above this file. It is placed at the
# front of sys.path (once) before the `app` and `scripts` imports that follow;
# presumably so those imports resolve when this file is run directly.
ROOT_DIR = Path(__file__).resolve().parent.parent
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from app import main
from scripts.verify_voice import DEFAULT_SAMPLE_TEXT, synthesize_voice


# Voice ids benchmarked when the caller (or the --voices option) names none.
DEFAULT_VOICES = ["silma-local", "habibi-msa", "supertonic-ar", "espeak-ar-clear"]
# Preference order scanned by choose_recommended_voice(): the first id in this
# list that synthesized successfully becomes the recommended voice.
RECOMMENDED_VOICE_ORDER = ["silma-local", "habibi-msa", "supertonic-ar", "espeak-ar-clear", "espeak-ar", "espeak-ar-male"]


def available_voice_ids() -> list[str]:
    """Return a fresh list of whatever iterating ``main.LOCAL_VOICES`` yields.

    The result is used as the ``choices`` of the ``--voices`` CLI option.
    NOTE(review): presumably ``LOCAL_VOICES`` is a mapping keyed by voice id,
    so iteration yields the ids -- confirm against ``app.main``.
    """
    return [*main.LOCAL_VOICES]


def benchmark_voice(
    voice_id: str,
    text: str,
    output_dir: Path,
    audio_format: str = "wav",
) -> dict[str, Any]:
    """Synthesize *text* with one voice and describe the outcome as a dict.

    The audio is written to ``output_dir / f"{voice_id}.{audio_format}"``;
    *output_dir* is created first if it does not exist.

    Args:
        voice_id: Id of the voice to synthesize with.
        text: Text handed to ``synthesize_voice``.
        output_dir: Directory that receives the generated audio file.
        audio_format: File extension / format passed to ``synthesize_voice``.

    Returns:
        On success, ``{"ok": True, **result}`` where *result* is whatever
        ``synthesize_voice`` returned. On failure, a dict with ``ok=False``,
        ``voiceId``, ``label``, ``engine`` and ``error`` (the exception text).

    Raises:
        OSError: If *output_dir* cannot be created; this happens before the
            guarded synthesis step and is not converted into a result entry.
    """
    output_dir.mkdir(parents=True, exist_ok=True)
    destination = output_dir / f"{voice_id}.{audio_format}"
    try:
        result = synthesize_voice(text, voice_id, destination, audio_format)
        return {"ok": True, **result}
    except Exception as exc:
        # Deliberate catch-all boundary: one broken voice must be reported as
        # a failed entry instead of aborting the whole benchmark run.
        try:
            # Look the voice up once. The lookup itself is guarded because its
            # behaviour for unknown ids is not visible here, and a failure
            # report must never raise on its own.
            voice = main.get_local_voice(voice_id)
            label = voice.get("label", voice_id)
            engine = voice.get("engine", "")
        except Exception:
            label, engine = voice_id, ""
        return {
            "ok": False,
            "voiceId": voice_id,
            "label": label,
            "engine": engine,
            "error": str(exc),
        }


def benchmark_voices(
    voices: list[str] | None = None,
    text: str = DEFAULT_SAMPLE_TEXT,
    output_dir: Path | None = None,
    audio_format: str = "wav",
) -> dict[str, Any]:
    """Benchmark several voices on the same text and summarize the run.

    Args:
        voices: Voice ids to test; ``DEFAULT_VOICES`` is used when this is
            ``None`` or empty.
        text: Text synthesized by every voice.
        output_dir: Destination for the audio files; defaults to
            ``ROOT_DIR / "outputs" / "voice-benchmark"``.
        audio_format: Format passed through to ``benchmark_voice``.

    Returns:
        A dict with the tested voice ids, the character count of the text
        after ``main.prepare_text_for_speech``, the output directory as a
        string, the audio format, the per-voice results, the fastest and the
        recommended successful result (``None`` when nothing succeeded), a
        ``ready`` flag and the list of next-step hints.
    """
    voice_ids = voices or DEFAULT_VOICES
    target_dir = output_dir or (ROOT_DIR / "outputs" / "voice-benchmark")

    results = [benchmark_voice(voice_id, text, target_dir, audio_format) for voice_id in voice_ids]
    succeeded = [entry for entry in results if entry.get("ok")]

    # Entries without a timing sort last because they count as infinitely slow.
    fastest = None
    if succeeded:
        fastest = min(succeeded, key=lambda entry: entry.get("elapsedSeconds", float("inf")))
    recommended = choose_recommended_voice(succeeded)

    report: dict[str, Any] = {
        "voices": voice_ids,
        "textCharacters": len(main.prepare_text_for_speech(text)),
        "outputDir": str(target_dir),
        "audioFormat": audio_format,
        "results": results,
        "fastest": fastest,
        "recommended": recommended,
        "ready": bool(succeeded),
        "nextSteps": build_next_steps(results),
    }
    return report


def choose_recommended_voice(successful: list[dict[str, Any]]) -> dict[str, Any] | None:
    if not successful:
        return None
    by_voice_id = {str(item.get("voiceId")): item for item in successful}
    for voice_id in RECOMMENDED_VOICE_ORDER:
        if voice_id in by_voice_id:
            return by_voice_id[voice_id]
    return min(successful, key=lambda item: item.get("elapsedSeconds", float("inf")))


def build_next_steps(results: list[dict[str, Any]]) -> list[str]:
    """Turn benchmark results into human-readable follow-up hints.

    When at least one entry succeeded, the hints name the recommended voice,
    remind the user to listen to the files and report the fastest voice.
    Otherwise a single hint points at the preflight script. Install hints for
    Habibi, SILMA and Supertonic are appended (in that order) for each of
    those voices that appears among the failed entries.
    """
    succeeded: list[dict[str, Any]] = []
    failed_ids: list[Any] = []
    for entry in results:
        if entry.get("ok"):
            succeeded.append(entry)
        else:
            failed_ids.append(entry.get("voiceId"))

    if not succeeded:
        steps = [
            "No tested voice produced audio. Run scripts/preflight_check.py and install SILMA, Habibi, Supertonic, or eSpeak NG."
        ]
    else:
        steps = []
        recommended = choose_recommended_voice(succeeded)
        if recommended:
            steps.append(
                f"Recommended starting voice: {recommended.get('voiceId')} ({recommended.get('label')}). "
                "Listen before processing a full book."
            )
        steps.append("Listen to the generated files and choose the most natural Arabic voice before processing a full book.")
        # A missing timing counts as infinitely slow.
        fastest = min(succeeded, key=lambda entry: entry.get("elapsedSeconds", float("inf")))
        steps.append(f"Fastest successful voice in this run: {fastest.get('voiceId')} ({fastest.get('elapsedSeconds')}s).")

    # (voice id, hint) pairs; tuple order fixes the order of the emitted hints.
    install_hints = (
        (
            "habibi-msa",
            "Habibi MSA is optional. Install it with scripts/setup_habibi.ps1 if you want to compare the newer MSA voice.",
        ),
        (
            "silma-local",
            "SILMA is the preferred default voice. Install it with scripts/setup_silma.ps1 for the best current local baseline.",
        ),
        (
            "supertonic-ar",
            "Supertonic 3 is optional. Install it with scripts/setup_supertonic.ps1 when you want a fast CPU Arabic-capable comparison voice.",
        ),
    )
    steps.extend(hint for voice_id, hint in install_hints if voice_id in failed_ids)
    return steps


def _markdown_cell(value: object) -> str:
    """Render *value* as text that is safe inside a single Markdown table cell."""
    # A raw line break would end the table row, and an unescaped "|" would
    # start a new column, so flatten the text onto one line and escape pipes.
    flattened = " ".join(part.strip() for part in str(value).splitlines() if part.strip())
    return flattened.replace("|", "\\|")


def write_markdown_report(path: Path, result: dict[str, Any]) -> None:
    """Write the benchmark summary in *result* to *path* as a Markdown file.

    The report contains a short header, one table row per entry in
    ``result["results"]``, the ``nextSteps`` hints as a bullet list and a
    PowerShell snippet with the listening-score command. Parent directories of
    *path* are created when missing and the file is written as UTF-8.

    Args:
        path: Destination of the Markdown report.
        result: Summary dict as produced by ``benchmark_voices``; missing keys
            fall back to placeholder values.
    """
    path.parent.mkdir(parents=True, exist_ok=True)
    rating_command = build_rating_command(result)
    lines = [
        "# Arabic Voice Benchmark",
        "",
        f"Output directory: `{result.get('outputDir', '')}`",
        f"Text characters: {result.get('textCharacters', 0)}",
        f"Audio format: {result.get('audioFormat', 'wav')}",
    ]
    recommended = result.get("recommended")
    fastest = result.get("fastest")
    if recommended:
        lines.append(f"Recommended starting voice: `{recommended.get('voiceId')}` ({recommended.get('label', '-')})")
    if fastest:
        lines.append(f"Fastest successful voice: `{fastest.get('voiceId')}` ({fastest.get('elapsedSeconds')}s)")
    lines.extend(
        [
            "",
            "## Results",
            "",
            "| Voice | Label | Engine | Status | Time | Audio | Notes |",
            "| --- | --- | --- | --- | ---: | --- | --- |",
        ]
    )
    for item in result.get("results", []):
        cells = [
            item.get("voiceId", "-"),
            item.get("label", "-"),
            item.get("engine", "-"),
            "ok" if item.get("ok") else "failed",
            item.get("elapsedSeconds", "-"),
            item.get("path", "-"),
            # Error text is free-form exception output and may contain pipes
            # or several lines, which is why every cell goes through the
            # escaping helper.
            item.get("error", ""),
        ]
        lines.append("| " + " | ".join(_markdown_cell(cell) for cell in cells) + " |")
    lines.extend(["", "## Next Steps", ""])
    lines.extend(f"- {step}" for step in result.get("nextSteps", []))
    lines.extend(
        [
            "",
            "## Listening Score",
            "",
            "After listening, score each successful voice from 1 to 5 for pronunciation, naturalness, pacing, long-listen comfort, and artifact-free audio.",
            "",
            "```powershell",
            rating_command,
            "```",
        ]
    )
    path.write_text("\n".join(lines) + "\n", encoding="utf-8")


def build_rating_command(result: dict[str, Any]) -> str:
    """Build the example ``score_voice_listening.py`` command line.

    One ``--rating <voiceId>=5,5,5,5,5`` argument is emitted per successful
    entry in ``result["results"]``. When there is none, a single placeholder
    rating for ``silma-local`` is used instead.
    """
    rating_args = [
        f"--rating {entry.get('voiceId')}=5,5,5,5,5"
        for entry in result.get("results", [])
        if entry.get("ok")
    ]
    if not rating_args:
        rating_args = ["--rating silma-local=5,5,5,5,5"]
    command_parts = [
        "python scripts\\score_voice_listening.py",
        *rating_args,
        "--write-report outputs\\voice-listening-score.md",
    ]
    return " ".join(command_parts)


def print_table(result: dict[str, Any]) -> None:
    """Print the benchmark results as a fixed-width table plus next steps.

    Each entry of ``result["results"]`` becomes one row; entries without a
    ``path`` show their ``error`` text in the last column. After a blank line
    every hint in ``result["nextSteps"]`` is printed as a ``- `` bullet.
    """
    print("voice             ok    sec    engine      file")
    print("----------------  ----  -----  ----------  ----")
    for entry in result["results"]:
        # Column widths mirror the header above; columns are joined by two spaces.
        columns = (
            format(entry.get("voiceId", "-"), "<16"),
            format(str(entry.get("ok")), "<4"),
            format(entry.get("elapsedSeconds", "-"), ">5"),
            format(entry.get("engine", "-"), "<10"),
            format(entry.get("path", entry.get("error", "-")), ""),
        )
        print("  ".join(columns))
    print()
    for step in result["nextSteps"]:
        print(f"- {step}")


def _build_cli_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the voice benchmark command line."""
    parser = argparse.ArgumentParser(description="Compare free local Arabic TTS voices on the same text.")
    parser.add_argument("--voices", nargs="+", default=DEFAULT_VOICES, choices=available_voice_ids())
    parser.add_argument("--text", default=DEFAULT_SAMPLE_TEXT, help="Arabic text to synthesize for every voice.")
    parser.add_argument("--text-file", type=Path, help="Read benchmark text from a UTF-8 file.")
    parser.add_argument("--out-dir", type=Path, default=ROOT_DIR / "outputs" / "voice-benchmark")
    parser.add_argument("--format", choices=["wav", "mp3"], default="wav")
    parser.add_argument("--write-report", type=Path, help="Write a Markdown benchmark report.")
    parser.add_argument("--json", action="store_true", help="Print JSON instead of a compact table.")
    return parser


def main_cli() -> None:
    """Run the voice benchmark from the command line.

    Parses the options, benchmarks the selected voices, optionally writes the
    Markdown report and prints the result either as JSON (``--json``) or as a
    compact table. When ``--text-file`` is given its contents take precedence
    over ``--text``.
    """
    # Switch stdout to UTF-8 where the stream supports it; presumably so that
    # Arabic output does not fail on consoles with a narrower encoding.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

    options = _build_cli_parser().parse_args()

    if options.text_file:
        text = options.text_file.read_text(encoding="utf-8")
    else:
        text = options.text

    result = benchmark_voices(options.voices, text, options.out_dir, options.format)

    report_path = options.write_report
    if report_path:
        write_markdown_report(report_path, result)

    if not options.json:
        print_table(result)
        return
    print(json.dumps(result, ensure_ascii=False, indent=2))


# Run the CLI only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main_cli()