Spaces:

Syncre
/

arabic-audio-reader-worker

Running

File size: 17,512 Bytes

2e1a095

from __future__ import annotations

import argparse
import json
import sys
from pathlib import Path
from typing import Any

# Directory one level above the folder that contains this file
# (presumably the repository root -- confirm against the project layout).
ROOT_DIR = Path(__file__).resolve().parent.parent
# Put it at the front of sys.path so the `app` and `scripts` imports that follow
# resolve; the membership check avoids inserting a duplicate entry.
if str(ROOT_DIR) not in sys.path:
    sys.path.insert(0, str(ROOT_DIR))

from app import main
from scripts.dry_run_pdf import dry_run_pdf


def safe_command_path(path: Path | str) -> str:
    """Return *path* as text, wrapped in double quotes when it contains a space.

    Args:
        path: A filesystem path (or already-rendered path text) that will be
            pasted into a command line.

    Returns:
        ``str(path)`` unchanged when it has no space, otherwise the same text
        surrounded by double quotes.
    """
    text = str(path)
    return f'"{text}"' if " " in text else text


def build_external_commands(text_path: Path, output_dir: Path) -> dict[str, str]:
    """Build the benchmark commands and notes for one exported text sample.

    Every entry refers to the same *text_path* so that all voices and
    preprocessors are compared on identical input. Entries are either
    copy-pasteable commands (backslash separators, matching the PowerShell
    fences used by the report writer in this file) or free-text instructions
    for models that have no single command.

    Args:
        text_path: The cleaned Arabic sample file to feed to every candidate.
        output_dir: Directory under which benchmark outputs and reports go.

    Returns:
        A mapping from command key (for example ``"localVoiceBenchmark"``) to
        the command or instruction text.
    """
    text_arg = safe_command_path(text_path)

    def out_path(name: str) -> str:
        # Join first, quote second. Quoting only the directory and appending
        # the suffix afterwards produced `"C:\my dir"\name`, which closes the
        # quote in the middle of the path.
        return safe_command_path(f"{output_dir}\\{name}")

    # Paths that appear in more than one entry.
    mishkala_sample = out_path("arabic-tts-sample-mishkala.txt")
    tashkeel350_sample = out_path("arabic-tts-sample-tashkeel350.txt")
    preprocessor_score_json = out_path("tts-preprocessor-score.json")
    voice_score_json = out_path("voice-listening-score.json")

    return {
        "localVoiceBenchmark": (
            f"python scripts\\benchmark_voices.py --text-file {text_arg} "
            f"--out-dir {out_path('local-voices')} --write-report {out_path('local-voices.md')}"
        ),
        "mossTtsNanoOnnx": (
            "moss-tts-nano generate --backend onnx --language ar "
            f"--text-file {text_arg} --prompt-speech C:\\path\\to\\arabic-reference.wav"
        ),
        "mossTtsNanoServer": "moss-tts-nano serve --backend onnx",
        "supertonicLocal": (
            f"python scripts\\benchmark_voices.py --voices supertonic-ar --text-file {text_arg} "
            f"--out-dir {out_path('supertonic')} --write-report {out_path('supertonic.md')}"
        ),
        "mishkalaTashkeelExternal": (
            "Diacritize the same cleaned sample with flokymind/mishkala and save it beside the plain sample, "
            f"for example {mishkala_sample}. Then synthesize both files with the same voice."
        ),
        "mishkalaVoiceBenchmark": (
            f"python scripts\\benchmark_voices.py --text-file {mishkala_sample} "
            f"--out-dir {out_path('mishkala-local-voices')} "
            f"--write-report {out_path('mishkala-local-voices.md')}"
        ),
        "tashkeel350External": (
            "Diacritize the same cleaned sample with Etherll/Tashkeel-350M and save it beside the plain sample, "
            f"for example {tashkeel350_sample}. Then synthesize plain, Mishkala, "
            "and Tashkeel-350M samples with the same voice before choosing a preprocessor."
        ),
        "tashkeel350VoiceBenchmark": (
            f"python scripts\\benchmark_voices.py --text-file {tashkeel350_sample} "
            f"--out-dir {out_path('tashkeel350-local-voices')} "
            f"--write-report {out_path('tashkeel350-local-voices.md')}"
        ),
        "preprocessorListeningScore": (
            "python scripts\\score_tts_preprocessor.py "
            "--rating plain=4,5,5,4,4 --rating mishkala=5,4,4,4,4 --rating tashkeel350=5,4,4,4,4 "
            f"--write-report {out_path('tts-preprocessor-score.md')} "
            f"--write-json {preprocessor_score_json}"
        ),
        "voiceListeningScore": (
            "python scripts\\score_voice_listening.py "
            "--rating silma-local=5,4,4,5,5 --rating espeak-ar-clear=3,2,4,3,5 "
            f"--write-report {out_path('voice-listening-score.md')} "
            f"--write-json {voice_score_json}"
        ),
        "voicePromotionGate": (
            "python scripts\\model_promotion_gate.py "
            "--candidate-name \"External voice winner\" --kind tts --license Apache-2.0 "
            f"--score-json {voice_score_json} "
            "--same-sample --runtime-ok --privacy-ok --human-reviewed "
            f"--write-report {out_path('voice-promotion-gate.md')}"
        ),
        "preprocessorPromotionGate": (
            "python scripts\\model_promotion_gate.py "
            "--candidate-name \"TTS preprocessor winner\" --kind preprocessor --license Apache-2.0 "
            f"--score-json {preprocessor_score_json} "
            "--same-sample --runtime-ok --privacy-ok --human-reviewed "
            f"--write-report {out_path('preprocessor-promotion-gate.md')}"
        ),
        "omniVoiceExternal": (
            "python -m omnivoice.cli "
            f"--model k2-fsa/OmniVoice --language ar --text-file {text_arg} "
            "--ref-audio C:\\path\\to\\arabic-reference.wav --ref-text \"Arabic reference transcript\""
        ),
        "omniVoiceArabicLoraExternal": (
            "Run OmniVoice with the Arabic LoRA adapter vivooglobal/omnivoice-lora-ar on the same text and "
            "reference audio after the base OmniVoice benchmark works."
        ),
        "tadaExternal": (
            "Benchmark HumeAI/tada-3b-ml externally with language=\"ar\" on the same cleaned sample "
            f"{text_arg}. It is designed to reduce off-script speech, but it reports the Llama 3.2 "
            "license and is a 3B-class strong-worker path, so keep it outside the permissive default."
        ),
        "lahgtnaChatterboxExternal": (
            "python -m chatterbox.tts "
            f"--model oddadmix/lahgtna-chatterbox-v1 --text-file {text_arg} "
            "--audio-prompt-path C:\\path\\to\\arabic-reference.wav --repetition-penalty 1.25"
        ),
        "namaaSaudiTtsExternal": (
            "Use ChatterboxMultilingualTTS with the NAMAA-Space/NAMAA-Saudi-TTS safetensors on "
            f"{text_arg}, language_id=\"ar\", and the same optional reference audio. Benchmark only for "
            "Saudi/Gulf dialect fit, then compare against SILMA, Habibi, Saudi Arabic Qwen3-TTS, and Emirati voices."
        ),
        "saudiChatterboxFineTuneExternal": (
            "Use ChatterboxMultilingualTTS with FatimahEmadEldin/saudi-tts-chatterbox-finetuned T3 weights on "
            f"{text_arg}, language_id=\"ar\", and the same optional reference audio. Benchmark only for "
            "Saudi/Gulf dialect fit, then compare against NAMAA-Saudi-TTS, SILMA, Habibi, Saudi Arabic Qwen3-TTS, "
            "and Emirati voices."
        ),
        "nileTtsExternal": (
            "Benchmark KickItLikeShika/NileTTS-XTTS only for Egyptian/dialectal Arabic using this same cleaned "
            f"sample {text_arg}. It is Apache-2.0, but not an MSA book default."
        ),
        "chatterboxMultilingualExternal": (
            "Use ChatterboxMultilingualTTS.from_pretrained(...).generate(text, language_id=\"ar\", "
            "audio_prompt_path=\"C:\\path\\to\\arabic-reference.wav\") against "
            f"{text_arg}; compare pacing and pronunciation against SILMA and Habibi."
        ),
        "chatterboxMultilingualOnnxExternal": (
            "Benchmark onnx-community/chatterbox-multilingual-ONNX with language_id=\"ar\" against "
            f"{text_arg}; compare CPU/ONNX runtime, repetition, pacing, and pronunciation against SILMA, "
            "Habibi, and the regular Chatterbox-Multilingual path."
        ),
        "ttsArabicOnnxExternal": (
            "Benchmark nipponjo/tts-arabic-onnx with the same cleaned sample "
            f"{text_arg}; try FastPitch, MixerTTS, speaker IDs, pace, and vowelizer options, then compare "
            "CPU runtime and pronunciation against SILMA, Supertonic, MOSS-TTS-Nano, and Chatterbox ONNX. "
            "Confirm model/repo licensing before production use."
        ),
        "sparkTtsArabicExternal": (
            "Spark-TTS Arabic requires the Spark-TTS repo plus diacritized Arabic/reference audio; benchmark it "
            f"externally with {text_arg} only after preparing that reference workflow."
        ),
        "sofeliaTtsExternal": (
            "Sofelia-TTS is a Palestinian Arabic/MiraTTS voice-cloning path; benchmark it only for dialectal text "
            f"using the same sample {text_arg}."
        ),
        "arabicF5TtsCaution": (
            "Arabic-F5-TTS-v2 is non-commercial and requires fully diacritized Arabic; keep it to personal "
            "experiments unless that license and input requirement are acceptable."
        ),
        "threeArabTtsExternal": (
            "Benchmark sherif1313/3arab-TTS-500M-v1 and the VoiceDesign variant on this same cleaned Arabic "
            f"sample: {text_arg}. It is Apache-2.0 and Arabic-only, but new enough that listenability and "
            "long-form stability need manual checks before app wiring."
        ),
        "voxcpm2External": f"Use {text_arg} as the exact Arabic text sample when testing VoxCPM2 externally.",
        "voxtralTtsCaution": (
            "Voxtral TTS supports Arabic on its model card, but it is CC-BY-NC-4.0 and GPU-heavy; "
            f"use {text_arg} only for personal/non-commercial strong-worker listening comparisons."
        ),
        "qwen3TtsCaution": (
            "Do not promote Qwen3-TTS for this Arabic reader until an official Arabic-capable checkpoint "
            "or Arabic fine-tune is verified on this same sample."
        ),
    }


def write_markdown_report(path: Path, result: dict[str, Any]) -> None:
    """Render the benchmark handoff for *result* as Markdown and save it to *path*.

    Args:
        path: Destination file; missing parent directories are created.
        result: Export summary. Reads ``commands`` (mapping of command key to
            text), ``textPath``, ``characters``, ``arabicWords`` and
            ``quality``; ``pdf`` and ``extraction`` are optional and render
            as ``-`` when absent.

    Raises:
        KeyError: If a required key is missing from *result* or from its
            ``commands`` mapping.
    """
    commands = result["commands"]

    # (heading, code-fence language, key into ``commands``), in report order.
    sections = (
        ("Local installed voices:", "powershell", "localVoiceBenchmark"),
        ("MOSS-TTS-Nano ONNX external benchmark:", "powershell", "mossTtsNanoOnnx"),
        ("MOSS-TTS-Nano local server:", "powershell", "mossTtsNanoServer"),
        ("Supertonic 3 local CPU benchmark:", "powershell", "supertonicLocal"),
        ("Mishkala Tashkeel pronunciation preprocessor:", "text", "mishkalaTashkeelExternal"),
        ("Mishkala local voice benchmark:", "powershell", "mishkalaVoiceBenchmark"),
        ("Tashkeel-350M pronunciation preprocessor:", "text", "tashkeel350External"),
        ("Tashkeel-350M local voice benchmark:", "powershell", "tashkeel350VoiceBenchmark"),
        ("Plain vs Mishkala vs Tashkeel-350M listening score:", "powershell", "preprocessorListeningScore"),
        ("Preprocessor promotion gate:", "powershell", "preprocessorPromotionGate"),
        ("Voice listening score:", "powershell", "voiceListeningScore"),
        ("Voice promotion gate:", "powershell", "voicePromotionGate"),
        ("OmniVoice external benchmark:", "powershell", "omniVoiceExternal"),
        ("OmniVoice Arabic LoRA external benchmark:", "text", "omniVoiceArabicLoraExternal"),
        ("TADA multilingual external benchmark:", "text", "tadaExternal"),
        ("Lahgtna Chatterbox external benchmark:", "powershell", "lahgtnaChatterboxExternal"),
        ("NAMAA-Saudi-TTS external benchmark:", "text", "namaaSaudiTtsExternal"),
        ("Saudi Chatterbox fine-tune external benchmark:", "text", "saudiChatterboxFineTuneExternal"),
        ("NileTTS-XTTS Egyptian Arabic benchmark:", "text", "nileTtsExternal"),
        ("Chatterbox-Multilingual external benchmark:", "text", "chatterboxMultilingualExternal"),
        ("Chatterbox-Multilingual ONNX external benchmark:", "text", "chatterboxMultilingualOnnxExternal"),
        ("tts-arabic-onnx external benchmark:", "text", "ttsArabicOnnxExternal"),
        ("Spark-TTS Arabic external benchmark:", "text", "sparkTtsArabicExternal"),
        ("Sofelia-TTS external benchmark:", "text", "sofeliaTtsExternal"),
        ("Arabic-F5-TTS-v2 caution:", "text", "arabicF5TtsCaution"),
        ("3arab-TTS 500M external benchmark:", "text", "threeArabTtsExternal"),
        ("VoxCPM2 external benchmark:", "text", "voxcpm2External"),
        ("Voxtral TTS caution:", "text", "voxtralTtsCaution"),
        ("Qwen3-TTS caution:", "text", "qwen3TtsCaution"),
    )

    report = [
        "# External Arabic TTS Sample",
        "",
        f"PDF: `{result.get('pdf', '-')}`",
        f"Text file: `{result['textPath']}`",
        f"Characters: {result['characters']}",
        f"Arabic words: {result['arabicWords']}",
        f"OCR extraction: `{result.get('extraction', '-')}`",
        f"Quality: `{result['quality']}`",
        "",
        "Use this same cleaned Arabic text for every voice/model comparison. Do not compare voices with different OCR text.",
        "",
        "## Commands",
        "",
    ]

    # Each section is: heading, blank line, fenced command, blank line.
    for heading, fence_lang, command_key in sections:
        report.extend(
            [heading, "", f"```{fence_lang}\n{commands[command_key]}\n```", ""]
        )

    report.extend(
        [
            "## Listening Checklist",
            "",
            "- Arabic pronunciation is clear and not robotic.",
            "- Pauses are comfortable for long book passages.",
            "- Numbers, Quranic symbols, and punctuation are not read strangely.",
            "- Runtime is acceptable before processing a full book.",
            "- Replace placeholder candidate names and licenses in promotion-gate commands before changing the production default.",
        ]
    )

    # The whole document is rendered before anything touches the filesystem,
    # so a missing key fails without leaving a directory behind.
    path.parent.mkdir(parents=True, exist_ok=True)
    path.write_text("\n".join(report) + "\n", encoding="utf-8")


def export_tts_sample(
    pdf_path: Path,
    out_dir: Path = ROOT_DIR / "outputs" / "external-tts-sample",
    max_chars: int = 1200,
    chunk_size: int = main.CLOUD_TTS_MAX_CHARS,
    ocr_engine: str | None = None,
    from_extraction: str | None = None,
    env_file: Path | None = None,
    write_report: bool = True,
) -> dict[str, Any]:
    """Export one cleaned speech sample from *pdf_path* for external TTS tests.

    Runs ``dry_run_pdf`` on the PDF, saves the returned speech sample as
    ``arabic-tts-sample.txt`` in *out_dir*, builds the benchmark commands for
    it, and optionally writes the Markdown handoff next to it.

    Args:
        pdf_path: PDF handed to ``dry_run_pdf``.
        out_dir: Directory for the sample and report; created if missing.
        max_chars: Passed to ``dry_run_pdf`` as ``speech_sample_chars``.
        chunk_size: Passed through to ``dry_run_pdf``.
        ocr_engine: Passed through to ``dry_run_pdf``.
        from_extraction: Passed through to ``dry_run_pdf``.
        env_file: Passed through to ``dry_run_pdf``.
        write_report: When true, also write ``external-tts-sample.md``.

    Returns:
        A summary dict with the written paths, sample statistics, the quality
        fields copied from the dry run, and the generated ``commands``.
        ``reportPath`` is filled in whether or not the report was written.

    Raises:
        RuntimeError: If the dry run reports ``readyForTts`` as falsy.
    """
    extraction_run = dry_run_pdf(
        pdf_path,
        chunk_size=chunk_size,
        ocr_engine=ocr_engine,
        from_extraction=from_extraction,
        env_file=env_file,
        include_speech_text=True,
        speech_sample_chars=max_chars,
    )

    # Guard: refuse to export text the dry run did not clear for synthesis.
    if not extraction_run["readyForTts"]:
        why = "; ".join(str(item) for item in extraction_run.get("qualityReasons", []))
        raise RuntimeError(f"OCR text is not ready for TTS. {why}".strip())

    sample_text = str(extraction_run["speechSampleText"]).strip()

    out_dir.mkdir(parents=True, exist_ok=True)
    sample_file = out_dir / "arabic-tts-sample.txt"
    report_file = out_dir / "external-tts-sample.md"
    sample_file.write_text(sample_text + "\n", encoding="utf-8")

    summary: dict[str, Any] = {
        "ready": True,
        "pdf": str(pdf_path),
        "textPath": str(sample_file),
        "reportPath": str(report_file),
        "characters": len(sample_text),
        "fullSpeechCharacters": extraction_run["speechCharacters"],
    }
    # Fields copied from the dry run under the same name, in output order.
    for field in (
        "arabicWords",
        "quality",
        "qualityScore",
        "qualityReasons",
        "ocrEngine",
        "extraction",
    ):
        summary[field] = extraction_run[field]
    summary["commands"] = build_external_commands(sample_file, out_dir)

    if write_report:
        write_markdown_report(report_file, summary)
    return summary


def _build_arg_parser() -> argparse.ArgumentParser:
    """Create the argument parser for the sample-export command line."""
    cli = argparse.ArgumentParser(
        description="Export the same cleaned Arabic text sample for external TTS benchmarking."
    )
    cli.add_argument("pdf", type=Path, help="Arabic PDF to extract a cleaned speech sample from.")
    cli.add_argument("--out-dir", type=Path, default=ROOT_DIR / "outputs" / "external-tts-sample")
    cli.add_argument("--max-chars", type=int, default=1200, help="Maximum cleaned characters to export.")
    cli.add_argument("--chunk-size", type=int, default=main.CLOUD_TTS_MAX_CHARS)
    cli.add_argument("--ocr-engine", choices=sorted(main.OCR_ENGINE_CHOICES), help="OCR engine to test.")
    cli.add_argument("--from-extraction", help="Apply settings from a benchmark extraction label.")
    cli.add_argument("--env-file", type=Path, help="Load OCR settings from a generated OCR .env snippet.")
    cli.add_argument("--no-report", action="store_true", help="Only write the text file.")
    cli.add_argument("--json", action="store_true", help="Print JSON instead of a compact summary.")
    return cli


def main_cli() -> None:
    """Parse command-line arguments, export the sample, and print the outcome.

    Prints the full result as JSON when ``--json`` is given; otherwise prints
    the sample path and, unless ``--no-report`` was passed, the report path.
    """
    # Force UTF-8 output where the stream supports it, replacing characters
    # that cannot be encoded instead of raising.
    if hasattr(sys.stdout, "reconfigure"):
        sys.stdout.reconfigure(encoding="utf-8", errors="replace")

    options = _build_arg_parser().parse_args()
    want_report = not options.no_report

    outcome = export_tts_sample(
        options.pdf,
        out_dir=options.out_dir,
        max_chars=options.max_chars,
        chunk_size=options.chunk_size,
        ocr_engine=options.ocr_engine,
        from_extraction=options.from_extraction,
        env_file=options.env_file,
        write_report=want_report,
    )

    if options.json:
        print(json.dumps(outcome, ensure_ascii=False, indent=2))
        return

    print(f"Wrote Arabic TTS sample: {outcome['textPath']}")
    if want_report:
        print(f"Wrote benchmark handoff: {outcome['reportPath']}")


# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main_cli()