# Source: Hugging Face Space "abreza/mana-tts" (Running on Zero) — file size: 7,756 bytes

import gradio as gr
from config import custom_css
from synthesis import generate_speech
from GE2PE import GE2PE

# Maps each G2P model name (the keys are also the choices shown in the
# "G2P Model" radio selector) to the path handed to GE2PE as `model_path`.
MODEL_PATHS = {
    "Homo-GE2PE": "./homo-ge2pe",
    "Homo-T5": "./homo-t5",
}

# GE2PE instances keyed by model name; filled on first request by _get_g2p
# so each model is constructed at most once per key.
_g2p_cache = {}

def _get_g2p(model_name: str) -> GE2PE:
    """Return the GE2PE instance for *model_name*, constructing it on first use.

    Instances are stored in the module-level ``_g2p_cache`` so later calls
    with the same name reuse the same object.

    Args:
        model_name: A key of ``MODEL_PATHS``.

    Returns:
        The cached (or newly constructed) ``GE2PE`` object, created with
        ``GPU=False``.

    Raises:
        ValueError: If *model_name* is not a key of ``MODEL_PATHS``.
    """
    try:
        return _g2p_cache[model_name]
    except KeyError:
        # Not built yet; fall through and construct it below.
        pass

    model_dir = MODEL_PATHS.get(model_name)
    if model_dir is None:
        raise ValueError(f"Unknown model: {model_name}")

    instance = GE2PE(model_path=model_dir, GPU=False)
    _g2p_cache[model_name] = instance
    return instance


def ge2pe_infer(model_name: str, text: str, use_rules: bool, use_dict: bool):
    """Convert *text* to phonemes with the selected G2P model.

    Args:
        model_name: Name of the model to use; resolved through ``_get_g2p``.
        text: Input text. Empty, ``None`` or whitespace-only input
            short-circuits to an empty result without loading a model.
        use_rules: Forwarded to the model's ``generate`` as ``use_rules``.
        use_dict: Forwarded to the model's ``generate`` as ``use_dict``.

    Returns:
        The first item of the model's output, ``""`` when there is no input
        or no output, or a ``"⚠️ Error: ..."`` message if anything raised
        during model lookup or generation.
    """
    if not (text and text.strip()):
        return ""

    try:
        outputs = _get_g2p(model_name).generate(
            [text], use_rules=use_rules, use_dict=use_dict
        )
        if outputs:
            return outputs[0]
        return ""
    except Exception as exc:
        # UI boundary: surface the failure as text in the output box
        # instead of letting the exception propagate to the caller.
        return f"⚠️ Error: {exc!s}"


def _build_g2p_tab():
    """Create the grapheme-to-phoneme widgets and wire the Convert button.

    Intended to be called inside the ``with gr.TabItem(...)`` block of
    ``create_interface`` so the components are placed in that tab.
    """
    intro = (
        "Convert Persian text to its phonemic transcription. "
        "Choose between **Homo‑GE2PE** and **Homo‑T5**, optionally applying short‑vowel rules and/or a custom dictionary."
    )
    gr.Markdown(intro)

    with gr.Row():
        model_choice = gr.Radio(
            choices=list(MODEL_PATHS),
            value="Homo-GE2PE",
            label="G2P Model",
        )

    text_box = gr.Textbox(
        label="Persian Text",
        placeholder="مثال: این کتابِ علی است",
        lines=4,
    )

    with gr.Row():
        rules_toggle = gr.Checkbox(value=True, label="Apply short‑vowel rules (optional)")
        dict_toggle = gr.Checkbox(value=False, label="Use custom dictionary (optional)")

    convert_button = gr.Button("Convert", variant="primary")
    phoneme_box = gr.Textbox(label="Phoneme Output", interactive=False)

    # Input order must match ge2pe_infer's positional parameters.
    convert_button.click(
        fn=ge2pe_infer,
        inputs=[model_choice, text_box, rules_toggle, dict_toggle],
        outputs=[phoneme_box],
    )

    sample_sentences = [
        ["او مرد خوبی است."],
        ["او مرد."],
        ["این کتابِ علی است."],
        ["به خانه آمد."],
    ]
    gr.Examples(examples=sample_sentences, inputs=[text_box])


def _build_tts_tab():
    """Create the text-to-speech widgets and wire the Generate Speech button.

    Intended to be called inside the ``with gr.TabItem(...)`` block of
    ``create_interface`` so the components are placed in that tab.
    """
    intro = (
        "Generate natural‑sounding Persian speech from your text using Tacotron2 + HiFiGAN.\n\n"
        "✨ **New features:**\n"
        "- **Long text support:** Automatically splits text into natural segments with optional pauses\n"
        "- **Smart number conversion:** Numbers (۱۴۰۲, 2025, ۵۰۰۰) are automatically converted to text\n"
    )
    gr.Markdown(intro)

    with gr.Row():
        with gr.Column(scale=2):
            text_box = gr.Textbox(
                label="Persian Text",
                placeholder="متن فارسی خود را اینجا بنویسید...",
                lines=8,
            )

            with gr.Row():
                pause_toggle = gr.Checkbox(
                    value=True,
                    label="Add pauses between segments",
                    info="Adds 300ms pause between text segments for natural flow",
                )

            speak_button = gr.Button("Generate Speech", variant="primary", size="lg")

    audio_player = gr.Audio(label="Generated Speech", type="filepath")

    # The second input is a constant None held in a gr.State; what
    # generate_speech does with that argument is defined in the synthesis
    # module and is not visible from here.
    speak_button.click(
        fn=generate_speech,
        inputs=[text_box, gr.State(None), pause_toggle],
        outputs=[audio_player],
    )

    sample_texts = [
        ["سلام، چطور هستید؟"],
        ["ایران سرزمین زیبایی‌ها و افتخارات است."],
        ["فناوری هوش مصنوعی به سرعت در حال پیشرفت است."],
        ["مدل تولید گفتار با دادگان نسل مانا"],
        ["در سال 1402 تعداد 5000 دانشجو در دانشگاه ثبت‌نام کردند."],
        ["شماره تماس من 912 345 6789 است."],
        [
            "هوش مصنوعی یکی از شگفت‌انگیزترین دستاوردهای بشر در قرن بیست و یکم است. "
            "این فناوری توانایی یادگیری، استدلال و حل مسئله را به ماشین‌ها می‌دهد. "
            "از پردازش زبان طبیعی گرفته تا بینایی کامپیوتری، هوش مصنوعی در حال تغییر دنیای ماست."
        ],
    ]
    gr.Examples(examples=sample_texts, inputs=[text_box])


def create_interface():
    """Assemble the two-tab Gradio app (G2P and TTS) and return it.

    The layout is: a title/intro Markdown block, a tab set with the
    grapheme-to-phoneme tab followed by the text-to-speech tab, and a closing
    Markdown block with acknowledgments and citations.

    Returns:
        The ``gr.Blocks`` object; this function does not launch it.
    """
    with gr.Blocks(title="Persian Speech Suite", css=custom_css) as demo:
        gr.Markdown(
            "# Persian Speech Suite: GE2PE & TTS\n"
            "A unified playground for Persian grapheme‑to‑phoneme conversion (GE2PE) **and** text‑to‑speech synthesis (Mana TTS).\n\n"
            "✨ **Now supports long texts!** The TTS system automatically splits long texts into natural segments. And also converts numbers to Persian text for better pronunciation."
        )

        with gr.Tabs():
            with gr.TabItem("Grapheme → Phoneme (GE2PE)"):
                _build_g2p_tab()

            with gr.TabItem("Text‑to‑Speech"):
                _build_tts_tab()

        # Footer text is kept at its original indentation so the string
        # passed to gr.Markdown is unchanged.
        gr.Markdown(
            """
            ### Acknowledgments

            - [**Nasl‑e‑Mana**](https://naslemana.com/), the monthly magazine of the blind community of Iran
            - [ManaTTS Dataset](https://huggingface.co/datasets/MahtaFetrat/Mana-TTS)
            - [Persian‑MultiSpeaker‑Tacotron2](https://github.com/MahtaFetrat/Persian-MultiSpeaker-Tacotron2/)
            - [Homo-GE2PE (Github)](https://github.com/MahtaFetrat/Homo-GE2PE-Persian/)
            - [Base GE2PE Paper](https://aclanthology.org/2024.findings-emnlp.196/)
            - [Base GE2PE Model](https://github.com/Sharif-SLPL/GE2PE)
            - [HomoRich Dataset (Huggingface)](https://huggingface.co/datasets/MahtaFetrat/HomoRich-G2P-Persian)
            - [HomoRich Dataset (Github)](https://github.com/MahtaFetrat/HomoRich-G2P-Persian)
            - [SentenceBench Persian G2P Benchmark](https://huggingface.co/datasets/MahtaFetrat/SentenceBench)
            ### Citation

            ```bibtex
            @misc{qharabagh2025fastfancyrethinkingg2p,
              title={Fast, Not Fancy: Rethinking G2P with Rich Data and Rule-Based Models},
              author={Mahta Fetrat Qharabagh and Zahra Dehghanian and Hamid R. Rabiee},
              year={2025},
              eprint={2505.12973},
              archivePrefix={arXiv},
              primaryClass={cs.CL},
            }

            @article{fetrat2024manatts,
              title={ManaTTS Persian: A Recipe for Creating TTS Datasets for Lower-Resource Languages},
              author={Mahta Fetrat Qharabagh and Zahra Dehghanian and Hamid R. Rabiee},
              journal={arXiv preprint arXiv:2409.07259},
              year={2024},
            }
            ```
            """
        )

    return demo