File size: 7,756 Bytes
10e72d3 eb57aa1 10e72d3 eb57aa1 da2ee9a 766414c da2ee9a eb57aa1 da2ee9a eb57aa1 da2ee9a 766414c 10e72d3 eb57aa1 da2ee9a eb57aa1 da2ee9a eb57aa1 766414c da2ee9a eb57aa1 10e72d3 eb57aa1 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 |
import gradio as gr
from config import custom_css
from synthesis import generate_speech
from GE2PE import GE2PE
# Model name offered in the UI selector -> directory handed to GE2PE as
# ``model_path`` when that model is first requested.
MODEL_PATHS = {
    "Homo-GE2PE": "./homo-ge2pe",
    "Homo-T5": "./homo-t5",
}

# GE2PE instances that have already been constructed, keyed by model name.
# Populated on demand by _get_g2p so each model is built at most once.
_g2p_cache = {}
def _get_g2p(model_name: str) -> GE2PE:
    """Return the GE2PE instance for *model_name*, constructing it on first use.

    Instances are memoised in the module-level ``_g2p_cache`` so repeated
    requests for the same model reuse one object.

    Raises:
        ValueError: if *model_name* has no entry in ``MODEL_PATHS``.
    """
    try:
        return _g2p_cache[model_name]
    except KeyError:
        # First request for this model: fall through and build it.
        pass

    model_dir = MODEL_PATHS.get(model_name)
    if model_dir is None:
        raise ValueError(f"Unknown model: {model_name}")

    instance = GE2PE(model_path=model_dir, GPU=False)
    _g2p_cache[model_name] = instance
    return instance
def ge2pe_infer(model_name: str, text: str, use_rules: bool, use_dict: bool):
    """Run grapheme-to-phoneme conversion for the UI and return a string.

    Args:
        model_name: Key identifying which G2P model to use.
        text: Input text; empty, ``None`` or whitespace-only input yields "".
        use_rules: Forwarded to the model's ``generate`` as ``use_rules``.
        use_dict: Forwarded to the model's ``generate`` as ``use_dict``.

    Returns:
        The first element of the model's output, "" when the output is empty,
        or a "⚠️ Error: ..." message if anything raised along the way. This
        function is a UI callback, so failures are reported as text instead of
        propagating.
    """
    has_content = bool(text) and bool(text.strip())
    if not has_content:
        return ""

    try:
        # Model lookup, generation and result indexing all stay inside the
        # try so every failure is surfaced the same way.
        g2p = _get_g2p(model_name)
        phonemes = g2p.generate([text], use_rules=use_rules, use_dict=use_dict)
        if phonemes:
            return phonemes[0]
        return ""
    except Exception as exc:
        return f"⚠️ Error: {exc!s}"
# Footer markdown (acknowledgments + citations) rendered below the tabs.
_ACKNOWLEDGMENTS_MD = """
### Acknowledgments
- [**Nasl‑e‑Mana**](https://naslemana.com/), the monthly magazine of the blind community of Iran
- [ManaTTS Dataset](https://huggingface.co/datasets/MahtaFetrat/Mana-TTS)
- [Persian‑MultiSpeaker‑Tacotron2](https://github.com/MahtaFetrat/Persian-MultiSpeaker-Tacotron2/)
- [Homo-GE2PE (Github)](https://github.com/MahtaFetrat/Homo-GE2PE-Persian/)
- [Base GE2PE Paper](https://aclanthology.org/2024.findings-emnlp.196/)
- [Base GE2PE Model](https://github.com/Sharif-SLPL/GE2PE)
- [HomoRich Dataset (Huggingface)](https://huggingface.co/datasets/MahtaFetrat/HomoRich-G2P-Persian)
- [HomoRich Dataset (Github)](https://github.com/MahtaFetrat/HomoRich-G2P-Persian)
- [SentenceBench Persian G2P Benchmark](https://huggingface.co/datasets/MahtaFetrat/SentenceBench)
### Citation
```bibtex
@misc{qharabagh2025fastfancyrethinkingg2p,
title={Fast, Not Fancy: Rethinking G2P with Rich Data and Rule-Based Models},
author={Mahta Fetrat Qharabagh and Zahra Dehghanian and Hamid R. Rabiee},
year={2025},
eprint={2505.12973},
archivePrefix={arXiv},
primaryClass={cs.CL},
}
@article{fetrat2024manatts,
title={ManaTTS Persian: A Recipe for Creating TTS Datasets for Lower-Resource Languages},
author={Mahta Fetrat Qharabagh and Zahra Dehghanian and Hamid R. Rabiee},
journal={arXiv preprint arXiv:2409.07259},
year={2024},
}
```
"""


def _build_g2p_tab():
    """Create the widgets and wiring of the grapheme-to-phoneme tab.

    Must be called while the target ``gr.TabItem`` context is open, because
    the components attach themselves to whichever container is active.
    """
    # NOTE(review): the widget nesting below (what sits inside each Row) was
    # reconstructed from a source whose indentation was lost — confirm layout.
    gr.Markdown(
        "Convert Persian text to its phonemic transcription. "
        "Choose between **Homo‑GE2PE** and **Homo‑T5**, optionally applying short‑vowel rules and/or a custom dictionary."
    )
    with gr.Row():
        model_selector = gr.Radio(
            choices=list(MODEL_PATHS.keys()),
            value="Homo-GE2PE",
            label="G2P Model",
        )
    g2p_input = gr.Textbox(
        label="Persian Text",
        placeholder="مثال: این کتابِ علی است",
        lines=4,
    )
    with gr.Row():
        g2p_use_rules = gr.Checkbox(value=True, label="Apply short‑vowel rules (optional)")
        g2p_use_dict = gr.Checkbox(value=False, label="Use custom dictionary (optional)")
    g2p_button = gr.Button("Convert", variant="primary")
    g2p_output = gr.Textbox(label="Phoneme Output", interactive=False)

    # Input order must match ge2pe_infer(model_name, text, use_rules, use_dict).
    g2p_button.click(
        fn=ge2pe_infer,
        inputs=[model_selector, g2p_input, g2p_use_rules, g2p_use_dict],
        outputs=[g2p_output],
    )
    gr.Examples(
        examples=[
            ["او مرد خوبی است."],
            ["او مرد."],
            ["این کتابِ علی است."],
            ["به خانه آمد."],
        ],
        inputs=[g2p_input],
    )


def _build_tts_tab():
    """Create the widgets and wiring of the text-to-speech tab.

    Must be called while the target ``gr.TabItem`` context is open, because
    the components attach themselves to whichever container is active.
    """
    # NOTE(review): the widget nesting below (what sits inside the Column and
    # inner Row) was reconstructed from a source whose indentation was lost —
    # confirm layout.
    gr.Markdown(
        "Generate natural‑sounding Persian speech from your text using Tacotron2 + HiFiGAN.\n\n"
        "✨ **New features:**\n"
        "- **Long text support:** Automatically splits text into natural segments with optional pauses\n"
        "- **Smart number conversion:** Numbers (۱۴۰۲, 2025, ۵۰۰۰) are automatically converted to text\n"
    )
    with gr.Row():
        with gr.Column(scale=2):
            tts_input = gr.Textbox(
                label="Persian Text",
                placeholder="متن فارسی خود را اینجا بنویسید...",
                lines=8,
            )
            with gr.Row():
                tts_add_pauses = gr.Checkbox(
                    value=True,
                    label="Add pauses between segments",
                    info="Adds 300ms pause between text segments for natural flow",
                )
            tts_button = gr.Button("Generate Speech", variant="primary", size="lg")
            tts_output = gr.Audio(label="Generated Speech", type="filepath")

    # generate_speech is called with three positional inputs; the middle one
    # is a constant None supplied through a State holder rather than a widget.
    tts_button.click(
        fn=generate_speech,
        inputs=[tts_input, gr.State(None), tts_add_pauses],
        outputs=[tts_output],
    )
    gr.Examples(
        examples=[
            ["سلام، چطور هستید؟"],
            ["ایران سرزمین زیباییها و افتخارات است."],
            ["فناوری هوش مصنوعی به سرعت در حال پیشرفت است."],
            ["مدل تولید گفتار با دادگان نسل مانا"],
            ["در سال 1402 تعداد 5000 دانشجو در دانشگاه ثبتنام کردند."],
            ["شماره تماس من 912 345 6789 است."],
            [
                "هوش مصنوعی یکی از شگفتانگیزترین دستاوردهای بشر در قرن بیست و یکم است. "
                "این فناوری توانایی یادگیری، استدلال و حل مسئله را به ماشینها میدهد. "
                "از پردازش زبان طبیعی گرفته تا بینایی کامپیوتری، هوش مصنوعی در حال تغییر دنیای ماست."
            ],
        ],
        inputs=[tts_input],
    )


def create_interface():
    """Build the Gradio Blocks app and return it without launching.

    The page consists of an intro banner, a tab for grapheme-to-phoneme
    conversion, a tab for text-to-speech, and an acknowledgments/citation
    footer.

    Returns:
        The ``gr.Blocks`` object; the caller is responsible for launching it.
    """
    with gr.Blocks(title="Persian Speech Suite", css=custom_css) as demo:
        gr.Markdown(
            "# Persian Speech Suite: GE2PE & TTS\n"
            "A unified playground for Persian grapheme‑to‑phoneme conversion (GE2PE) **and** text‑to‑speech synthesis (Mana TTS).\n\n"
            "✨ **Now supports long texts!** The TTS system automatically splits long texts into natural segments. And also converts numbers to Persian text for better pronunciation."
        )
        with gr.Tabs():
            with gr.TabItem("Grapheme → Phoneme (GE2PE)"):
                _build_g2p_tab()
            with gr.TabItem("Text‑to‑Speech"):
                _build_tts_tab()
        gr.Markdown(_ACKNOWLEDGMENTS_MD)
    return demo
|