# Ttsnew1 / app.py
# Rezfars's picture
# Update app.py
# 6f371a4 verified
# app.py - fixed version for dialog support with edge-tts SSML
import os
import tempfile
import edge_tts
import gradio as gr
from xml.sax.saxutils import escape
def build_ssml_from_dialog(text: str, default_voice: str = "fa-IR-DilaraNeural") -> str:
    """Build an SSML document that assigns a voice to every dialog line.

    The input is processed line by line. A line starting with ``مرد:`` is
    wrapped in the Farid voice, a line starting with ``زن:`` in the Dilara
    voice, and any other line in *default_voice*. The label itself is removed
    from the spoken content, and each segment ends with a 300 ms break.

    Args:
        text: Raw multi-line dialog text.
        default_voice: Voice name used for lines without a recognised label.

    Returns:
        The SSML document as a single string, or ``""`` when the input holds
        no speakable content (blank input, or lines consisting only of a
        speaker label).
    """
    voice_map = {
        "مرد:": "fa-IR-FaridNeural",
        "زن:": "fa-IR-DilaraNeural",
    }

    segments = []
    for raw_line in text.splitlines():
        content = raw_line.strip()
        chosen_voice = default_voice
        for prefix, voice_name in voice_map.items():
            if content.startswith(prefix):
                content = content[len(prefix):].strip()
                chosen_voice = voice_name
                break

        # Skip blank lines and lines that carried only a speaker label;
        # they would otherwise produce an empty <voice> element.
        if not content:
            continue

        # The voice name lands inside a double-quoted attribute, so quotes
        # must be escaped in addition to the usual &, < and >.
        voice_attr = escape(chosen_voice, {'"': "&quot;"})
        segments.append(
            f'<voice name="{voice_attr}">{escape(content)}<break time="300ms"/></voice>'
        )

    if not segments:
        return ""

    return (
        '<?xml version="1.0" encoding="utf-8"?>'
        '<speak version="1.0" xml:lang="fa-IR">'
        + "".join(segments)
        + "</speak>"
    )
def _signed(value, unit: str) -> str:
    """Format a slider offset with an explicit sign, e.g. ``+10%`` or ``-5Hz``."""
    return f"{int(value):+d}{unit}"


def _dialog_segments(text: str, default_voice: str) -> list:
    """Split dialog text into ``(voice, content)`` pairs, one per spoken line.

    Lines starting with ``مرد:`` or ``زن:`` get the matching voice and lose
    the label; other lines use *default_voice*. Lines with nothing left to
    speak are dropped.
    """
    voice_by_prefix = {
        "مرد:": "fa-IR-FaridNeural",
        "زن:": "fa-IR-DilaraNeural",
    }
    pairs = []
    for raw_line in text.splitlines():
        content = raw_line.strip()
        voice = default_voice
        for prefix, voice_name in voice_by_prefix.items():
            if content.startswith(prefix):
                content = content[len(prefix):].strip()
                voice = voice_name
                break
        if content:
            pairs.append((voice, content))
    return pairs


def process(text: str, rate: int, pitch: int, volume: int) -> str:
    """Synthesize dialog text to a single MP3 file and return its path.

    Each non-empty line is spoken with the voice selected by its ``مرد:`` /
    ``زن:`` label (Dilara when there is no label), and the audio of all lines
    is written back-to-back into one temporary ``.mp3`` file.

    Args:
        text: Multi-line dialog text from the UI.
        rate: Speaking-rate offset in percent (0 means unchanged).
        pitch: Pitch offset in Hz (0 means unchanged).
        volume: Volume offset in percent (0 means unchanged).

    Returns:
        Path of the generated MP3 file, or ``""`` (after showing a warning in
        the UI) when there is nothing to synthesize.

    Raises:
        Exception: Any error raised by edge-tts during synthesis is
            propagated after the partially written file has been removed.
    """
    if not text.strip():
        gr.Warning("لطفا متنی وارد کنید.")
        return ""

    segments = _dialog_segments(text, default_voice="fa-IR-DilaraNeural")
    if not segments:
        gr.Warning("متن نامعتبر است.")
        return ""

    # edge-tts requires an explicit sign on every prosody value ("+10%",
    # "-5Hz"); a bare "10%" is rejected, so always format with the sign.
    prosody = {
        "rate": _signed(rate, "%"),
        "pitch": _signed(pitch, "Hz"),
        "volume": _signed(volume, "%"),
    }

    temp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    temp_name = temp_file.name
    try:
        with temp_file:
            # Communicate takes plain text and exactly one voice, so a
            # multi-voice dialog is produced by synthesizing line by line and
            # appending each line's MP3 stream to the same file.
            for voice, content in segments:
                communicate = edge_tts.Communicate(text=content, voice=voice, **prosody)
                for chunk in communicate.stream_sync():
                    if chunk["type"] == "audio":
                        temp_file.write(chunk["data"])
    except Exception:
        # delete=False means nobody else cleans this up; do not leave a
        # truncated file behind when synthesis fails.
        os.remove(temp_name)
        raise

    return temp_name
def main():
    """Clear the terminal, assemble the Gradio interface and launch it."""
    clear_command = "clear" if os.name != "nt" else "cls"
    os.system(clear_command)

    def offset_slider(caption):
        # The three sliders differ only in their label: integer steps
        # from -100 to 100, starting at 0.
        return gr.Slider(label=caption, minimum=-100, maximum=100, value=0, step=1)

    dialog_box = gr.TextArea(
        label="متن (از 'مرد:' و 'زن:' برای دیالوگ استفاده کنید)",
        placeholder="مثال:\nمرد: سلام\nزن: سلام، خوبی؟\nمرد: خوبم، ممنون.",
        rtl=True,
        lines=8,
    )
    # Order matters: it must match process(text, rate, pitch, volume).
    prosody_controls = [
        offset_slider("سرعت"),
        offset_slider("فرکانس"),
        offset_slider("حجم صدا"),
    ]
    audio_out = gr.Audio(label="نتیجه")

    app = gr.Interface(
        fn=process,
        inputs=[dialog_box, *prosody_controls],
        outputs=[audio_out],
        title="DT TTS (dialog support)",
        description="پشتیبانی از خطوط دیالوگ با برچسب 'مرد:' و 'زن:' — هر خط با صدای مربوط تولید می‌شود.",
        flagging_mode="never",
    )
    app.launch()


if __name__ == "__main__":
    main()