# app.py - dialog support for edge-tts: one synthesis request per dialog line
"""Gradio front end that reads a (Persian) dialog aloud with edge-tts.

Each input line may start with a speaker prefix ('مرد:' or 'زن:').  The line
is then spoken with the voice mapped to that prefix; lines without a prefix
use the default voice.
"""
import contextlib
import os
import tempfile
from typing import List, Optional, Tuple
from xml.sax.saxutils import escape, quoteattr

import edge_tts
import gradio as gr

_DEFAULT_VOICE = "fa-IR-DilaraNeural"

# Speaker prefix -> edge-tts voice name.
_VOICE_MAP = {
    "مرد:": "fa-IR-FaridNeural",
    "زن:": "fa-IR-DilaraNeural",
}

# Pause emitted after every dialog line in the generated SSML.
# NOTE(review): the original markup was lost from this file (only the
# docstring's "short break" survives), so the exact duration is a guess.
_SSML_BREAK = "300ms"


def _parse_dialog(text: str, default_voice: str) -> List[Tuple[str, str]]:
    """Split *text* into ``(voice, content)`` pairs, one per spoken line.

    Blank lines are ignored.  A line that starts with a known speaker prefix
    gets that speaker's voice and has the prefix removed; any other line gets
    *default_voice*.  Lines that are empty once the prefix is removed are
    dropped because there is nothing to synthesize for them.
    """
    segments: List[Tuple[str, str]] = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        voice, content = default_voice, line
        for prefix, voice_name in _VOICE_MAP.items():
            if line.startswith(prefix):
                voice, content = voice_name, line[len(prefix):].strip()
                break
        if content:
            segments.append((voice, content))
    return segments


def build_ssml_from_dialog(text: str, default_voice: str = "fa-IR-DilaraNeural") -> str:
    """Render a dialog as a multi-voice SSML document.

    The input is parsed line by line; 'مرد:' / 'زن:' prefixes select the
    voice, and every line is wrapped in its own ``<voice>`` element followed
    by a short ``<break>``.  Text content is XML-escaped.

    ``process`` does not feed this document to edge-tts (the library escapes
    its input and does not accept custom SSML); the function is kept for
    callers that want the SSML itself.

    NOTE(review): the tag layout is reconstructed from this function's
    original docstring; the previous version emitted the escaped text with no
    tags at all and never used the selected voice.

    Args:
        text: Dialog text, one utterance per line.
        default_voice: Voice used for lines without a speaker prefix.

    Returns:
        The SSML string, or ``""`` when *text* contains nothing to speak.
    """
    segments = _parse_dialog(text, default_voice)
    if not segments:
        return ""
    body = "".join(
        f"<voice name={quoteattr(voice)}>{escape(content)}"
        f'<break time="{_SSML_BREAK}"/></voice>'
        for voice, content in segments
    )
    return (
        '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" '
        f'xml:lang="fa-IR">{body}</speak>'
    )


def _signed(value: float, unit: str) -> str:
    """Format a slider value as an explicitly signed offset, e.g. ``+10%``.

    edge-tts validates rate/volume/pitch against a pattern that requires a
    leading ``+`` or ``-``, so a bare ``10%`` is rejected.
    """
    return f"{int(round(value)):+d}{unit}"


def process(text: str, rate: int, pitch: int, volume: int) -> Optional[str]:
    """Synthesize the dialog in *text* and return the path of the MP3 file.

    Every dialog line is sent to edge-tts as its own request, using the voice
    selected by the line's speaker prefix, and the returned audio chunks are
    appended to a single temporary ``.mp3`` file in line order.

    Args:
        text: Dialog text, one utterance per line.
        rate: Speaking-rate offset in percent (slider value, 0 = unchanged).
        pitch: Pitch offset in Hz (slider value, 0 = unchanged).
        volume: Volume offset in percent (slider value, 0 = unchanged).

    Returns:
        Path of the generated audio file, or ``None`` (after showing a Gradio
        warning) when there is nothing to synthesize.

    Raises:
        Exception: whatever edge-tts raises on a failed request; the partially
            written temporary file is removed before re-raising.
    """
    if not text or not text.strip():
        gr.Warning("لطفا متنی وارد کنید.")
        return None

    segments = _parse_dialog(text, _DEFAULT_VOICE)
    if not segments:
        gr.Warning("متن نامعتبر است.")
        return None

    rate_str = _signed(rate, "%")
    pitch_str = _signed(pitch, "Hz")
    volume_str = _signed(volume, "%")

    # delete=False: Gradio reads the file after this function has returned.
    temp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    try:
        with temp_file:
            for voice, content in segments:
                communicate = edge_tts.Communicate(
                    text=content,
                    voice=voice,
                    rate=rate_str,
                    pitch=pitch_str,
                    volume=volume_str,
                )
                # Audio arrives as MP3 chunks; appending the chunks of
                # successive requests yields one continuous file.
                for chunk in communicate.stream_sync():
                    if chunk["type"] == "audio":
                        temp_file.write(chunk["data"])
    except Exception:
        # Do not leave a truncated file behind when a request fails.
        with contextlib.suppress(OSError):
            os.unlink(temp_file.name)
        raise
    return temp_file.name


def main():
    """Build the Gradio interface and start the web server."""
    os.system("cls" if os.name == "nt" else "clear")

    text_area = gr.TextArea(
        lines=8,
        rtl=True,
        label="متن (از 'مرد:' و 'زن:' برای دیالوگ استفاده کنید)",
        placeholder="مثال:\nمرد: سلام\nزن: سلام، خوبی؟\nمرد: خوبم، ممنون.",
    )
    rate_slider = gr.Slider(label="سرعت", step=1, value=0, minimum=-100, maximum=100)
    pitch_slider = gr.Slider(label="فرکانس", step=1, value=0, minimum=-100, maximum=100)
    volume_slider = gr.Slider(label="حجم صدا", step=1, value=0, minimum=-100, maximum=100)
    result_audio = gr.Audio(label="نتیجه")

    interface = gr.Interface(
        fn=process,
        inputs=[text_area, rate_slider, pitch_slider, volume_slider],
        outputs=[result_audio],
        title="DT TTS (dialog support)",
        description="پشتیبانی از خطوط دیالوگ با برچسب 'مرد:' و 'زن:' — هر خط با صدای مربوط تولید می‌شود.",
        flagging_mode="never",
    )
    interface.launch()


if __name__ == "__main__":
    main()