|
|
|
|
|
import os
import tempfile
from typing import Optional
from xml.sax.saxutils import escape

import edge_tts
import gradio as gr
|
|
|
|
|
def build_ssml_from_dialog(text: str, default_voice: str = "fa-IR-DilaraNeural") -> str:
    """Convert speaker-labelled dialog text into a multi-voice SSML document.

    The input is processed line by line. A line starting with 'مرد:' is
    assigned the male voice, a line starting with 'زن:' the female voice,
    and any other line ``default_voice``. The label is removed, the
    remaining text is XML-escaped and wrapped in a ``<voice>`` element
    followed by a 300 ms ``<break>``.

    Args:
        text: Raw dialog text, one utterance per line.
        default_voice: Voice name used for lines without a known label.

    Returns:
        The SSML document as a single string, or ``""`` when the input
        contains nothing to speak (empty, whitespace-only, or only bare
        speaker labels).
    """
    voice_by_prefix = {
        "مرد:": "fa-IR-FaridNeural",
        "زن:": "fa-IR-DilaraNeural",
    }

    segments = []
    for raw_line in (text or "").splitlines():
        line = raw_line.strip()
        if not line:
            continue

        voice = default_voice
        for prefix, prefixed_voice in voice_by_prefix.items():
            if line.startswith(prefix):
                voice = prefixed_voice
                line = line[len(prefix):].strip()
                break

        # A bare label such as "مرد:" carries nothing to speak; skip it
        # rather than emitting an empty <voice> element.
        if not line:
            continue

        # The voice name lands inside a double-quoted attribute, so quotes
        # must be escaped in addition to the usual &, < and >.
        voice_attr = escape(voice, {'"': "&quot;"})
        segments.append(
            f'<voice name="{voice_attr}">{escape(line)}<break time="300ms"/></voice>'
        )

    if not segments:
        return ""

    return (
        '<?xml version="1.0" encoding="utf-8"?>'
        '<speak version="1.0" xml:lang="fa-IR">'
        + "".join(segments)
        + "</speak>"
    )
|
|
|
|
|
def _split_dialog(text, default_voice="fa-IR-DilaraNeural") -> list:
    """Split dialog text into ``(voice, content)`` pairs, one per spoken line."""
    voice_by_prefix = {
        "مرد:": "fa-IR-FaridNeural",
        "زن:": "fa-IR-DilaraNeural",
    }
    segments = []
    for raw_line in text.splitlines():
        line = raw_line.strip()
        if not line:
            continue
        voice = default_voice
        for prefix, prefixed_voice in voice_by_prefix.items():
            if line.startswith(prefix):
                voice = prefixed_voice
                line = line[len(prefix):].strip()
                break
        # Lines that consist only of a speaker label have nothing to speak.
        if line:
            segments.append((voice, line))
    return segments


def _signed(value, unit) -> str:
    """Format a slider value with an explicit sign, e.g. ``+5%`` or ``-10Hz``."""
    return f"{int(round(value)):+d}{unit}"


def process(text: str, rate: int, pitch: int, volume: int) -> Optional[str]:
    """Synthesise dialog text to an MP3 file, one voice per labelled line.

    Each non-empty line is spoken with the voice selected by its 'مرد:' /
    'زن:' label (the female voice when unlabelled). Every line is sent to
    edge-tts as its own request and the returned MP3 data is appended to a
    single temporary file.

    Per-line requests are used because edge-tts escapes the text it is
    given and wraps it in its own single-voice SSML (see the edge-tts
    README on custom SSML), so a hand-built multi-voice SSML document
    cannot be passed through it. As a consequence no explicit pause is
    inserted between lines.

    Args:
        text: Dialog text, one utterance per line.
        rate: Speaking-rate offset in percent (0 = unchanged).
        pitch: Pitch offset in Hz (0 = unchanged).
        volume: Volume offset in percent (0 = unchanged).

    Returns:
        Path of the generated ``.mp3`` file, or ``None`` (after showing a
        Gradio warning) when there is nothing to synthesise.

    Raises:
        Exception: Whatever edge-tts raises on a failed request; the
            partially written temporary file is removed first.
    """
    if not text or not text.strip():
        gr.Warning("لطفا متنی وارد کنید.")
        return None

    segments = _split_dialog(text)
    if not segments:
        gr.Warning("متن نامعتبر است.")
        return None

    # edge-tts rejects prosody values without an explicit sign, so "50%"
    # must be sent as "+50%".
    prosody = {
        "rate": _signed(rate, "%"),
        "pitch": _signed(pitch, "Hz"),
        "volume": _signed(volume, "%"),
    }

    # delete=False: the file must outlive this function so the caller
    # (the Gradio audio output) can read it by path.
    audio_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    output_path = audio_file.name
    try:
        with audio_file:
            for voice, content in segments:
                communicate = edge_tts.Communicate(content, voice, **prosody)
                for chunk in communicate.stream_sync():
                    # The stream also carries word-boundary metadata; only
                    # the audio chunks belong in the MP3 file.
                    if chunk["type"] == "audio":
                        audio_file.write(chunk["data"])
    except Exception:
        # Do not leave a truncated file behind when synthesis fails.
        if os.path.exists(output_path):
            os.remove(output_path)
        raise

    return output_path
|
|
|
|
|
def main():
    """Clear the terminal, assemble the Gradio interface and launch it."""
    # Pick the platform's screen-clearing command.
    clear_command = "clear"
    if os.name == "nt":
        clear_command = "cls"
    os.system(clear_command)

    dialog_box = gr.TextArea(
        lines=8,
        rtl=True,
        label="متن (از 'مرد:' و 'زن:' برای دیالوگ استفاده کنید)",
        placeholder="مثال:\nمرد: سلام\nزن: سلام، خوبی؟\nمرد: خوبم، ممنون.",
    )

    # All three sliders share the same range, step and neutral default.
    slider_options = {"step": 1, "value": 0, "minimum": -100, "maximum": 100}
    speed_control = gr.Slider(label="سرعت", **slider_options)
    pitch_control = gr.Slider(label="فرکانس", **slider_options)
    volume_control = gr.Slider(label="حجم صدا", **slider_options)

    audio_player = gr.Audio(label="نتیجه")

    app = gr.Interface(
        fn=process,
        inputs=[dialog_box, speed_control, pitch_control, volume_control],
        outputs=[audio_player],
        title="DT TTS (dialog support)",
        description="پشتیبانی از خطوط دیالوگ با برچسب 'مرد:' و 'زن:' — هر خط با صدای مربوط تولید میشود.",
        flagging_mode="never",
    )
    app.launch()
|
|
|
|
|
if __name__ == "__main__":
    # Run the app only when this file is executed as a script.
    main()