Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# app.py -
|
| 2 |
import os
|
| 3 |
import tempfile
|
| 4 |
import edge_tts
|
|
@@ -7,11 +7,10 @@ from xml.sax.saxutils import escape
|
|
| 7 |
|
| 8 |
def build_ssml_from_dialog(text: str, default_voice: str = "fa-IR-DilaraNeural") -> str:
|
| 9 |
"""
|
| 10 |
-
Parses
|
| 11 |
-
|
| 12 |
-
Returns an SSML string that wraps each segment in a <voice> tag and adds short breaks.
|
| 13 |
"""
|
| 14 |
-
lines = [l.strip() for l in text.splitlines() if l.strip()
|
| 15 |
if not lines:
|
| 16 |
return ""
|
| 17 |
|
|
@@ -24,128 +23,77 @@ def build_ssml_from_dialog(text: str, default_voice: str = "fa-IR-DilaraNeural")
|
|
| 24 |
for line in lines:
|
| 25 |
chosen_voice = default_voice
|
| 26 |
content = line
|
| 27 |
-
# check explicit prefix (case-sensitive for Persian labels)
|
| 28 |
for prefix, voice_name in voice_map.items():
|
| 29 |
if line.startswith(prefix):
|
| 30 |
content = line[len(prefix):].strip()
|
| 31 |
chosen_voice = voice_name
|
| 32 |
break
|
| 33 |
-
# escape XML special chars
|
| 34 |
content_escaped = escape(content)
|
| 35 |
-
# add the voice-wrapped content and a short break
|
| 36 |
segments.append(f'<voice name="{chosen_voice}">{content_escaped}<break time="300ms"/></voice>')
|
| 37 |
|
| 38 |
ssml = '<?xml version="1.0" encoding="utf-8"?><speak version="1.0" xml:lang="fa-IR">' + "".join(segments) + '</speak>'
|
| 39 |
return ssml
|
| 40 |
|
| 41 |
-
def process(
|
| 42 |
-
text:
|
| 43 |
-
rate: int,
|
| 44 |
-
pitch: int,
|
| 45 |
-
volume: int,
|
| 46 |
-
) -> str:
|
| 47 |
-
"""
|
| 48 |
-
Convert text to speech supporting dialog lines like:
|
| 49 |
-
مرد: سلام
|
| 50 |
-
زن: خوبی؟
|
| 51 |
-
Produces a single mp3 file with voices switched via SSML <voice> tags.
|
| 52 |
-
"""
|
| 53 |
-
if not text or text.strip() == "":
|
| 54 |
gr.Warning("لطفا متنی وارد کنید.")
|
| 55 |
return ""
|
| 56 |
|
| 57 |
-
# create rate/pitch/volume strings (same logic as original)
|
| 58 |
-
rate_str = f"+{rate}%" if rate >= 0 else f"-{-rate}%"
|
| 59 |
-
pitch_str = f"+{pitch}Hz" if pitch >= 0 else f"-{-pitch}Hz"
|
| 60 |
-
volume_str = f"+{volume}%" if volume >= 0 else f"-{-volume}%"
|
| 61 |
-
|
| 62 |
-
# Build SSML: this will include per-line <voice name="..."> tags
|
| 63 |
-
# default voice if no prefix found is Dilara (female)
|
| 64 |
ssml_text = build_ssml_from_dialog(text, default_voice="fa-IR-DilaraNeural")
|
| 65 |
if not ssml_text:
|
| 66 |
gr.Warning("متن نامعتبر است.")
|
| 67 |
return ""
|
| 68 |
|
| 69 |
-
#
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
| 72 |
first_voice = "fa-IR-DilaraNeural"
|
| 73 |
-
if
|
| 74 |
first_voice = "fa-IR-FaridNeural"
|
| 75 |
|
| 76 |
communicate = edge_tts.Communicate(
|
| 77 |
text=ssml_text,
|
| 78 |
-
rate=rate_str,
|
| 79 |
-
pitch=pitch_str,
|
| 80 |
voice=first_voice,
|
| 81 |
-
|
|
|
|
|
|
|
| 82 |
)
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
|
| 88 |
-
#
|
| 89 |
-
communicate.save_sync(audio_fname=
|
| 90 |
|
| 91 |
-
return
|
| 92 |
|
| 93 |
-
def main()
|
| 94 |
-
os.system(
|
| 95 |
|
| 96 |
-
|
| 97 |
lines=8,
|
| 98 |
rtl=True,
|
| 99 |
-
label="متن (
|
| 100 |
placeholder="مثال:\nمرد: سلام\nزن: سلام، خوبی؟\nمرد: خوبم، ممنون.",
|
| 101 |
)
|
| 102 |
|
| 103 |
-
rate_slider = gr.Slider(
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
value=0,
|
| 107 |
-
minimum=-100,
|
| 108 |
-
maximum=100,
|
| 109 |
-
)
|
| 110 |
-
|
| 111 |
-
pitch_slider = gr.Slider(
|
| 112 |
-
label="فرکانس",
|
| 113 |
-
step=1,
|
| 114 |
-
value=0,
|
| 115 |
-
minimum=-100,
|
| 116 |
-
maximum=100,
|
| 117 |
-
)
|
| 118 |
-
|
| 119 |
-
volume_slider = gr.Slider(
|
| 120 |
-
label="حجم صدا",
|
| 121 |
-
step=1,
|
| 122 |
-
value=0,
|
| 123 |
-
minimum=-100,
|
| 124 |
-
maximum=100,
|
| 125 |
-
)
|
| 126 |
-
|
| 127 |
-
result_audio = gr.Audio(
|
| 128 |
-
label="نتیجه",
|
| 129 |
-
)
|
| 130 |
|
| 131 |
-
|
| 132 |
-
text_textarea,
|
| 133 |
-
rate_slider,
|
| 134 |
-
pitch_slider,
|
| 135 |
-
volume_slider,
|
| 136 |
-
]
|
| 137 |
-
|
| 138 |
-
outputs = [
|
| 139 |
-
result_audio,
|
| 140 |
-
]
|
| 141 |
|
| 142 |
interface = gr.Interface(
|
| 143 |
fn=process,
|
| 144 |
-
inputs=
|
| 145 |
-
outputs=
|
| 146 |
-
flagging_mode="never",
|
| 147 |
title="DT TTS (dialog support)",
|
| 148 |
description="پشتیبانی از خطوط دیالوگ با برچسب 'مرد:' و 'زن:' — هر خط با صدای مربوط تولید میشود.",
|
|
|
|
| 149 |
)
|
| 150 |
|
| 151 |
interface.launch()
|
|
|
|
| 1 |
+
# app.py - fixed version for dialog support with edge-tts SSML
|
| 2 |
import os
|
| 3 |
import tempfile
|
| 4 |
import edge_tts
|
|
|
|
| 7 |
|
| 8 |
def build_ssml_from_dialog(text: str, default_voice: str = "fa-IR-DilaraNeural") -> str:
|
| 9 |
"""
|
| 10 |
+
Parses input line by line, detects 'مرد:' or 'زن:' prefixes,
|
| 11 |
+
and wraps each segment in <voice> tags with a short break.
|
|
|
|
| 12 |
"""
|
| 13 |
+
lines = [l.strip() for l in text.splitlines() if l.strip()]
|
| 14 |
if not lines:
|
| 15 |
return ""
|
| 16 |
|
|
|
|
| 23 |
for line in lines:
|
| 24 |
chosen_voice = default_voice
|
| 25 |
content = line
|
|
|
|
| 26 |
for prefix, voice_name in voice_map.items():
|
| 27 |
if line.startswith(prefix):
|
| 28 |
content = line[len(prefix):].strip()
|
| 29 |
chosen_voice = voice_name
|
| 30 |
break
|
|
|
|
| 31 |
content_escaped = escape(content)
|
|
|
|
| 32 |
segments.append(f'<voice name="{chosen_voice}">{content_escaped}<break time="300ms"/></voice>')
|
| 33 |
|
| 34 |
ssml = '<?xml version="1.0" encoding="utf-8"?><speak version="1.0" xml:lang="fa-IR">' + "".join(segments) + '</speak>'
|
| 35 |
return ssml
|
| 36 |
|
| 37 |
+
def process(text: str, rate: int, pitch: int, volume: int) -> str:
    """Convert dialog text to a single mp3 via edge-tts.

    Lines prefixed with 'مرد:' / 'زن:' are mapped to the male/female
    voice by build_ssml_from_dialog; the result is synthesized once.

    Args:
        text: Input text, one dialog line per text line.
        rate: Speaking-rate offset in percent (expected -100..100 from the slider).
        pitch: Pitch offset in Hz (expected -100..100 from the slider).
        volume: Volume offset in percent (expected -100..100 from the slider).

    Returns:
        Path of the generated mp3 file, or "" when the input is empty or
        produces no SSML (a gr.Warning is shown in those cases).
    """
    if not text.strip():
        gr.Warning("لطفا متنی وارد کنید.")
        return ""

    ssml_text = build_ssml_from_dialog(text, default_voice="fa-IR-DilaraNeural")
    if not ssml_text:
        gr.Warning("متن نامعتبر است.")
        return ""

    # edge-tts requires an explicit sign ("+10%", "-5Hz", "+0%"). The previous
    # f"{rate}%" form produced unsigned strings like "10%" for positive values,
    # which the service rejects; "{:+d}" always emits the sign, including "+0".
    rate_str = f"{rate:+d}%"
    pitch_str = f"{pitch:+d}Hz"
    volume_str = f"{volume:+d}%"

    # Base voice for the request: prefer the male voice if it appears anywhere
    # in the generated SSML, otherwise keep the female default.
    first_voice = "fa-IR-DilaraNeural"
    if "fa-IR-FaridNeural" in ssml_text:
        first_voice = "fa-IR-FaridNeural"

    communicate = edge_tts.Communicate(
        text=ssml_text,
        voice=first_voice,
        rate=rate_str,
        pitch=pitch_str,
        volume=volume_str,
    )

    # delete=False so the file outlives the handle; Gradio serves it by path.
    temp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    temp_name = temp_file.name
    temp_file.close()

    # NOTE(review): Communicate.save_sync() accepts only audio_fname /
    # metadata_fname — passing ssml=True raised TypeError. edge-tts also
    # escapes its input text, so the <voice>/<break> tags in ssml_text are
    # likely spoken literally rather than interpreted; confirm, and consider
    # synthesizing per dialog line with the matching voice instead.
    communicate.save_sync(audio_fname=temp_name)

    return temp_name
| 73 |
|
| 74 |
+
def main():
    """Assemble the Gradio interface for the dialog TTS demo and launch it."""
    # Clear the console on both Windows ("cls") and POSIX ("clear").
    os.system("cls" if os.name == "nt" else "clear")

    # Multi-line RTL input; dialog lines are tagged with 'مرد:' / 'زن:'.
    text_area = gr.TextArea(
        lines=8,
        rtl=True,
        label="متن (از 'مرد:' و 'زن:' برای دیالوگ استفاده کنید)",
        placeholder="مثال:\nمرد: سلام\nزن: سلام، خوبی؟\nمرد: خوبم، ممنون.",
    )

    # All three sliders share the same numeric range and default.
    slider_opts = dict(step=1, value=0, minimum=-100, maximum=100)
    rate_slider = gr.Slider(label="سرعت", **slider_opts)
    pitch_slider = gr.Slider(label="فرکانس", **slider_opts)
    volume_slider = gr.Slider(label="حجم صدا", **slider_opts)

    result_audio = gr.Audio(label="نتیجه")

    interface = gr.Interface(
        fn=process,
        inputs=[text_area, rate_slider, pitch_slider, volume_slider],
        outputs=[result_audio],
        title="DT TTS (dialog support)",
        description="پشتیبانی از خطوط دیالوگ با برچسب 'مرد:' و 'زن:' — هر خط با صدای مربوط تولید میشود.",
        flagging_mode="never",
    )

    interface.launch()
|