Update app.py
Browse files
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
# app.py -
|
| 2 |
import os
|
| 3 |
import tempfile
|
| 4 |
import edge_tts
|
|
@@ -7,11 +7,10 @@ from xml.sax.saxutils import escape
|
|
| 7 |
|
| 8 |
def build_ssml_from_dialog(text: str, default_voice: str = "fa-IR-DilaraNeural") -> str:
|
| 9 |
"""
|
| 10 |
-
Parses
|
| 11 |
-
|
| 12 |
-
Returns an SSML string that wraps each segment in a <voice> tag and adds short breaks.
|
| 13 |
"""
|
| 14 |
-
lines = [l.strip() for l in text.splitlines() if l.strip()
|
| 15 |
if not lines:
|
| 16 |
return ""
|
| 17 |
|
|
@@ -24,128 +23,77 @@ def build_ssml_from_dialog(text: str, default_voice: str = "fa-IR-DilaraNeural")
|
|
| 24 |
for line in lines:
|
| 25 |
chosen_voice = default_voice
|
| 26 |
content = line
|
| 27 |
-
# check explicit prefix (case-sensitive for Persian labels)
|
| 28 |
for prefix, voice_name in voice_map.items():
|
| 29 |
if line.startswith(prefix):
|
| 30 |
content = line[len(prefix):].strip()
|
| 31 |
chosen_voice = voice_name
|
| 32 |
break
|
| 33 |
-
# escape XML special chars
|
| 34 |
content_escaped = escape(content)
|
| 35 |
-
# add the voice-wrapped content and a short break
|
| 36 |
segments.append(f'<voice name="{chosen_voice}">{content_escaped}<break time="300ms"/></voice>')
|
| 37 |
|
| 38 |
ssml = '<?xml version="1.0" encoding="utf-8"?><speak version="1.0" xml:lang="fa-IR">' + "".join(segments) + '</speak>'
|
| 39 |
return ssml
|
| 40 |
|
| 41 |
-
def process(
|
| 42 |
-
text:
|
| 43 |
-
rate: int,
|
| 44 |
-
pitch: int,
|
| 45 |
-
volume: int,
|
| 46 |
-
) -> str:
|
| 47 |
-
"""
|
| 48 |
-
Convert text to speech supporting dialog lines like:
|
| 49 |
-
مرد: سلام
|
| 50 |
-
زن: خوبی؟
|
| 51 |
-
Produces a single mp3 file with voices switched via SSML <voice> tags.
|
| 52 |
-
"""
|
| 53 |
-
if not text or text.strip() == "":
|
| 54 |
gr.Warning("لطفا متنی وارد کنید.")
|
| 55 |
return ""
|
| 56 |
|
| 57 |
-
# create rate/pitch/volume strings (same logic as original)
|
| 58 |
-
rate_str = f"+{rate}%" if rate >= 0 else f"-{-rate}%"
|
| 59 |
-
pitch_str = f"+{pitch}Hz" if pitch >= 0 else f"-{-pitch}Hz"
|
| 60 |
-
volume_str = f"+{volume}%" if volume >= 0 else f"-{-volume}%"
|
| 61 |
-
|
| 62 |
-
# Build SSML: this will include per-line <voice name="..."> tags
|
| 63 |
-
# default voice if no prefix found is Dilara (female)
|
| 64 |
ssml_text = build_ssml_from_dialog(text, default_voice="fa-IR-DilaraNeural")
|
| 65 |
if not ssml_text:
|
| 66 |
gr.Warning("متن نامعتبر است.")
|
| 67 |
return ""
|
| 68 |
|
| 69 |
-
#
|
| 70 |
-
|
| 71 |
-
|
|
|
|
|
|
|
|
|
|
| 72 |
first_voice = "fa-IR-DilaraNeural"
|
| 73 |
-
if
|
| 74 |
first_voice = "fa-IR-FaridNeural"
|
| 75 |
|
| 76 |
communicate = edge_tts.Communicate(
|
| 77 |
text=ssml_text,
|
| 78 |
-
rate=rate_str,
|
| 79 |
-
pitch=pitch_str,
|
| 80 |
voice=first_voice,
|
| 81 |
-
|
|
|
|
|
|
|
| 82 |
)
|
| 83 |
|
| 84 |
-
|
| 85 |
-
|
| 86 |
-
|
| 87 |
|
| 88 |
-
#
|
| 89 |
-
communicate.save_sync(audio_fname=
|
| 90 |
|
| 91 |
-
return
|
| 92 |
|
| 93 |
-
def main()
|
| 94 |
-
os.system(
|
| 95 |
|
| 96 |
-
|
| 97 |
lines=8,
|
| 98 |
rtl=True,
|
| 99 |
-
label="متن (
|
| 100 |
placeholder="مثال:\nمرد: سلام\nزن: سلام، خوبی؟\nمرد: خوبم، ممنون.",
|
| 101 |
)
|
| 102 |
|
| 103 |
-
rate_slider = gr.Slider(
|
| 104 |
-
|
| 105 |
-
|
| 106 |
-
value=0,
|
| 107 |
-
minimum=-100,
|
| 108 |
-
maximum=100,
|
| 109 |
-
)
|
| 110 |
-
|
| 111 |
-
pitch_slider = gr.Slider(
|
| 112 |
-
label="فرکانس",
|
| 113 |
-
step=1,
|
| 114 |
-
value=0,
|
| 115 |
-
minimum=-100,
|
| 116 |
-
maximum=100,
|
| 117 |
-
)
|
| 118 |
-
|
| 119 |
-
volume_slider = gr.Slider(
|
| 120 |
-
label="حجم صدا",
|
| 121 |
-
step=1,
|
| 122 |
-
value=0,
|
| 123 |
-
minimum=-100,
|
| 124 |
-
maximum=100,
|
| 125 |
-
)
|
| 126 |
-
|
| 127 |
-
result_audio = gr.Audio(
|
| 128 |
-
label="نتیجه",
|
| 129 |
-
)
|
| 130 |
|
| 131 |
-
|
| 132 |
-
text_textarea,
|
| 133 |
-
rate_slider,
|
| 134 |
-
pitch_slider,
|
| 135 |
-
volume_slider,
|
| 136 |
-
]
|
| 137 |
-
|
| 138 |
-
outputs = [
|
| 139 |
-
result_audio,
|
| 140 |
-
]
|
| 141 |
|
| 142 |
interface = gr.Interface(
|
| 143 |
fn=process,
|
| 144 |
-
inputs=
|
| 145 |
-
outputs=
|
| 146 |
-
flagging_mode="never",
|
| 147 |
title="DT TTS (dialog support)",
|
| 148 |
description="پشتیبانی از خطوط دیالوگ با برچسب 'مرد:' و 'زن:' — هر خط با صدای مربوط تولید میشود.",
|
|
|
|
| 149 |
)
|
| 150 |
|
| 151 |
interface.launch()
|
|
|
|
| 1 |
+
# app.py - fixed version for dialog support with edge-tts SSML
|
| 2 |
import os
|
| 3 |
import tempfile
|
| 4 |
import edge_tts
|
|
|
|
| 7 |
|
| 8 |
def build_ssml_from_dialog(text: str, default_voice: str = "fa-IR-DilaraNeural") -> str:
|
| 9 |
"""
|
| 10 |
+
Parses input line by line, detects 'مرد:' or 'زن:' prefixes,
|
| 11 |
+
and wraps each segment in <voice> tags with a short break.
|
|
|
|
| 12 |
"""
|
| 13 |
+
lines = [l.strip() for l in text.splitlines() if l.strip()]
|
| 14 |
if not lines:
|
| 15 |
return ""
|
| 16 |
|
|
|
|
| 23 |
for line in lines:
|
| 24 |
chosen_voice = default_voice
|
| 25 |
content = line
|
|
|
|
| 26 |
for prefix, voice_name in voice_map.items():
|
| 27 |
if line.startswith(prefix):
|
| 28 |
content = line[len(prefix):].strip()
|
| 29 |
chosen_voice = voice_name
|
| 30 |
break
|
|
|
|
| 31 |
content_escaped = escape(content)
|
|
|
|
| 32 |
segments.append(f'<voice name="{chosen_voice}">{content_escaped}<break time="300ms"/></voice>')
|
| 33 |
|
| 34 |
ssml = '<?xml version="1.0" encoding="utf-8"?><speak version="1.0" xml:lang="fa-IR">' + "".join(segments) + '</speak>'
|
| 35 |
return ssml
|
| 36 |
|
| 37 |
+
def process(text: str, rate: int, pitch: int, volume: int) -> str:
    """Convert dialog text to a single mp3 via edge-tts.

    Lines prefixed with 'مرد:' / 'زن:' are mapped to the male/female
    voice by build_ssml_from_dialog; the result is synthesized once.

    Args:
        text: Input text, one dialog line per text line.
        rate: Speaking-rate offset in percent (expected -100..100 from the slider).
        pitch: Pitch offset in Hz (expected -100..100 from the slider).
        volume: Volume offset in percent (expected -100..100 from the slider).

    Returns:
        Path of the generated mp3 file, or "" when the input is empty or
        produces no SSML (a gr.Warning is shown in those cases).
    """
    if not text.strip():
        gr.Warning("لطفا متنی وارد کنید.")
        return ""

    ssml_text = build_ssml_from_dialog(text, default_voice="fa-IR-DilaraNeural")
    if not ssml_text:
        gr.Warning("متن نامعتبر است.")
        return ""

    # edge-tts requires an explicit sign ("+10%", "-5Hz", "+0%"). The previous
    # f"{rate}%" form produced unsigned strings like "10%" for positive values,
    # which the service rejects; "{:+d}" always emits the sign, including "+0".
    rate_str = f"{rate:+d}%"
    pitch_str = f"{pitch:+d}Hz"
    volume_str = f"{volume:+d}%"

    # Base voice for the request: prefer the male voice if it appears anywhere
    # in the generated SSML, otherwise keep the female default.
    first_voice = "fa-IR-DilaraNeural"
    if "fa-IR-FaridNeural" in ssml_text:
        first_voice = "fa-IR-FaridNeural"

    communicate = edge_tts.Communicate(
        text=ssml_text,
        voice=first_voice,
        rate=rate_str,
        pitch=pitch_str,
        volume=volume_str,
    )

    # delete=False so the file outlives the handle; Gradio serves it by path.
    temp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    temp_name = temp_file.name
    temp_file.close()

    # NOTE(review): Communicate.save_sync() accepts only audio_fname /
    # metadata_fname — passing ssml=True raised TypeError. edge-tts also
    # escapes its input text, so the <voice>/<break> tags in ssml_text are
    # likely spoken literally rather than interpreted; confirm, and consider
    # synthesizing per dialog line with the matching voice instead.
    communicate.save_sync(audio_fname=temp_name)

    return temp_name
| 73 |
|
| 74 |
+
def main():
    """Assemble the Gradio interface for the dialog TTS demo and launch it."""
    # Clear the console on both Windows ("cls") and POSIX ("clear").
    os.system("cls" if os.name == "nt" else "clear")

    # Multi-line RTL input; dialog lines are tagged with 'مرد:' / 'زن:'.
    text_area = gr.TextArea(
        lines=8,
        rtl=True,
        label="متن (از 'مرد:' و 'زن:' برای دیالوگ استفاده کنید)",
        placeholder="مثال:\nمرد: سلام\nزن: سلام، خوبی؟\nمرد: خوبم، ممنون.",
    )

    # All three sliders share the same numeric range and default.
    slider_opts = dict(step=1, value=0, minimum=-100, maximum=100)
    rate_slider = gr.Slider(label="سرعت", **slider_opts)
    pitch_slider = gr.Slider(label="فرکانس", **slider_opts)
    volume_slider = gr.Slider(label="حجم صدا", **slider_opts)

    result_audio = gr.Audio(label="نتیجه")

    interface = gr.Interface(
        fn=process,
        inputs=[text_area, rate_slider, pitch_slider, volume_slider],
        outputs=[result_audio],
        title="DT TTS (dialog support)",
        description="پشتیبانی از خطوط دیالوگ با برچسب 'مرد:' و 'زن:' — هر خط با صدای مربوط تولید میشود.",
        flagging_mode="never",
    )

    interface.launch()
|