Rezfars committed on
Commit
6f371a4
·
verified ·
1 Parent(s): 7f66653

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +33 -85
app.py CHANGED
@@ -1,4 +1,4 @@
1
- # app.py - modified to support dialog lines "مرد:" and "زن:"
2
  import os
3
  import tempfile
4
  import edge_tts
@@ -7,11 +7,10 @@ from xml.sax.saxutils import escape
7
 
8
  def build_ssml_from_dialog(text: str, default_voice: str = "fa-IR-DilaraNeural") -> str:
9
  """
10
- Parses the input text line-by-line. Each line starting with
11
- 'مرد:' or 'زن:' selects the corresponding voice. Other lines use default_voice.
12
- Returns an SSML string that wraps each segment in a <voice> tag and adds short breaks.
13
  """
14
- lines = [l.strip() for l in text.splitlines() if l.strip() != ""]
15
  if not lines:
16
  return ""
17
 
@@ -24,128 +23,77 @@ def build_ssml_from_dialog(text: str, default_voice: str = "fa-IR-DilaraNeural")
24
  for line in lines:
25
  chosen_voice = default_voice
26
  content = line
27
- # check explicit prefix (case-sensitive for Persian labels)
28
  for prefix, voice_name in voice_map.items():
29
  if line.startswith(prefix):
30
  content = line[len(prefix):].strip()
31
  chosen_voice = voice_name
32
  break
33
- # escape XML special chars
34
  content_escaped = escape(content)
35
- # add the voice-wrapped content and a short break
36
  segments.append(f'<voice name="{chosen_voice}">{content_escaped}<break time="300ms"/></voice>')
37
 
38
  ssml = '<?xml version="1.0" encoding="utf-8"?><speak version="1.0" xml:lang="fa-IR">' + "".join(segments) + '</speak>'
39
  return ssml
40
 
41
- def process(
42
- text: str,
43
- rate: int,
44
- pitch: int,
45
- volume: int,
46
- ) -> str:
47
- """
48
- Convert text to speech supporting dialog lines like:
49
- مرد: سلام
50
- زن: خوبی؟
51
- Produces a single mp3 file with voices switched via SSML <voice> tags.
52
- """
53
- if not text or text.strip() == "":
54
  gr.Warning("لطفا متنی وارد کنید.")
55
  return ""
56
 
57
- # create rate/pitch/volume strings (same logic as original)
58
- rate_str = f"+{rate}%" if rate >= 0 else f"-{-rate}%"
59
- pitch_str = f"+{pitch}Hz" if pitch >= 0 else f"-{-pitch}Hz"
60
- volume_str = f"+{volume}%" if volume >= 0 else f"-{-volume}%"
61
-
62
- # Build SSML: this will include per-line <voice name="..."> tags
63
- # default voice if no prefix found is Dilara (female)
64
  ssml_text = build_ssml_from_dialog(text, default_voice="fa-IR-DilaraNeural")
65
  if not ssml_text:
66
  gr.Warning("متن نامعتبر است.")
67
  return ""
68
 
69
- # Use edge-tts Communicate with a default voice (SSML contains explicit voices)
70
- # We'll pass the first voice encountered as the 'voice' parameter to be safe
71
- # (edge-tts requires a voice param); extract first voice from SSML segments
 
 
 
72
  first_voice = "fa-IR-DilaraNeural"
73
- if 'fa-IR-FaridNeural' in ssml_text:
74
  first_voice = "fa-IR-FaridNeural"
75
 
76
  communicate = edge_tts.Communicate(
77
  text=ssml_text,
78
- rate=rate_str,
79
- pitch=pitch_str,
80
  voice=first_voice,
81
- volume=volume_str,
 
 
82
  )
83
 
84
- temp_output_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
85
- temp_output_file_name = temp_output_file.name
86
- temp_output_file.close()
87
 
88
- # save_sync will accept SSML if provided as text
89
- communicate.save_sync(audio_fname=temp_output_file_name)
90
 
91
- return temp_output_file_name
92
 
93
- def main() -> None:
94
- os.system(command="cls" if os.name == "nt" else "clear")
95
 
96
- text_textarea = gr.TextArea(
97
  lines=8,
98
  rtl=True,
99
- label="متن (برای دیالوگ از 'مرد:' و 'زن:' استفاده کنید)",
100
  placeholder="مثال:\nمرد: سلام\nزن: سلام، خوبی؟\nمرد: خوبم، ممنون.",
101
  )
102
 
103
- rate_slider = gr.Slider(
104
- label="سرعت",
105
- step=1,
106
- value=0,
107
- minimum=-100,
108
- maximum=100,
109
- )
110
-
111
- pitch_slider = gr.Slider(
112
- label="فرکانس",
113
- step=1,
114
- value=0,
115
- minimum=-100,
116
- maximum=100,
117
- )
118
-
119
- volume_slider = gr.Slider(
120
- label="حجم صدا",
121
- step=1,
122
- value=0,
123
- minimum=-100,
124
- maximum=100,
125
- )
126
-
127
- result_audio = gr.Audio(
128
- label="نتیجه",
129
- )
130
 
131
- inputs = [
132
- text_textarea,
133
- rate_slider,
134
- pitch_slider,
135
- volume_slider,
136
- ]
137
-
138
- outputs = [
139
- result_audio,
140
- ]
141
 
142
  interface = gr.Interface(
143
  fn=process,
144
- inputs=inputs,
145
- outputs=outputs,
146
- flagging_mode="never",
147
  title="DT TTS (dialog support)",
148
  description="پشتیبانی از خطوط دیالوگ با برچسب 'مرد:' و 'زن:' — هر خط با صدای مربوط تولید می‌شود.",
 
149
  )
150
 
151
  interface.launch()
 
1
+ # app.py - fixed version for dialog support with edge-tts SSML
2
  import os
3
  import tempfile
4
  import edge_tts
 
7
 
8
def build_ssml_from_dialog(text: str, default_voice: str = "fa-IR-DilaraNeural", break_time: str = "300ms") -> str:
    """Build an SSML document from multi-line Persian dialog text.

    Each non-blank line may start with the label 'مرد:' (man) or 'زن:'
    (woman); the matching voice is chosen for that line. Unlabeled lines
    use *default_voice*. Every segment is wrapped in a <voice> tag
    followed by a short <break/>.

    Args:
        text: Raw multi-line dialog text.
        default_voice: Voice name used for lines without a label.
        break_time: SSML time value inserted after each segment
            (default "300ms", matching the original behavior).

    Returns:
        The SSML string, or "" when the input has no non-blank lines.
    """
    # NOTE(review): this prefix->voice map sits in diff context not shown
    # in the commit view; it is reconstructed from the surrounding code
    # (Farid = male voice, Dilara = female/default) — confirm against the
    # deployed file.
    voice_map = {
        "مرد:": "fa-IR-FaridNeural",
        "زن:": "fa-IR-DilaraNeural",
    }

    lines = [stripped for raw in text.splitlines() if (stripped := raw.strip())]
    if not lines:
        return ""

    segments = []
    for line in lines:
        chosen_voice = default_voice
        content = line
        for prefix, voice_name in voice_map.items():
            if line.startswith(prefix):
                content = line[len(prefix):].strip()
                chosen_voice = voice_name
                break
        # Escape XML special characters so user text cannot break the markup.
        content_escaped = escape(content)
        segments.append(
            f'<voice name="{chosen_voice}">{content_escaped}'
            f'<break time="{break_time}"/></voice>'
        )

    # FIX: the SSML 1.0 specification requires the xmlns attribute on
    # <speak>; some synthesizers reject documents without it.
    return (
        '<?xml version="1.0" encoding="utf-8"?>'
        '<speak version="1.0" xmlns="http://www.w3.org/2001/10/synthesis" xml:lang="fa-IR">'
        + "".join(segments)
        + "</speak>"
    )
36
 
37
def process(text: str, rate: int, pitch: int, volume: int) -> str:
    """Convert dialog text to speech and return the path of an mp3 file.

    Lines labeled 'مرد:'/'زن:' are voiced by the corresponding speaker
    via SSML <voice> tags built by build_ssml_from_dialog().

    Args:
        text: Input dialog text.
        rate: Speech rate offset in percent, [-100, 100].
        pitch: Pitch offset in Hz, [-100, 100].
        volume: Volume offset in percent, [-100, 100].

    Returns:
        Path of the generated mp3 file, or "" (after a Gradio warning)
        when the input is empty or invalid.
    """
    if not text or not text.strip():
        gr.Warning("لطفا متنی وارد کنید.")
        return ""

    ssml_text = build_ssml_from_dialog(text, default_voice="fa-IR-DilaraNeural")
    if not ssml_text:
        gr.Warning("متن نامعتبر است.")
        return ""

    # FIX: edge-tts requires explicitly signed values ("+10%", "-5Hz").
    # f"{rate}%" drops the '+' for positive numbers and is rejected by
    # edge-tts' validation; {:+d} always emits the sign.
    rate_str = f"{rate:+d}%"
    pitch_str = f"{pitch:+d}Hz"
    volume_str = f"{volume:+d}%"

    # edge-tts requires a voice parameter even though the SSML carries
    # per-segment voices; prefer Farid if he appears in the document.
    first_voice = "fa-IR-DilaraNeural"
    if "fa-IR-FaridNeural" in ssml_text:
        first_voice = "fa-IR-FaridNeural"

    communicate = edge_tts.Communicate(
        text=ssml_text,
        voice=first_voice,
        rate=rate_str,
        pitch=pitch_str,
        volume=volume_str,
    )

    # delete=False so the file survives close(); Gradio serves it by path.
    temp_file = tempfile.NamedTemporaryFile(suffix=".mp3", delete=False)
    temp_name = temp_file.name
    temp_file.close()

    # FIX: Communicate.save_sync() has no 'ssml' keyword — passing
    # ssml=True raises TypeError at runtime.
    # NOTE(review): edge-tts does not parse caller-supplied SSML; the
    # <voice> tags are escaped and may be read aloud as text. A per-line
    # synthesis-and-concatenate approach is likely needed — confirm
    # against the edge-tts documentation.
    communicate.save_sync(audio_fname=temp_name)

    return temp_name
73
 
74
def main():
    """Clear the console and launch the Gradio TTS interface."""
    # Clear the terminal: 'cls' on Windows, 'clear' on POSIX systems.
    os.system("cls" if os.name == "nt" else "clear")

    dialog_input = gr.TextArea(
        lines=8,
        rtl=True,
        label="متن (از 'مرد:' و 'زن:' برای دیالوگ استفاده کنید)",
        placeholder="مثال:\nمرد: سلام\nزن: سلام، خوبی؟\nمرد: خوبم، ممنون.",
    )

    # All three prosody sliders share the same range; build them uniformly.
    slider_opts = dict(step=1, value=0, minimum=-100, maximum=100)
    speed_slider = gr.Slider(label="سرعت", **slider_opts)
    frequency_slider = gr.Slider(label="فرکانس", **slider_opts)
    loudness_slider = gr.Slider(label="حجم صدا", **slider_opts)

    audio_output = gr.Audio(label="نتیجه")

    gr.Interface(
        fn=process,
        inputs=[dialog_input, speed_slider, frequency_slider, loudness_slider],
        outputs=[audio_output],
        title="DT TTS (dialog support)",
        description="پشتیبانی از خطوط دیالوگ با برچسب 'مرد:' و 'زن:' — هر خط با صدای مربوط تولید می‌شود.",
        flagging_mode="never",
    ).launch()