Minte committed on
Commit
943a8da
·
1 Parent(s): 6d28d4b
Files changed (2) hide show
  1. app.py +63 -68
  2. requirements.txt +1 -3
app.py CHANGED
@@ -8,6 +8,7 @@ import io
8
  import soundfile as sf
9
  from datetime import datetime
10
  import os
 
11
 
12
  # Model configuration for each language
13
  MODELS = {
@@ -16,18 +17,7 @@ MODELS = {
16
  "Swahili": "facebook/mms-tts-swh",
17
  "Afan Oromo": "facebook/mms-tts-orm",
18
  "Tigrinya": "facebook/mms-tts-tir",
19
- # Note: Chichewa doesn't have a dedicated MMS-TTS model, using Swahili as fallback
20
- "Chichewa": "facebook/mms-tts-swh"
21
- }
22
-
23
- # Language codes for phonemizer
24
- LANGUAGE_CODES = {
25
- "Amharic": "am",
26
- "Somali": "so",
27
- "Swahili": "sw",
28
- "Afan Oromo": "om",
29
- "Tigrinya": "ti",
30
- "Chichewa": "ny" # Chichewa language code
31
  }
32
 
33
  class MMS_TTS_Service:
@@ -128,27 +118,12 @@ def text_to_speech(text, language, speed=1.0):
128
  return None, error
129
 
130
  sample_rate, waveform = result
131
- return (sample_rate, waveform), "✅ Speech generated successfully!"
132
-
133
- def batch_tts(text_list, language, speed=1.0):
134
- """
135
- Batch processing multiple texts
136
- """
137
- results = []
138
- errors = []
139
 
140
- for i, text in enumerate(text_list):
141
- if text.strip():
142
- result, error = tts_service.generate_speech(text.strip(), language, speed)
143
- if error:
144
- errors.append(f"Text {i+1}: {error}")
145
- else:
146
- results.append((f"output_{i+1}.wav", result[0], result[1]))
147
-
148
- return results, errors
149
 
150
  def create_demo_audio(language):
151
- """Create demo audio for each language"""
152
  demo_texts = {
153
  "Amharic": "αˆ°αˆ‹αˆα£ α‹­αˆ… α‹¨α‹΅αˆα… αˆ›αˆ˜αŠ•αŒ« αˆžα‹΄αˆ αŠα‹α’",
154
  "Somali": "Salaam, kani waa modelka cod-sameynta.",
@@ -158,8 +133,7 @@ def create_demo_audio(language):
158
  "Chichewa": "Moni, iyi ndi modeli yopanga mawu."
159
  }
160
 
161
- demo_text = demo_texts.get(language, "Hello, this is a text-to-speech model.")
162
- return demo_text
163
 
164
  # Gradio interface
165
  with gr.Blocks(theme=gr.themes.Soft(), title="MMS Text-to-Speech") as demo:
@@ -216,22 +190,27 @@ with gr.Blocks(theme=gr.themes.Soft(), title="MMS Text-to-Speech") as demo:
216
  interactive=False,
217
  placeholder="Ready to generate speech..."
218
  )
219
-
220
- # Batch processing section
221
- gr.Markdown("### 📚 Batch Processing")
222
- batch_text = gr.Textbox(
223
- lines=4,
224
- placeholder="Enter multiple texts, one per line...",
225
- label="Batch Texts",
226
- info="Each line will be processed separately"
227
- )
228
- batch_btn = gr.Button("Process Batch")
229
- batch_output = gr.File(
230
- label="Batch Results",
231
- file_count="multiple",
232
- type="file"
233
- )
234
- batch_status = gr.Textbox(label="Batch Status")
 
 
 
 
 
235
 
236
  # Event handlers
237
  def generate_speech_handler(text, lang, spd):
@@ -240,33 +219,48 @@ with gr.Blocks(theme=gr.themes.Soft(), title="MMS Text-to-Speech") as demo:
240
  return text_to_speech(text, lang, spd)
241
 
242
  def clear_all():
243
- return "", "", None, "Cleared!"
244
 
245
  def load_demo(lang):
246
  return create_demo_audio(lang)
247
 
248
  def process_batch(texts, lang, spd):
 
249
  if not texts.strip():
250
- return [], "No texts provided."
251
 
252
  text_list = [t.strip() for t in texts.split('\n') if t.strip()]
253
- if len(text_list) > 10:
254
- return [], "Maximum 10 texts allowed for batch processing."
 
 
 
 
 
255
 
256
- results, errors = batch_tts(text_list, lang, spd)
 
257
 
258
- # Save results to files
259
- output_files = []
260
- for i, (filename, sample_rate, waveform) in enumerate(results):
261
- temp_file = f"/tmp/{filename}"
262
- sf.write(temp_file, waveform, sample_rate)
263
- output_files.append(temp_file)
 
 
 
 
 
264
 
265
- status_msg = f"Processed {len(results)} texts successfully."
266
- if errors:
267
- status_msg += f" Errors: {len(errors)}"
 
268
 
269
- return output_files, status_msg
 
 
270
 
271
  # Connect events
272
  generate_btn.click(
@@ -277,7 +271,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="MMS Text-to-Speech") as demo:
277
 
278
  clear_btn.click(
279
  fn=clear_all,
280
- outputs=[text_input, demo_output, audio_output, status]
281
  )
282
 
283
  demo_btn.click(
@@ -289,7 +283,7 @@ with gr.Blocks(theme=gr.themes.Soft(), title="MMS Text-to-Speech") as demo:
289
  batch_btn.click(
290
  fn=process_batch,
291
  inputs=[batch_text, language, speed],
292
- outputs=[batch_output, batch_status]
293
  )
294
 
295
  # Examples
@@ -299,7 +293,8 @@ with gr.Blocks(theme=gr.themes.Soft(), title="MMS Text-to-Speech") as demo:
299
  ["Somali", "Qof walba wuxuu leeyahay xuquuqda aadamaha."],
300
  ["Swahili", "Kila mtu ana haki zote za binadamu."],
301
  ["Afan Oromo", "Nama hundi mirga ummataa hundaa waliin dhalate."],
302
- ["Tigrinya", "αŠ©αˆ‰ ሰα‰₯ αŠ•αŠ©αˆ‰ αˆ˜αˆ°αˆ‹α‰΅ αŠ₯ኩል αŠ₯ዩፒ"]
 
303
  ]
304
 
305
  gr.Examples(
@@ -318,9 +313,9 @@ with gr.Blocks(theme=gr.themes.Soft(), title="MMS Text-to-Speech") as demo:
318
  **Powered by:** Facebook MMS-TTS Models
319
  **Supported Languages:** Amharic, Somali, Swahili, Afan Oromo, Tigrinya, Chichewa
320
  **Model Type:** Text-to-Speech
321
- **Max Text Length:** 500 characters
322
 
323
- For issues or questions, please check the model cards on Hugging Face.
324
  """
325
  )
326
 
 
8
  import soundfile as sf
9
  from datetime import datetime
10
  import os
11
+ import tempfile
12
 
13
  # Model configuration for each language
14
  MODELS = {
 
17
  "Swahili": "facebook/mms-tts-swh",
18
  "Afan Oromo": "facebook/mms-tts-orm",
19
  "Tigrinya": "facebook/mms-tts-tir",
20
+ "Chichewa": "facebook/mms-tts-swh" # Using Swahili as fallback
 
 
 
 
 
 
 
 
 
 
 
21
  }
22
 
23
  class MMS_TTS_Service:
 
118
  return None, error
119
 
120
  sample_rate, waveform = result
 
 
 
 
 
 
 
 
121
 
122
+ # Return as (sample_rate, audio_array) for gr.Audio
123
+ return (sample_rate, waveform), "✅ Speech generated successfully!"
 
 
 
 
 
 
 
124
 
125
  def create_demo_audio(language):
126
+ """Create demo text for each language"""
127
  demo_texts = {
128
  "Amharic": "αˆ°αˆ‹αˆα£ α‹­αˆ… α‹¨α‹΅αˆα… αˆ›αˆ˜αŠ•αŒ« αˆžα‹΄αˆ αŠα‹α’",
129
  "Somali": "Salaam, kani waa modelka cod-sameynta.",
 
133
  "Chichewa": "Moni, iyi ndi modeli yopanga mawu."
134
  }
135
 
136
+ return demo_texts.get(language, "Hello, this is a text-to-speech model.")
 
137
 
138
  # Gradio interface
139
  with gr.Blocks(theme=gr.themes.Soft(), title="MMS Text-to-Speech") as demo:
 
190
  interactive=False,
191
  placeholder="Ready to generate speech..."
192
  )
193
+
194
+ # Batch processing section (simplified)
195
+ with gr.Accordion("📚 Batch Processing (Advanced)", open=False):
196
+ gr.Markdown("Process multiple texts at once. Each line will be converted to a separate audio file.")
197
+
198
+ batch_text = gr.Textbox(
199
+ lines=4,
200
+ placeholder="Enter multiple texts, one per line...\nExample:\nHello\nHow are you?\nThank you",
201
+ label="Batch Texts",
202
+ info="Maximum 5 texts, each under 200 characters"
203
+ )
204
+
205
+ batch_btn = gr.Button("Process Batch Texts")
206
+ batch_status = gr.Textbox(label="Batch Processing Status")
207
+
208
+ # We'll use a gallery or multiple audio outputs for batch results
209
+ batch_results = gr.Gallery(
210
+ label="Batch Results",
211
+ show_label=True,
212
+ columns=2
213
+ )
214
 
215
  # Event handlers
216
  def generate_speech_handler(text, lang, spd):
 
219
  return text_to_speech(text, lang, spd)
220
 
221
  def clear_all():
222
+ return "", "", None, "Cleared!", "", None
223
 
224
  def load_demo(lang):
225
  return create_demo_audio(lang)
226
 
227
  def process_batch(texts, lang, spd):
228
+ """Process multiple texts and return file paths"""
229
  if not texts.strip():
230
+ return None, "No texts provided.", []
231
 
232
  text_list = [t.strip() for t in texts.split('\n') if t.strip()]
233
+ if len(text_list) > 5:
234
+ return None, "Maximum 5 texts allowed for batch processing.", []
235
+
236
+ # Validate each text
237
+ for i, text in enumerate(text_list):
238
+ if len(text) > 200:
239
+ return None, f"Text {i+1} is too long (max 200 characters).", []
240
 
241
+ results = []
242
+ error_count = 0
243
 
244
+ for i, text in enumerate(text_list):
245
+ result, error = tts_service.generate_speech(text, lang, spd)
246
+ if error:
247
+ error_count += 1
248
+ print(f"Error processing text {i+1}: {error}")
249
+ else:
250
+ sample_rate, waveform = result
251
+ # Create temporary file
252
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
253
+ sf.write(f.name, waveform, sample_rate)
254
+ results.append(f.name)
255
 
256
+ if error_count > 0:
257
+ status_msg = f"Processed {len(results)}/{len(text_list)} texts. {error_count} failed."
258
+ else:
259
+ status_msg = f"Successfully processed all {len(text_list)} texts!"
260
 
261
+ # Return first result as preview and all as files
262
+ preview_audio = (results[0] if results else None)
263
+ return preview_audio, status_msg, results
264
 
265
  # Connect events
266
  generate_btn.click(
 
271
 
272
  clear_btn.click(
273
  fn=clear_all,
274
+ outputs=[text_input, demo_output, audio_output, status, batch_text, batch_results]
275
  )
276
 
277
  demo_btn.click(
 
283
  batch_btn.click(
284
  fn=process_batch,
285
  inputs=[batch_text, language, speed],
286
+ outputs=[audio_output, batch_status, batch_results]
287
  )
288
 
289
  # Examples
 
293
  ["Somali", "Qof walba wuxuu leeyahay xuquuqda aadamaha."],
294
  ["Swahili", "Kila mtu ana haki zote za binadamu."],
295
  ["Afan Oromo", "Nama hundi mirga ummataa hundaa waliin dhalate."],
296
+ ["Tigrinya", "αŠ©αˆ‰ ሰα‰₯ αŠ•αŠ©αˆ‰ αˆ˜αˆ°αˆ‹α‰΅ αŠ₯ኩል αŠ₯ዩፒ"],
297
+ ["Chichewa", "Alipo wina aliyense ali ndi ufulu wachibadwidwe."]
298
  ]
299
 
300
  gr.Examples(
 
313
  **Powered by:** Facebook MMS-TTS Models
314
  **Supported Languages:** Amharic, Somali, Swahili, Afan Oromo, Tigrinya, Chichewa
315
  **Model Type:** Text-to-Speech
316
+ **Max Text Length:** 500 characters (single), 200 characters (batch)
317
 
318
+ Note: First request may take longer as models are downloaded.
319
  """
320
  )
321
 
requirements.txt CHANGED
@@ -4,6 +4,4 @@ torchaudio>=2.0.0
4
  transformers>=4.30.0
5
  gradio>=4.0.0
6
  numpy>=1.21.0
7
- librosa>=0.10.0
8
- soundfile>=0.12.0
9
- phonemizer>=3.0.0
 
4
  transformers>=4.30.0
5
  gradio>=4.0.0
6
  numpy>=1.21.0
7
+ soundfile>=0.12.0