Minte committed on
Commit
6d28d4b
·
1 Parent(s): d5fb354

tts space

Browse files
Files changed (3) hide show
  1. README.md +34 -0
  2. app.py +344 -0
  3. requirements.txt +9 -0
README.md CHANGED
@@ -12,3 +12,37 @@ short_description: This space used for text to speech
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
12
  ---
13
 
14
  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
15
+
16
+ Gradio web interface for Facebook's MMS-TTS models supporting multiple African languages.
17
+
18
+ ## 🌍 Supported Languages
19
+
20
+ - **Amharic** (`facebook/mms-tts-amh`)
21
+ - **Somali** (`facebook/mms-tts-som`)
22
+ - **Swahili** (`facebook/mms-tts-swh`)
23
+ - **Afan Oromo** (`facebook/mms-tts-orm`)
24
+ - **Tigrinya** (`facebook/mms-tts-tir`)
25
+ - **Chichewa** (using Swahili model as fallback)
26
+
27
+ ## 🚀 Features
28
+
29
+ - Real-time text-to-speech conversion
30
+ - Adjustable speech speed
31
+ - Batch processing for multiple texts
32
+ - Demo texts for each language
33
+ - Mobile-friendly interface
34
+
35
+ ## 💻 Usage
36
+
37
+ 1. Select your target language
38
+ 2. Enter text (up to 500 characters)
39
+ 3. Adjust speed if desired
40
+ 4. Click "Generate Speech"
41
+ 5. Download or play the generated audio
42
+
43
+ ## 🔧 Technical Details
44
+
45
+ - Built with Gradio for easy web interface
46
+ - Uses Facebook's MMS-TTS transformer models
47
+ - Supports GPU acceleration when available
48
+ - Automatic model loading and caching
app.py ADDED
@@ -0,0 +1,344 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # app.py
2
+ import gradio as gr
3
+ import torch
4
+ import torchaudio
5
+ from transformers import VitsModel, AutoTokenizer
6
+ import numpy as np
7
+ import io
8
+ import soundfile as sf
9
+ from datetime import datetime
10
+ import os
11
+
12
# Hugging Face checkpoint used for each language offered in the UI.
MODELS = {
    "Amharic": "facebook/mms-tts-amh",
    "Somali": "facebook/mms-tts-som",
    "Swahili": "facebook/mms-tts-swh",
    "Afan Oromo": "facebook/mms-tts-orm",
    "Tigrinya": "facebook/mms-tts-tir",
    # Note: Chichewa doesn't have a dedicated MMS-TTS model, using Swahili as fallback
    "Chichewa": "facebook/mms-tts-swh",
}

# Two-letter language codes, originally labelled "for phonemizer".
# NOTE(review): nothing in the visible code reads this mapping; confirm it is
# still needed before relying on it.
LANGUAGE_CODES = {
    "Amharic": "am",
    "Somali": "so",
    "Swahili": "sw",
    "Afan Oromo": "om",
    "Tigrinya": "ti",
    "Chichewa": "ny",  # Chichewa language code
}
32
+
33
class MMS_TTS_Service:
    """Load MMS-TTS (VITS) checkpoints on demand and synthesize speech with them.

    Models and tokenizers are cached per language name (the keys of ``MODELS``)
    so each checkpoint is loaded at most once per process.
    """

    def __init__(self):
        # Both caches are keyed by the language name used in MODELS.
        self.models = {}
        self.tokenizers = {}
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"Using device: {self.device}")

    def load_model(self, language):
        """Return ``(model, tokenizer)`` for *language*, loading them if needed.

        Raises:
            Exception: whatever the lookup in ``MODELS`` or the
                ``from_pretrained`` calls raise; the error is logged and
                re-raised unchanged. Nothing is cached on failure.
        """
        if language in self.models:
            return self.models[language], self.tokenizers[language]

        try:
            model_name = MODELS[language]
            print(f"Loading model for {language}: {model_name}")

            tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = VitsModel.from_pretrained(model_name)
            model = model.to(self.device)
            model.eval()
        except Exception as e:
            print(f"❌ Error loading model for {language}: {e}")
            # Bare raise keeps the original traceback intact.
            raise

        # Cache only after both objects loaded successfully.
        self.models[language] = model
        self.tokenizers[language] = tokenizer

        print(f"✅ Successfully loaded model for {language}")
        return model, tokenizer

    def generate_speech(self, text, language, speed=1.0):
        """Synthesize *text* in *language*.

        Returns:
            ``((sample_rate, waveform), None)`` on success, where ``waveform``
            is a NumPy array, or ``(None, error_message)`` if anything fails.
            This method never raises; failures are reported via the message.
        """
        try:
            model, tokenizer = self.load_model(language)

            inputs = tokenizer(text, return_tensors="pt")
            input_ids = inputs["input_ids"].to(self.device)

            # Inference only: no gradients needed.
            with torch.no_grad():
                outputs = model(input_ids)
                waveform = outputs.waveform[0].cpu().numpy()
                sample_rate = model.config.sampling_rate

            if speed != 1.0:
                waveform = self.adjust_speed(waveform, sample_rate, speed)

            return (sample_rate, waveform), None

        except Exception as e:
            error_msg = f"Error generating speech: {str(e)}"
            print(error_msg)
            return None, error_msg

    def adjust_speed(self, waveform, sample_rate, speed_factor):
        """Change playback speed by linearly resampling *waveform*.

        The clip is stretched or squeezed to ``len(waveform) / speed_factor``
        samples while the sample rate stays the same, so pitch shifts together
        with tempo. ``sample_rate`` is accepted for interface compatibility
        and is not used.

        Best-effort: if the factor is not positive, the clip is empty, or the
        input cannot be resampled, the original *waveform* is returned
        unchanged instead of raising.
        """
        if speed_factor == 1.0:
            return waveform

        try:
            samples = np.asarray(waveform)
            if speed_factor <= 0 or len(samples) == 0:
                return waveform

            # Never shrink to zero samples: an empty clip cannot be played.
            new_length = max(1, int(len(samples) / speed_factor))
            positions = np.linspace(0, len(samples) - 1, new_length)
            resampled = np.interp(positions, np.arange(len(samples)), samples)
        except (TypeError, ValueError) as exc:
            print(f"Speed adjustment skipped: {exc}")
            return waveform

        # np.interp always yields float64; keep the caller's float precision.
        if np.issubdtype(samples.dtype, np.floating):
            resampled = resampled.astype(samples.dtype, copy=False)
        return resampled

    def get_available_languages(self):
        """Return the language names that have a configured model."""
        return list(MODELS)
108
+
109
# Module-level service instance shared by the handlers below.
# Constructing it loads no model; models are loaded on first use.
tts_service = MMS_TTS_Service()
111
+
112
def text_to_speech(text, language, speed=1.0):
    """Validate *text* and synthesize it; main entry point for the Gradio UI.

    Args:
        text: Text to speak. Empty, whitespace-only or ``None`` is rejected,
            as is anything longer than 500 characters.
        language: Language name passed through to the TTS service.
        speed: Playback speed factor passed through to the TTS service.

    Returns:
        ``((sample_rate, waveform), status_message)`` on success, or
        ``(None, message)`` when validation or synthesis fails.
    """
    if not text or not text.strip():
        return None, "Please enter some text to convert to speech."

    if len(text) > 500:
        return None, "Text too long. Please keep it under 500 characters."

    # Only show an ellipsis when the logged preview is actually truncated.
    preview = text[:50] + ("..." if len(text) > 50 else "")
    print(f"Generating speech for: '{preview}' in {language}")

    result, error = tts_service.generate_speech(text, language, speed)
    if error:
        return None, error

    return result, "✅ Speech generated successfully!"
132
+
133
def batch_tts(text_list, language, speed=1.0):
    """Synthesize every non-blank entry of *text_list*.

    Args:
        text_list: Iterable of texts; blank or ``None`` entries are skipped
            but still count towards the 1-based numbering.
        language: Language name passed through to the TTS service.
        speed: Playback speed factor passed through to the TTS service.

    Returns:
        ``(results, errors)`` where ``results`` is a list of
        ``(filename, sample_rate, waveform)`` tuples named ``output_<n>.wav``
        and ``errors`` is a list of ``"Text <n>: <message>"`` strings.
    """
    results = []
    errors = []

    for position, raw_text in enumerate(text_list, start=1):
        text = (raw_text or "").strip()
        if not text:
            continue

        result, error = tts_service.generate_speech(text, language, speed)
        if error:
            errors.append(f"Text {position}: {error}")
        else:
            sample_rate, waveform = result
            results.append((f"output_{position}.wav", sample_rate, waveform))

    return results, errors
149
+
150
def create_demo_audio(language):
    """Return a short demo sentence for *language*.

    Despite the name, no audio is produced: the function only returns text
    for the UI to display. Languages without an entry get an English
    fallback sentence.
    """
    demo_texts = {
        "Amharic": "ሰላም፣ ይህ የድምፅ ማመንጫ ሞዴል ነው።",
        "Somali": "Salaam, kani waa modelka cod-sameynta.",
        "Swahili": "Halo, hii ni modeli ya kutengeneza sauti.",
        "Afan Oromo": "Akkam, kun modeli sagalee uumuudha.",
        "Tigrinya": "ሰላም፣ እዚ ድምጺ ዝገብር ሞዴል እዩ።",
        "Chichewa": "Moni, iyi ndi modeli yopanga mawu.",
    }

    return demo_texts.get(language, "Hello, this is a text-to-speech model.")
163
+
164
# Gradio interface: single-text synthesis on the left, audio/status and batch
# processing on the right, followed by clickable examples and a footer.
with gr.Blocks(theme=gr.themes.Soft(), title="MMS Text-to-Speech") as demo:
    gr.Markdown(
        """
        # 🎙️ MMS Text-to-Speech for African Languages
        Convert text to natural speech in multiple African languages using Facebook's MMS-TTS models.
        """
    )

    with gr.Row():
        with gr.Column():
            language = gr.Dropdown(
                choices=tts_service.get_available_languages(),
                value="Amharic",
                label="Select Language",
                info="Choose the language for speech generation",
            )

            text_input = gr.Textbox(
                lines=3,
                placeholder="Enter text to convert to speech...",
                label="Input Text",
                info="Maximum 500 characters",
            )

            speed = gr.Slider(
                minimum=0.5,
                maximum=2.0,
                value=1.0,
                step=0.1,
                label="Speech Speed",
                info="Adjust the playback speed",
            )

            with gr.Row():
                generate_btn = gr.Button("Generate Speech", variant="primary")
                clear_btn = gr.Button("Clear")

            # Demo section
            gr.Markdown("### 🎯 Quick Demo")
            demo_btn = gr.Button("Load Demo Text")
            demo_output = gr.Textbox(label="Demo Text", interactive=False)

        with gr.Column():
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy",
                interactive=False,
            )

            status = gr.Textbox(
                label="Status",
                interactive=False,
                placeholder="Ready to generate speech...",
            )

            # Batch processing section
            gr.Markdown("### 📚 Batch Processing")
            batch_text = gr.Textbox(
                lines=4,
                placeholder="Enter multiple texts, one per line...",
                label="Batch Texts",
                info="Each line will be processed separately",
            )
            batch_btn = gr.Button("Process Batch")
            # Gradio 4+ accepts only "filepath" or "binary" here; the former
            # "file" value raises at construction time.
            batch_output = gr.File(
                label="Batch Results",
                file_count="multiple",
                type="filepath",
            )
            batch_status = gr.Textbox(label="Batch Status")

    # Event handlers
    def generate_speech_handler(text, lang, spd):
        """Synthesize the input box contents; returns (audio, status)."""
        if not text or not text.strip():
            return None, "Please enter some text."
        return text_to_speech(text, lang, spd)

    def clear_all():
        """Reset the text box, demo text, audio player and status line."""
        return "", "", None, "Cleared!"

    def load_demo(lang):
        """Return the demo sentence for the selected language."""
        return create_demo_audio(lang)

    def process_batch(texts, lang, spd):
        """Synthesize one clip per non-empty line; returns (file paths, status)."""
        # Local import: tempfile is not among the file's top-level imports.
        import tempfile

        if not texts or not texts.strip():
            return [], "No texts provided."

        text_list = [line.strip() for line in texts.split("\n") if line.strip()]
        if len(text_list) > 10:
            return [], "Maximum 10 texts allowed for batch processing."

        results, errors = batch_tts(text_list, lang, spd)

        # A fresh directory per request keeps concurrent sessions from
        # overwriting each other's output_<n>.wav files.
        output_dir = tempfile.mkdtemp(prefix="mms_tts_batch_")
        output_files = []
        for filename, sample_rate, waveform in results:
            output_path = os.path.join(output_dir, filename)
            sf.write(output_path, waveform, sample_rate)
            output_files.append(output_path)

        status_msg = f"Processed {len(results)} texts successfully."
        if errors:
            status_msg += f" Errors: {len(errors)}"

        return output_files, status_msg

    def run_example(lang, text):
        """Adapter for gr.Examples, whose inputs arrive as (language, text)."""
        return generate_speech_handler(text, lang, 1.0)

    # Connect events
    generate_btn.click(
        fn=generate_speech_handler,
        inputs=[text_input, language, speed],
        outputs=[audio_output, status],
    )

    clear_btn.click(
        fn=clear_all,
        outputs=[text_input, demo_output, audio_output, status],
    )

    demo_btn.click(
        fn=load_demo,
        inputs=[language],
        outputs=[demo_output],
    )

    batch_btn.click(
        fn=process_batch,
        inputs=[batch_text, language, speed],
        outputs=[batch_output, batch_status],
    )

    # Examples
    gr.Markdown("### 💡 Example Texts")
    examples = [
        ["Amharic", "ሁሉም ሰው በሁሉም መብቶች እኩል ነው።"],
        ["Somali", "Qof walba wuxuu leeyahay xuquuqda aadamaha."],
        ["Swahili", "Kila mtu ana haki zote za binadamu."],
        ["Afan Oromo", "Nama hundi mirga ummataa hundaa waliin dhalate."],
        ["Tigrinya", "ኩሉ ሰብ ንኩሉ መሰላት እኩል እዩ።"],
    ]

    gr.Examples(
        examples=examples,
        inputs=[language, text_input],
        outputs=[audio_output, status],
        fn=run_example,
        cache_examples=False,
    )

    # Footer
    gr.Markdown(
        """
        ---
        ### ℹ️ About
        **Powered by:** Facebook MMS-TTS Models
        **Supported Languages:** Amharic, Somali, Swahili, Afan Oromo, Tigrinya, Chichewa
        **Model Type:** Text-to-Speech
        **Max Text Length:** 500 characters

        For issues or questions, please check the model cards on Hugging Face.
        """
    )
326
+
327
if __name__ == "__main__":
    print("🚀 Starting MMS Text-to-Speech Service...")
    print("📋 Supported Languages:", list(MODELS.keys()))

    # Pre-load the Amharic model (the dropdown default) for a faster first
    # response. Failure here is not fatal: the model is loaded again on demand.
    try:
        tts_service.load_model("Amharic")
        print("✅ Pre-loaded Amharic model")
    except Exception as e:
        print("⚠️ Could not pre-load model:", e)

    demo.launch(
        server_name="0.0.0.0",
        server_port=7860,
        share=False,
        show_error=True,
    )
requirements.txt ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ # requirements.txt
2
+ torch>=2.0.0
3
+ torchaudio>=2.0.0
4
+ transformers>=4.30.0
5
+ gradio>=4.0.0
6
+ numpy>=1.21.0
7
+ librosa>=0.10.0
8
+ soundfile>=0.12.0
9
+ phonemizer>=3.0.0