Spaces:

minte-atnafu
/

GihonTech_Local_Language_TTS

Sleeping

App Files Files Community

Minte committed on Oct 8, 2025

Commit

6d28d4b

1 Parent(s): d5fb354

tts space

Browse files

Files changed (3) hide show

README.md +34 -0
app.py +344 -0
requirements.txt +9 -0

README.md CHANGED Viewed

@@ -12,3 +12,37 @@ short_description: This space used for text to speech
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
+ Gradio web interface for Facebook's MMS-TTS models supporting multiple African languages.
+## 🌍 Supported Languages
+- **Amharic** (`facebook/mms-tts-amh`)
+- **Somali** (`facebook/mms-tts-som`)
+- **Swahili** (`facebook/mms-tts-swh`)
+- **Afan Oromo** (`facebook/mms-tts-orm`)
+- **Tigrinya** (`facebook/mms-tts-tir`)
+- **Chichewa** (no dedicated model; falls back to the Swahili model `facebook/mms-tts-swh`)
+## 🚀 Features
+- Real-time text-to-speech conversion
+- Adjustable speech speed
+- Batch processing for multiple texts
+- Demo texts for each language
+- Mobile-friendly interface
+## 💻 Usage
+1. Select your target language
+2. Enter text (up to 500 characters)
+3. Adjust speed if desired
+4. Click "Generate Speech"
+5. Download or play the generated audio
+## 🔧 Technical Details
+- Built with Gradio for easy web interface
+- Uses Facebook's MMS-TTS transformer models
+- Supports GPU acceleration when available
+- Automatic model loading and caching

app.py ADDED Viewed

	@@ -0,0 +1,344 @@

+# app.py
+import gradio as gr
+import torch
+import torchaudio
+from transformers import VitsModel, AutoTokenizer
+import numpy as np
+import io
+import soundfile as sf
+from datetime import datetime
+import os
+# Model configuration for each language
+MODELS = {
+    "Amharic": "facebook/mms-tts-amh",
+    "Somali": "facebook/mms-tts-som",
+    "Swahili": "facebook/mms-tts-swh",
+    "Afan Oromo": "facebook/mms-tts-orm",
+    "Tigrinya": "facebook/mms-tts-tir",
+    # Note: Chichewa doesn't have a dedicated MMS-TTS model, using Swahili as fallback
+    "Chichewa": "facebook/mms-tts-swh"
+}
+# Language codes for phonemizer
+LANGUAGE_CODES = {
+    "Amharic": "am",
+    "Somali": "so",
+    "Swahili": "sw",
+    "Afan Oromo": "om",
+    "Tigrinya": "ti",
+    "Chichewa": "ny"  # Chichewa language code
+}
+class MMS_TTS_Service:
+    def __init__(self):
+        self.models = {}
+        self.tokenizers = {}
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"Using device: {self.device}")
+    def load_model(self, language):
+        """Load model for specific language"""
+        if language in self.models:
+            return self.models[language], self.tokenizers[language]
+        try:
+            model_name = MODELS[language]
+            print(f"Loading model for {language}: {model_name}")
+            # Load tokenizer and model
+            tokenizer = AutoTokenizer.from_pretrained(model_name)
+            model = VitsModel.from_pretrained(model_name)
+            model = model.to(self.device)
+            model.eval()
+            # Cache the loaded model
+            self.models[language] = model
+            self.tokenizers[language] = tokenizer
+            print(f"✅ Successfully loaded model for {language}")
+            return model, tokenizer
+        except Exception as e:
+            print(f"❌ Error loading model for {language}: {e}")
+            raise e
+    def generate_speech(self, text, language, speed=1.0):
+        """Generate speech from text for specified language"""
+        try:
+            # Load model if not already loaded
+            model, tokenizer = self.load_model(language)
+            # Tokenize input text
+            inputs = tokenizer(text, return_tensors="pt")
+            input_ids = inputs["input_ids"].to(self.device)
+            # Generate speech with torch.no_grad for efficiency
+            with torch.no_grad():
+                outputs = model(input_ids)
+                waveform = outputs.waveform[0].cpu().numpy()
+                sample_rate = model.config.sampling_rate
+            # Adjust speed if needed
+            if speed != 1.0:
+                waveform = self.adjust_speed(waveform, sample_rate, speed)
+            return (sample_rate, waveform), None
+        except Exception as e:
+            error_msg = f"Error generating speech: {str(e)}"
+            print(error_msg)
+            return None, error_msg
+    def adjust_speed(self, waveform, sample_rate, speed_factor):
+        """Adjust playback speed of audio"""
+        try:
+            # Simple resampling for speed adjustment
+            if speed_factor != 1.0:
+                new_length = int(len(waveform) / speed_factor)
+                indices = np.linspace(0, len(waveform) - 1, new_length)
+                waveform = np.interp(indices, np.arange(len(waveform)), waveform)
+            return waveform
+        except:
+            return waveform
+    def get_available_languages(self):
+        """Get list of available languages"""
+        return list(MODELS.keys())
+# Initialize TTS service
+# One shared module-level instance: text_to_speech() and batch_tts() both go
+# through it, so a language's model is loaded once and then reused from its cache.
+tts_service = MMS_TTS_Service()
+def text_to_speech(text, language, speed=1.0):
+    """
+    Main function for Gradio interface
+    """
+    if not text.strip():
+        return None, "Please enter some text to convert to speech."
+    if len(text) > 500:
+        return None, "Text too long. Please keep it under 500 characters."
+    print(f"Generating speech for: '{text[:50]}...' in {language}")
+    # Generate speech
+    result, error = tts_service.generate_speech(text, language, speed)
+    if error:
+        return None, error
+    sample_rate, waveform = result
+    return (sample_rate, waveform), "✅ Speech generated successfully!"
+def batch_tts(text_list, language, speed=1.0):
+    """
+    Batch processing multiple texts
+    """
+    results = []
+    errors = []
+    for i, text in enumerate(text_list):
+        if text.strip():
+            result, error = tts_service.generate_speech(text.strip(), language, speed)
+            if error:
+                errors.append(f"Text {i+1}: {error}")
+            else:
+                results.append((f"output_{i+1}.wav", result[0], result[1]))
+    return results, errors
+def create_demo_audio(language):
+    """Create demo audio for each language"""
+    demo_texts = {
+        "Amharic": "ሰላም፣ ይህ የድምፅ ማመንጫ ሞዴል ነው።",
+        "Somali": "Salaam, kani waa modelka cod-sameynta.",
+        "Swahili": "Halo, hii ni modeli ya kutengeneza sauti.",
+        "Afan Oromo": "Akkam, kun modeli sagalee uumuudha.",
+        "Tigrinya": "ሰላም፣ እዚ ድምጺ ዝገብር ሞዴል እዩ።",
+        "Chichewa": "Moni, iyi ndi modeli yopanga mawu."
+    }
+    demo_text = demo_texts.get(language, "Hello, this is a text-to-speech model.")
+    return demo_text
+# Gradio interface
+with gr.Blocks(theme=gr.themes.Soft(), title="MMS Text-to-Speech") as demo:
+    gr.Markdown(
+        """
+        # 🎙️ MMS Text-to-Speech for African Languages
+        Convert text to natural speech in multiple African languages using Facebook's MMS-TTS models.
+        """
+    )
+    with gr.Row():
+        with gr.Column():
+            language = gr.Dropdown(
+                choices=tts_service.get_available_languages(),
+                value="Amharic",
+                label="Select Language",
+                info="Choose the language for speech generation"
+            )
+            text_input = gr.Textbox(
+                lines=3,
+                placeholder="Enter text to convert to speech...",
+                label="Input Text",
+                info="Maximum 500 characters"
+            )
+            speed = gr.Slider(
+                minimum=0.5,
+                maximum=2.0,
+                value=1.0,
+                step=0.1,
+                label="Speech Speed",
+                info="Adjust the playback speed"
+            )
+            with gr.Row():
+                generate_btn = gr.Button("Generate Speech", variant="primary")
+                clear_btn = gr.Button("Clear")
+            # Demo section
+            gr.Markdown("### 🎯 Quick Demo")
+            demo_btn = gr.Button("Load Demo Text")
+            demo_output = gr.Textbox(label="Demo Text", interactive=False)
+        with gr.Column():
+            audio_output = gr.Audio(
+                label="Generated Speech",
+                type="numpy",
+                interactive=False
+            )
+            status = gr.Textbox(
+                label="Status",
+                interactive=False,
+                placeholder="Ready to generate speech..."
+            )
+            # Batch processing section
+            gr.Markdown("### 📚 Batch Processing")
+            batch_text = gr.Textbox(
+                lines=4,
+                placeholder="Enter multiple texts, one per line...",
+                label="Batch Texts",
+                info="Each line will be processed separately"
+            )
+            batch_btn = gr.Button("Process Batch")
+            batch_output = gr.File(
+                label="Batch Results",
+                file_count="multiple",
+                type="file"
+            )
+            batch_status = gr.Textbox(label="Batch Status")
+    # Event handlers
+    def generate_speech_handler(text, lang, spd):
+        if not text.strip():
+            return None, "Please enter some text."
+        return text_to_speech(text, lang, spd)
+    def clear_all():
+        return "", "", None, "Cleared!"
+    def load_demo(lang):
+        """Return the demo sentence for ``lang`` by delegating to ``create_demo_audio``."""
+        return create_demo_audio(lang)
+    def process_batch(texts, lang, spd):
+        if not texts.strip():
+            return [], "No texts provided."
+        text_list = [t.strip() for t in texts.split('\n') if t.strip()]
+        if len(text_list) > 10:
+            return [], "Maximum 10 texts allowed for batch processing."
+        results, errors = batch_tts(text_list, lang, spd)
+        # Save results to files
+        output_files = []
+        for i, (filename, sample_rate, waveform) in enumerate(results):
+            temp_file = f"/tmp/{filename}"
+            sf.write(temp_file, waveform, sample_rate)
+            output_files.append(temp_file)
+        status_msg = f"Processed {len(results)} texts successfully."
+        if errors:
+            status_msg += f" Errors: {len(errors)}"
+        return output_files, status_msg
+    # Connect events
+    generate_btn.click(
+        fn=generate_speech_handler,
+        inputs=[text_input, language, speed],
+        outputs=[audio_output, status]
+    )
+    clear_btn.click(
+        fn=clear_all,
+        outputs=[text_input, demo_output, audio_output, status]
+    )
+    demo_btn.click(
+        fn=load_demo,
+        inputs=[language],
+        outputs=[demo_output]
+    )
+    batch_btn.click(
+        fn=process_batch,
+        inputs=[batch_text, language, speed],
+        outputs=[batch_output, batch_status]
+    )
+    # Examples
+    gr.Markdown("### 💡 Example Texts")
+    examples = [
+        ["Amharic", "ሁሉም ሰው በሁሉም መብቶች እኩል ነው።"],
+        ["Somali", "Qof walba wuxuu leeyahay xuquuqda aadamaha."],
+        ["Swahili", "Kila mtu ana haki zote za binadamu."],
+        ["Afan Oromo", "Nama hundi mirga ummataa hundaa waliin dhalate."],
+        ["Tigrinya", "ኩሉ ሰብ ንኩሉ መሰላት እኩል እዩ።"]
+    ]
+    gr.Examples(
+        examples=examples,
+        inputs=[language, text_input],
+        outputs=[audio_output, status],
+        fn=generate_speech_handler,
+        cache_examples=False
+    )
+    # Footer
+    # Static "About" panel. The language list and the 500-character limit are
+    # written out by hand here, so keep them in step with MODELS and the length
+    # check in text_to_speech().
+    gr.Markdown(
+        """
+        ---
+        ### ℹ️ About
+        **Powered by:** Facebook MMS-TTS Models
+        **Supported Languages:** Amharic, Somali, Swahili, Afan Oromo, Tigrinya, Chichewa
+        **Model Type:** Text-to-Speech
+        **Max Text Length:** 500 characters
+        For issues or questions, please check the model cards on Hugging Face.
+        """
+    )
+if __name__ == "__main__":
+    # Pre-load a model to reduce first-time latency
+    print("🚀 Starting MMS Text-to-Speech Service...")
+    print("📋 Supported Languages:", list(MODELS.keys()))
+    # Pre-load Amharic model for faster first response
+    try:
+        tts_service.load_model("Amharic")
+        print("✅ Pre-loaded Amharic model")
+    except Exception as e:
+        print("⚠️ Could not pre-load model:", e)
+    demo.launch(
+        server_name="0.0.0.0",
+        server_port=7860,
+        share=False,
+        show_error=True
+    )

requirements.txt ADDED Viewed

	@@ -0,0 +1,9 @@

+# requirements.txt
+torch>=2.0.0
+torchaudio>=2.0.0
+transformers>=4.30.0
+gradio>=4.0.0
+numpy>=1.21.0
+librosa>=0.10.0
+soundfile>=0.12.0
+phonemizer>=3.0.0