Spaces:

DroolingPanda
/

tts_gallery

Sleeping

Michael Hu committed on Sep 27, 2025

Commit

8829e6c

1 Parent(s): 6c4b49c

feat: add Kokoro-82M TTS model support

- Add Kokoro-82M TTS model support to the app
- Update README to mention Kokoro model
- Add Kokoro-82M to the list of supported models
- Add Kokoro-82M to the list of supported models in the app

Files changed (3) hide show

README.md +1 -0
app.py +89 -0
requirements.txt +2 -1

README.md CHANGED Viewed

@@ -48,6 +48,7 @@ This demo showcases the multilingual capabilities of multiple TTS models, suppor
 - **KittenTTS**: High-quality TTS with voice cloning capabilities
 - **Piper**: Local on-device TTS with multiple voice options
 - **Faster Whisper**: High-performance speech recognition model for audio transcription
 ## Examples

 - **KittenTTS**: High-quality TTS with voice cloning capabilities
 - **Piper**: Local on-device TTS with multiple voice options
 - **Faster Whisper**: High-performance speech recognition model for audio transcription
+- **Kokoro**: Lightweight TTS model with 82M parameters, Apache-licensed for production and personal use
 ## Examples

app.py CHANGED Viewed

@@ -15,6 +15,7 @@ import soundfile as sf
 import wave
 import os
 from faster_whisper import WhisperModel
 # Model descriptions for better understanding
 MODEL_DESCRIPTIONS = {
@@ -22,6 +23,7 @@ MODEL_DESCRIPTIONS = {
     "KittenML/KittenTTS": "High-quality TTS with voice cloning capabilities using reference audio",
     "piper-tts": "Local on-device TTS with dynamic English and Chinese voice selection from Piper models",
     "SYSTRAN/faster-whisper": "Faster Whisper transcription with CTranslate2, up to 4x faster than OpenAI Whisper",
 }
 # Models dictionary
@@ -30,6 +32,7 @@ MODELS = {
     "KittenML/KittenTTS": "KittenTTS",
     "piper-tts": "Piper (no voice cloning)",
     "SYSTRAN/faster-whisper": "Faster Whisper",
 }
 original_torch_load = torch.load
@@ -90,6 +93,17 @@ voices_by_lang = scan_piper_voices()
 # No global piper_voice, load dynamically
 # Initialize faster-whisper model
 def initialize_faster_whisper():
     """Initialize the faster-whisper model with appropriate compute settings"""
@@ -184,6 +198,43 @@ def generate_kittentts_speech(text, audio_prompt=None):
         sf.write(tmp_file.name, wav, 24000)
         return tmp_file.name
 def generate_piper_speech(text, lang, voice):
     """
     Generate speech from text using Piper TTS with selected voice
@@ -394,6 +445,37 @@ with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.theme
                 interactive=False
             )
     # Examples for Chatterbox
     gr.Examples(
         examples=[
@@ -436,6 +518,13 @@ with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.theme
         outputs=[whisper_text_output, whisper_status]
     )
     # Update voice dropdown when language changes
     piper_language_selection.change(
         fn=update_piper_voices,

 import wave
 import os
 from faster_whisper import WhisperModel
+from kokoro import KPipeline
 # Model descriptions for better understanding
 MODEL_DESCRIPTIONS = {
     "KittenML/KittenTTS": "High-quality TTS with voice cloning capabilities using reference audio",
     "piper-tts": "Local on-device TTS with dynamic English and Chinese voice selection from Piper models",
     "SYSTRAN/faster-whisper": "Faster Whisper transcription with CTranslate2, up to 4x faster than OpenAI Whisper",
+    "hexgrad/kokoro": "Lightweight TTS model with 82M parameters, Apache-licensed for production and personal use",
 }
 # Models dictionary
     "KittenML/KittenTTS": "KittenTTS",
     "piper-tts": "Piper (no voice cloning)",
     "SYSTRAN/faster-whisper": "Faster Whisper",
+    "hexgrad/kokoro": "Kokoro-82M",
 }
 original_torch_load = torch.load
 # No global piper_voice, load dynamically
+# Initialize Kokoro
+def initialize_kokoro():
+    """Build the default Kokoro-82M pipeline.
+
+    The pipeline is created with language code 'a' (American English).
+
+    Returns:
+        The constructed KPipeline, or None if construction failed. Failures
+        are printed rather than raised.
+    """
+    try:
+        pipeline = KPipeline(lang_code='a')
+        print("Loaded Kokoro-82M pipeline with American English")
+    except Exception as e:
+        # Best-effort load: report the problem and signal failure with None
+        print(f"Error loading Kokoro pipeline: {e}")
+        return None
+    return pipeline
 # Initialize faster-whisper model
 def initialize_faster_whisper():
     """Initialize the faster-whisper model with appropriate compute settings"""
         sf.write(tmp_file.name, wav, 24000)
         return tmp_file.name
+def generate_kokoro_speech(text, language_code, voice_name):
+    """
+    Generate speech from text using Kokoro TTS with selected voice
+
+    Args:
+        text (str): Text to convert to speech; None or blank text is rejected
+        language_code (str): Language code ('a' for American English, etc.)
+        voice_name (str): Selected voice name
+    Returns:
+        tuple: (audio_path, error_msg) - (path to a WAV file, "") on success,
+        (None, message) on failure. No exception is propagated to the caller.
+    """
+    # Guard against None as well as empty/whitespace-only input
+    if not text or not text.strip():
+        return None, "Please enter text to synthesize."
+    try:
+        # Build the pipeline for the selected language code
+        kokoro_pipeline = KPipeline(lang_code=language_code)
+        # Each result unpacks to three values; only the audio part is used.
+        # Results without audio are skipped so concatenation cannot fail on None.
+        audio_chunks = [
+            audio
+            for _, _, audio in kokoro_pipeline(text, voice=voice_name)
+            if audio is not None
+        ]
+        # Report an error instead of writing an empty WAV and claiming success
+        if not audio_chunks:
+            return None, "Error synthesizing speech: no audio was generated."
+        final_audio = np.concatenate(audio_chunks)
+        # Reserve a path, then close the handle before writing to that path
+        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
+            output_path = tmp_file.name
+        try:
+            sf.write(output_path, final_audio, 24000)  # Kokoro uses 24kHz sample rate
+        except Exception:
+            # delete=False means nothing else cleans up a failed write
+            os.remove(output_path)
+            raise
+        return output_path, ""
+    except Exception as e:
+        return None, f"Error synthesizing speech: {str(e)}"
 def generate_piper_speech(text, lang, voice):
     """
     Generate speech from text using Piper TTS with selected voice
                 interactive=False
             )
+    # Kokoro section: model card, language/voice selectors, trigger button,
+    # and the audio/status outputs filled in by generate_kokoro_speech.
+    kokoro_model_info = gr.HTML(create_model_card("hexgrad/kokoro"))
+    with gr.Row():
+        with gr.Column():
+            # Each choice is (display label, value); the one-letter value is
+            # what gets passed on as the Kokoro language code.
+            kokoro_language_code = gr.Dropdown(
+                choices=[
+                    ("American English", "a"),
+                    ("British English", "b"),
+                    ("Spanish", "e"),
+                    ("French", "f"),
+                    ("Hindi", "h"),
+                    ("Italian", "i"),
+                    ("Japanese", "j"),
+                    ("Brazilian Portuguese", "p"),
+                    ("Mandarin Chinese", "z")
+                ],
+                value="a",
+                label="Language"
+            )
+            # Voice names taken from the published Kokoro-82M voice list.
+            # NOTE(review): these are all American English voices; the list
+            # is not updated when a different language is selected - confirm
+            # whether per-language voice choices are wanted.
+            kokoro_voice = gr.Dropdown(
+                choices=["af_heart", "af_bella", "af_nicole", "af_sarah", "af_sky"],
+                value="af_heart",
+                label="Voice"
+            )
+            kokoro_generate_btn = gr.Button("Generate Speech")
+        with gr.Column():
+            kokoro_audio_output = gr.Audio(label="Generated Speech", type="filepath")
+            kokoro_status = gr.Textbox(label="Status", interactive=False)
     # Examples for Chatterbox
     gr.Examples(
         examples=[
         outputs=[whisper_text_output, whisper_status]
     )
+    # Connect the Kokoro UI components to the generation function:
+    # clicking the button calls generate_kokoro_speech with the text, the
+    # selected language code and the selected voice, and routes its two return
+    # values (audio file path, status/error message) to the audio player and
+    # the status box.
+    # NOTE(review): text_input is defined outside this section - presumably
+    # the text box shared with the other models; confirm.
+    kokoro_generate_btn.click(
+        fn=generate_kokoro_speech,
+        inputs=[text_input, kokoro_language_code, kokoro_voice],
+        outputs=[kokoro_audio_output, kokoro_status]
+    )
     # Update voice dropdown when language changes
     piper_language_selection.change(
         fn=update_piper_voices,

requirements.txt CHANGED Viewed

@@ -8,4 +8,5 @@ piper-tts
 transformers
 accelerate
 faster-whisper
-librosa

 transformers
 accelerate
 faster-whisper
+librosa
+kokoro==0.7.16