Michael Hu committed on
Commit
8829e6c
·
1 Parent(s): 6c4b49c

feat: add Kokoro-82M TTS model support

Browse files

- Add Kokoro-82M TTS model support to the app
- Update README to mention Kokoro model
- Add Kokoro-82M to the list of supported models
- Add Kokoro-82M to the list of supported models in the app

Files changed (3) hide show
  1. README.md +1 -0
  2. app.py +89 -0
  3. requirements.txt +2 -1
README.md CHANGED
@@ -48,6 +48,7 @@ This demo showcases the multilingual capabilities of multiple TTS models, suppor
48
  - **KittenTTS**: High-quality TTS with voice cloning capabilities
49
  - **Piper**: Local on-device TTS with multiple voice options
50
  - **Faster Whisper**: High-performance speech recognition model for audio transcription
 
51
 
52
  ## Examples
53
 
 
48
  - **KittenTTS**: High-quality TTS with voice cloning capabilities
49
  - **Piper**: Local on-device TTS with multiple voice options
50
  - **Faster Whisper**: High-performance speech recognition model for audio transcription
51
+ - **Kokoro**: Lightweight TTS model with 82M parameters, Apache-licensed for production and personal use
52
 
53
  ## Examples
54
 
app.py CHANGED
@@ -15,6 +15,7 @@ import soundfile as sf
15
  import wave
16
  import os
17
  from faster_whisper import WhisperModel
 
18
 
19
  # Model descriptions for better understanding
20
  MODEL_DESCRIPTIONS = {
@@ -22,6 +23,7 @@ MODEL_DESCRIPTIONS = {
22
  "KittenML/KittenTTS": "High-quality TTS with voice cloning capabilities using reference audio",
23
  "piper-tts": "Local on-device TTS with dynamic English and Chinese voice selection from Piper models",
24
  "SYSTRAN/faster-whisper": "Faster Whisper transcription with CTranslate2, up to 4x faster than OpenAI Whisper",
 
25
  }
26
 
27
  # Models dictionary
@@ -30,6 +32,7 @@ MODELS = {
30
  "KittenML/KittenTTS": "KittenTTS",
31
  "piper-tts": "Piper (no voice cloning)",
32
  "SYSTRAN/faster-whisper": "Faster Whisper",
 
33
  }
34
 
35
  original_torch_load = torch.load
@@ -90,6 +93,17 @@ voices_by_lang = scan_piper_voices()
90
 
91
  # No global piper_voice, load dynamically
92
 
 
 
 
 
 
 
 
 
 
 
 
93
  # Initialize faster-whisper model
94
  def initialize_faster_whisper():
95
  """Initialize the faster-whisper model with appropriate compute settings"""
@@ -184,6 +198,43 @@ def generate_kittentts_speech(text, audio_prompt=None):
184
  sf.write(tmp_file.name, wav, 24000)
185
  return tmp_file.name
186
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  def generate_piper_speech(text, lang, voice):
188
  """
189
  Generate speech from text using Piper TTS with selected voice
@@ -394,6 +445,37 @@ with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.theme
394
  interactive=False
395
  )
396
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
397
  # Examples for Chatterbox
398
  gr.Examples(
399
  examples=[
@@ -436,6 +518,13 @@ with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.theme
436
  outputs=[whisper_text_output, whisper_status]
437
  )
438
 
 
 
 
 
 
 
 
439
  # Update voice dropdown when language changes
440
  piper_language_selection.change(
441
  fn=update_piper_voices,
 
15
  import wave
16
  import os
17
  from faster_whisper import WhisperModel
18
+ from kokoro import KPipeline
19
 
20
  # Model descriptions for better understanding
21
  MODEL_DESCRIPTIONS = {
 
23
  "KittenML/KittenTTS": "High-quality TTS with voice cloning capabilities using reference audio",
24
  "piper-tts": "Local on-device TTS with dynamic English and Chinese voice selection from Piper models",
25
  "SYSTRAN/faster-whisper": "Faster Whisper transcription with CTranslate2, up to 4x faster than OpenAI Whisper",
26
+ "hexgrad/kokoro": "Lightweight TTS model with 82M parameters, Apache-licensed for production and personal use",
27
  }
28
 
29
  # Models dictionary
 
32
  "KittenML/KittenTTS": "KittenTTS",
33
  "piper-tts": "Piper (no voice cloning)",
34
  "SYSTRAN/faster-whisper": "Faster Whisper",
35
+ "hexgrad/kokoro": "Kokoro-82M",
36
  }
37
 
38
  original_torch_load = torch.load
 
93
 
94
  # No global piper_voice, load dynamically
95
 
96
+ # Initialize Kokoro
97
def initialize_kokoro():
    """Build the default Kokoro pipeline.

    Returns:
        The KPipeline instance created with lang_code='a', or None when
        construction fails (the error is printed, not raised).
    """
    try:
        # 'a' is the language code used for American English
        pipeline = KPipeline(lang_code='a')
        print("Loaded Kokoro-82M pipeline with American English")
    except Exception as err:
        print(f"Error loading Kokoro pipeline: {err}")
        pipeline = None
    return pipeline
106
+
107
  # Initialize faster-whisper model
108
  def initialize_faster_whisper():
109
  """Initialize the faster-whisper model with appropriate compute settings"""
 
198
  sf.write(tmp_file.name, wav, 24000)
199
  return tmp_file.name
200
 
201
def generate_kokoro_speech(text, language_code, voice_name):
    """
    Generate speech from text using Kokoro TTS with selected voice

    Args:
        text (str): Text to convert to speech
        language_code (str): Language code ('a' for American English, etc.)
        voice_name (str): Selected voice name

    Returns:
        tuple: (audio_path, error_msg) - (path, "") on success,
        (None, message) on failure. Errors are reported through the
        message rather than raised.
    """
    # Treat None the same as blank input so the caller always gets a tuple
    if not text or not text.strip():
        return None, "Please enter text to synthesize."

    tmp_path = None
    try:
        # Initialize Kokoro pipeline with the selected language code
        kokoro_pipeline = KPipeline(lang_code=language_code)

        # Collect the audio of every generated segment
        audio_chunks = []
        for _, _, audio in kokoro_pipeline(text, voice=voice_name):
            if audio is None:
                # Nothing to append for a segment that carries no audio
                continue
            if hasattr(audio, "detach"):
                # Presumably a torch tensor; move it to host memory first
                audio = audio.detach().cpu().numpy()
            audio_chunks.append(np.asarray(audio))

        # Do not write an empty WAV file and report success
        if not audio_chunks:
            return None, "Error synthesizing speech: no audio was generated."

        final_audio = np.concatenate(audio_chunks)

        # Reserve a temp path and close the handle before writing by name,
        # so the write also works where an open file cannot be reopened
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            tmp_path = tmp_file.name
        sf.write(tmp_path, final_audio, 24000)  # Kokoro uses 24kHz sample rate
        return tmp_path, ""
    except Exception as e:
        # Remove the temp file if it was created but the write failed
        if tmp_path is not None:
            try:
                os.remove(tmp_path)
            except OSError:
                pass
        return None, f"Error synthesizing speech: {str(e)}"
237
+
238
  def generate_piper_speech(text, lang, voice):
239
  """
240
  Generate speech from text using Piper TTS with selected voice
 
445
  interactive=False
446
  )
447
 
448
+ # Kokoro section
449
+ kokoro_model_info = gr.HTML(create_model_card("hexgrad/kokoro"))
450
+
451
+ with gr.Row():
452
+ with gr.Column():
453
+ kokoro_language_code = gr.Dropdown(
454
+ choices=[
455
+ ("American English", "a"),
456
+ ("British English", "b"),
457
+ ("Spanish", "e"),
458
+ ("French", "f"),
459
+ ("Hindi", "h"),
460
+ ("Italian", "i"),
461
+ ("Japanese", "j"),
462
+ ("Brazilian Portuguese", "p"),
463
+ ("Mandarin Chinese", "z")
464
+ ],
465
+ value="a",
466
+ label="Language"
467
+ )
468
+ kokoro_voice = gr.Dropdown(
469
+ choices=["af_heart", "af_sun", "af_moon", "af_star", "af_cloud"],
470
+ value="af_heart",
471
+ label="Voice"
472
+ )
473
+ kokoro_generate_btn = gr.Button("Generate Speech")
474
+
475
+ with gr.Column():
476
+ kokoro_audio_output = gr.Audio(label="Generated Speech", type="filepath")
477
+ kokoro_status = gr.Textbox(label="Status", interactive=False)
478
+
479
  # Examples for Chatterbox
480
  gr.Examples(
481
  examples=[
 
518
  outputs=[whisper_text_output, whisper_status]
519
  )
520
 
521
+ # Connect the Kokoro UI components to the generation function
522
+ kokoro_generate_btn.click(
523
+ fn=generate_kokoro_speech,
524
+ inputs=[text_input, kokoro_language_code, kokoro_voice],
525
+ outputs=[kokoro_audio_output, kokoro_status]
526
+ )
527
+
528
  # Update voice dropdown when language changes
529
  piper_language_selection.change(
530
  fn=update_piper_voices,
requirements.txt CHANGED
@@ -8,4 +8,5 @@ piper-tts
8
  transformers
9
  accelerate
10
  faster-whisper
11
- librosa
 
 
8
  transformers
9
  accelerate
10
  faster-whisper
11
+ librosa
12
+ kokoro==0.7.16