Michael Hu committed on
Commit
6c4b49c
·
1 Parent(s): d77f8ff

implement faster whisper

Browse files
Files changed (3) hide show
  1. README.md +1 -0
  2. app.py +117 -1
  3. requirements.txt +3 -1
README.md CHANGED
@@ -47,6 +47,7 @@ This demo showcases the multilingual capabilities of multiple TTS models, suppor
47
  - **Chatterbox**: Industrial-grade multilingual TTS solution
48
  - **KittenTTS**: High-quality TTS with voice cloning capabilities
49
  - **Piper**: Local on-device TTS with multiple voice options
 
50
 
51
  ## Examples
52
 
 
47
  - **Chatterbox**: Industrial-grade multilingual TTS solution
48
  - **KittenTTS**: High-quality TTS with voice cloning capabilities
49
  - **Piper**: Local on-device TTS with multiple voice options
50
+ - **Faster Whisper**: High-performance speech recognition model for audio transcription
51
 
52
  ## Examples
53
 
app.py CHANGED
@@ -14,12 +14,14 @@ from transformers import AutoModelForSeq2SeqLM
14
  import soundfile as sf
15
  import wave
16
  import os
 
17
 
18
  # Model descriptions for better understanding
19
  MODEL_DESCRIPTIONS = {
20
  "ResembleAI/chatterbox": "Industrial-grade TTS solution with multilingual support",
21
  "KittenML/KittenTTS": "High-quality TTS with voice cloning capabilities using reference audio",
22
  "piper-tts": "Local on-device TTS with dynamic English and Chinese voice selection from Piper models",
 
23
  }
24
 
25
  # Models dictionary
@@ -27,6 +29,7 @@ MODELS = {
27
  "ResembleAI/chatterbox": "Chatterbox",
28
  "KittenML/KittenTTS": "KittenTTS",
29
  "piper-tts": "Piper (no voice cloning)",
 
30
  }
31
 
32
  original_torch_load = torch.load
@@ -87,6 +90,36 @@ voices_by_lang = scan_piper_voices()
87
 
88
  # No global piper_voice, load dynamically
89
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
  def generate_chatterbox_speech(text, language, audio_prompt=None):
91
  """
92
  Generate speech from text using Chatterbox multilingual TTS with optional audio prompt
@@ -185,6 +218,50 @@ def update_piper_voices(lang):
185
  value = choices[0] if choices else None
186
  return gr.update(choices=choices, value=value)
187
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
188
  def create_model_card(repo: str) -> str:
189
  """Create a formatted model card with ratings and description."""
190
  display_name = MODELS[repo]
@@ -283,7 +360,39 @@ with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.theme
283
  piper_audio_output = gr.Audio(label="Generated Speech", type="filepath")
284
  piper_status = gr.Textbox(label="Status", interactive=False)
285
 
286
- # VibeVoice section removed
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
287
 
288
  # Examples for Chatterbox
289
  gr.Examples(
@@ -320,6 +429,13 @@ with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.theme
320
  outputs=[piper_audio_output, piper_status]
321
  )
322
 
 
 
 
 
 
 
 
323
  # Update voice dropdown when language changes
324
  piper_language_selection.change(
325
  fn=update_piper_voices,
 
14
  import soundfile as sf
15
  import wave
16
  import os
17
+ from faster_whisper import WhisperModel
18
 
19
  # Model descriptions for better understanding
20
  MODEL_DESCRIPTIONS = {
21
  "ResembleAI/chatterbox": "Industrial-grade TTS solution with multilingual support",
22
  "KittenML/KittenTTS": "High-quality TTS with voice cloning capabilities using reference audio",
23
  "piper-tts": "Local on-device TTS with dynamic English and Chinese voice selection from Piper models",
24
+ "SYSTRAN/faster-whisper": "Faster Whisper transcription with CTranslate2, up to 4x faster than OpenAI Whisper",
25
  }
26
 
27
  # Models dictionary
 
29
  "ResembleAI/chatterbox": "Chatterbox",
30
  "KittenML/KittenTTS": "KittenTTS",
31
  "piper-tts": "Piper (no voice cloning)",
32
+ "SYSTRAN/faster-whisper": "Faster Whisper",
33
  }
34
 
35
  original_torch_load = torch.load
 
90
 
91
  # No global piper_voice, load dynamically
92
 
93
+ # Initialize faster-whisper model
94
+ def initialize_faster_whisper():
95
+ """Initialize the faster-whisper model with appropriate compute settings"""
96
+ model_size = "large-v3"
97
+
98
+ try:
99
+ if torch.cuda.is_available():
100
+ whisper_model = WhisperModel(model_size, device="cuda", compute_type="float16")
101
+ print("Loaded faster-whisper on CUDA with FP16")
102
+ elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
103
+ # MPS (Apple Silicon) support
104
+ whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
105
+ print("Loaded faster-whisper on CPU with INT8 (MPS not directly supported)")
106
+ else:
107
+ whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
108
+ print("Loaded faster-whisper on CPU with INT8")
109
+
110
+ return whisper_model
111
+ except Exception as e:
112
+ print(f"Error loading faster-whisper model: {str(e)}")
113
+ print("Falling back to small model with INT8 quantization")
114
+ try:
115
+ return WhisperModel("small", device="cpu", compute_type="int8")
116
+ except Exception as e2:
117
+ print(f"Failed to load fallback model: {str(e2)}")
118
+ return None
119
+
120
+ # Initialize the model
121
+ whisper_model = initialize_faster_whisper()
122
+
123
  def generate_chatterbox_speech(text, language, audio_prompt=None):
124
  """
125
  Generate speech from text using Chatterbox multilingual TTS with optional audio prompt
 
218
  value = choices[0] if choices else None
219
  return gr.update(choices=choices, value=value)
220
 
221
+ def generate_faster_whisper_speech(audio_file, beam_size=5, language=None):
222
+ """
223
+ Transcribe speech from audio file using Faster Whisper
224
+
225
+ Args:
226
+ audio_file (str): Path to audio file for transcription
227
+ beam_size (int): Beam size for transcription (higher = more accurate but slower)
228
+ language (str, optional): Language code to force for transcription
229
+
230
+ Returns:
231
+ tuple: (transcription_text, error_msg) - text if success, empty and error if fail
232
+ """
233
+ if not audio_file or not os.path.exists(audio_file):
234
+ return "", "Please upload an audio file to transcribe."
235
+
236
+ if whisper_model is None:
237
+ return "", "Faster Whisper model failed to initialize."
238
+
239
+ try:
240
+ # Set up transcription parameters
241
+ transcribe_options = {
242
+ "beam_size": beam_size,
243
+ "language": language if language else None,
244
+ "task": "transcribe"
245
+ }
246
+
247
+ # Remove None values
248
+ transcribe_options = {k: v for k, v in transcribe_options.items() if v is not None}
249
+
250
+ # Perform transcription
251
+ segments, info = whisper_model.transcribe(audio_file, **transcribe_options)
252
+
253
+ # Collect all segments into a single text
254
+ result = ""
255
+ for segment in segments:
256
+ result += segment.text + " "
257
+
258
+ # Add language detection info
259
+ detected_info = f"\n\nDetected language: {info.language} (probability: {info.language_probability:.2f})"
260
+
261
+ return result.strip(), detected_info
262
+ except Exception as e:
263
+ return "", f"Error transcribing audio: {str(e)}"
264
+
265
  def create_model_card(repo: str) -> str:
266
  """Create a formatted model card with ratings and description."""
267
  display_name = MODELS[repo]
 
360
  piper_audio_output = gr.Audio(label="Generated Speech", type="filepath")
361
  piper_status = gr.Textbox(label="Status", interactive=False)
362
 
363
+ # Faster Whisper section
364
+ whisper_model_info = gr.HTML(create_model_card("SYSTRAN/faster-whisper"))
365
+
366
+ with gr.Row():
367
+ with gr.Column():
368
+ whisper_audio_input = gr.Audio(
369
+ label="Upload Audio for Transcription",
370
+ type="filepath"
371
+ )
372
+ whisper_beam_size = gr.Slider(
373
+ minimum=1,
374
+ maximum=10,
375
+ value=5,
376
+ step=1,
377
+ label="Beam Size (higher = more accurate but slower)"
378
+ )
379
+ whisper_language = gr.Dropdown(
380
+ choices=["", "en", "zh", "fr", "de", "ja", "es", "ru", "ko", "it"],
381
+ value="",
382
+ label="Force Language (optional)"
383
+ )
384
+ whisper_transcribe_btn = gr.Button("Transcribe Audio")
385
+
386
+ with gr.Column():
387
+ whisper_text_output = gr.Textbox(
388
+ label="Transcription Result",
389
+ lines=5,
390
+ interactive=False
391
+ )
392
+ whisper_status = gr.Textbox(
393
+ label="Status",
394
+ interactive=False
395
+ )
396
 
397
  # Examples for Chatterbox
398
  gr.Examples(
 
429
  outputs=[piper_audio_output, piper_status]
430
  )
431
 
432
+ # Connect the Faster Whisper transcribe button to the function
433
+ whisper_transcribe_btn.click(
434
+ fn=generate_faster_whisper_speech,
435
+ inputs=[whisper_audio_input, whisper_beam_size, whisper_language],
436
+ outputs=[whisper_text_output, whisper_status]
437
+ )
438
+
439
  # Update voice dropdown when language changes
440
  piper_language_selection.change(
441
  fn=update_piper_voices,
requirements.txt CHANGED
@@ -6,4 +6,6 @@ soundfile
6
  https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl
7
  piper-tts
8
  transformers
9
- accelerate
 
 
 
6
  https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl
7
  piper-tts
8
  transformers
9
+ accelerate
10
+ faster-whisper
11
+ librosa