Spaces:

DroolingPanda
/

tts_gallery

Sleeping

App Files Files Community

Michael Hu committed on Sep 27, 2025

Commit

6c4b49c

1 Parent(s): d77f8ff

implement faster whisper

Browse files

Files changed (3) hide show

README.md +1 -0
app.py +117 -1
requirements.txt +3 -1

README.md CHANGED Viewed

@@ -47,6 +47,7 @@ This demo showcases the multilingual capabilities of multiple TTS models, suppor
 - **Chatterbox**: Industrial-grade multilingual TTS solution
 - **KittenTTS**: High-quality TTS with voice cloning capabilities
 - **Piper**: Local on-device TTS with multiple voice options
 ## Examples

 - **Chatterbox**: Industrial-grade multilingual TTS solution
 - **KittenTTS**: High-quality TTS with voice cloning capabilities
 - **Piper**: Local on-device TTS with multiple voice options
+- **Faster Whisper**: High-performance speech recognition model for audio transcription
 ## Examples

app.py CHANGED Viewed

@@ -14,12 +14,14 @@ from transformers import AutoModelForSeq2SeqLM
 import soundfile as sf
 import wave
 import os
 # Model descriptions for better understanding
 MODEL_DESCRIPTIONS = {
     "ResembleAI/chatterbox": "Industrial-grade TTS solution with multilingual support",
     "KittenML/KittenTTS": "High-quality TTS with voice cloning capabilities using reference audio",
     "piper-tts": "Local on-device TTS with dynamic English and Chinese voice selection from Piper models",
 }
 # Models dictionary
@@ -27,6 +29,7 @@ MODELS = {
     "ResembleAI/chatterbox": "Chatterbox",
     "KittenML/KittenTTS": "KittenTTS",
     "piper-tts": "Piper (no voice cloning)",
 }
 original_torch_load = torch.load
@@ -87,6 +90,36 @@ voices_by_lang = scan_piper_voices()
 # No global piper_voice, load dynamically
 def generate_chatterbox_speech(text, language, audio_prompt=None):
     """
     Generate speech from text using Chatterbox multilingual TTS with optional audio prompt
@@ -185,6 +218,50 @@ def update_piper_voices(lang):
     value = choices[0] if choices else None
     return gr.update(choices=choices, value=value)
 def create_model_card(repo: str) -> str:
     """Create a formatted model card with ratings and description."""
     display_name = MODELS[repo]
@@ -283,7 +360,39 @@ with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.theme
             piper_audio_output = gr.Audio(label="Generated Speech", type="filepath")
             piper_status = gr.Textbox(label="Status", interactive=False)
-    # VibeVoice section removed
     # Examples for Chatterbox
     gr.Examples(
@@ -320,6 +429,13 @@ with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.theme
         outputs=[piper_audio_output, piper_status]
     )
     # Update voice dropdown when language changes
     piper_language_selection.change(
         fn=update_piper_voices,

 import soundfile as sf
 import wave
 import os
+from faster_whisper import WhisperModel
 # Model descriptions for better understanding
 MODEL_DESCRIPTIONS = {
     "ResembleAI/chatterbox": "Industrial-grade TTS solution with multilingual support",
     "KittenML/KittenTTS": "High-quality TTS with voice cloning capabilities using reference audio",
     "piper-tts": "Local on-device TTS with dynamic English and Chinese voice selection from Piper models",
+    "SYSTRAN/faster-whisper": "Faster Whisper transcription with CTranslate2, up to 4x faster than OpenAI Whisper",
 }
 # Models dictionary
     "ResembleAI/chatterbox": "Chatterbox",
     "KittenML/KittenTTS": "KittenTTS",
     "piper-tts": "Piper (no voice cloning)",
+    "SYSTRAN/faster-whisper": "Faster Whisper",
 }
 original_torch_load = torch.load
 # No global piper_voice, load dynamically
+# Initialize faster-whisper model
+def initialize_faster_whisper():
+    """Initialize the faster-whisper model with appropriate compute settings"""
+    model_size = "large-v3"
+    try:
+        if torch.cuda.is_available():
+            whisper_model = WhisperModel(model_size, device="cuda", compute_type="float16")
+            print("Loaded faster-whisper on CUDA with FP16")
+        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+            # MPS (Apple Silicon) support
+            whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
+            print("Loaded faster-whisper on CPU with INT8 (MPS not directly supported)")
+        else:
+            whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
+            print("Loaded faster-whisper on CPU with INT8")
+        return whisper_model
+    except Exception as e:
+        print(f"Error loading faster-whisper model: {str(e)}")
+        print("Falling back to small model with INT8 quantization")
+        try:
+            return WhisperModel("small", device="cpu", compute_type="int8")
+        except Exception as e2:
+            print(f"Failed to load fallback model: {str(e2)}")
+            return None
+# Initialize the model
+whisper_model = initialize_faster_whisper()
 def generate_chatterbox_speech(text, language, audio_prompt=None):
     """
     Generate speech from text using Chatterbox multilingual TTS with optional audio prompt
     value = choices[0] if choices else None
     return gr.update(choices=choices, value=value)
+def generate_faster_whisper_speech(audio_file, beam_size=5, language=None):
+    """
+    Transcribe speech from audio file using Faster Whisper
+    Args:
+        audio_file (str): Path to audio file for transcription
+        beam_size (int): Beam size for transcription (higher = more accurate but slower)
+        language (str, optional): Language code to force for transcription
+    Returns:
+        tuple: (transcription_text, error_msg) - text if success, empty and error if fail
+    """
+    if not audio_file or not os.path.exists(audio_file):
+        return "", "Please upload an audio file to transcribe."
+    if whisper_model is None:
+        return "", "Faster Whisper model failed to initialize."
+    try:
+        # Set up transcription parameters
+        transcribe_options = {
+            "beam_size": beam_size,
+            "language": language if language else None,
+            "task": "transcribe"
+        }
+        # Remove None values
+        transcribe_options = {k: v for k, v in transcribe_options.items() if v is not None}
+        # Perform transcription
+        segments, info = whisper_model.transcribe(audio_file, **transcribe_options)
+        # Collect all segments into a single text
+        result = ""
+        for segment in segments:
+            result += segment.text + " "
+        # Add language detection info
+        detected_info = f"\n\nDetected language: {info.language} (probability: {info.language_probability:.2f})"
+        return result.strip(), detected_info
+    except Exception as e:
+        return "", f"Error transcribing audio: {str(e)}"
 def create_model_card(repo: str) -> str:
     """Create a formatted model card with ratings and description."""
     display_name = MODELS[repo]
             piper_audio_output = gr.Audio(label="Generated Speech", type="filepath")
             piper_status = gr.Textbox(label="Status", interactive=False)
+    # Faster Whisper section
+    whisper_model_info = gr.HTML(create_model_card("SYSTRAN/faster-whisper"))
+    with gr.Row():
+        with gr.Column():
+            whisper_audio_input = gr.Audio(
+                label="Upload Audio for Transcription",
+                type="filepath"
+            )
+            whisper_beam_size = gr.Slider(
+                minimum=1,
+                maximum=10,
+                value=5,
+                step=1,
+                label="Beam Size (higher = more accurate but slower)"
+            )
+            whisper_language = gr.Dropdown(
+                choices=["", "en", "zh", "fr", "de", "ja", "es", "ru", "ko", "it"],
+                value="",
+                label="Force Language (optional)"
+            )
+            whisper_transcribe_btn = gr.Button("Transcribe Audio")
+        with gr.Column():
+            whisper_text_output = gr.Textbox(
+                label="Transcription Result",
+                lines=5,
+                interactive=False
+            )
+            whisper_status = gr.Textbox(
+                label="Status",
+                interactive=False
+            )
     # Examples for Chatterbox
     gr.Examples(
         outputs=[piper_audio_output, piper_status]
     )
+    # Connect the Faster Whisper transcribe button to the function
+    whisper_transcribe_btn.click(
+        fn=generate_faster_whisper_speech,
+        inputs=[whisper_audio_input, whisper_beam_size, whisper_language],
+        outputs=[whisper_text_output, whisper_status]
+    )
     # Update voice dropdown when language changes
     piper_language_selection.change(
         fn=update_piper_voices,

requirements.txt CHANGED Viewed

@@ -6,4 +6,6 @@ soundfile
 https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl
 piper-tts
 transformers
-accelerate

 https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl
 piper-tts
 transformers
+accelerate
+faster-whisper
+librosa