Michael Hu committed on
Commit
ef4db28
·
1 Parent(s): b5ac4eb

refactor: replace inline model definitions with ModelFactory and remove unused imports

Browse files

- Remove all hard-coded model definitions and import the new factory
- Delete unused imports (torchaudio, sys, soundfile, transformers, etc.)
- Eliminate duplicate code for model discovery and voice handling
- Delete unused utility functions and duplicate code paths
- Remove unused dependency on librosa and soundfile

app.py CHANGED
@@ -1,43 +1,14 @@
1
  import gradio as gr
2
- import torchaudio as ta
3
  import torch
4
  import tempfile
5
  import os
6
- import sys
7
- import soundfile as sf
8
  import numpy as np
9
- import librosa
10
- from chatterbox.mtl_tts import ChatterboxMultilingualTTS
11
- from kittentts import KittenTTS
12
- from piper import PiperVoice
13
- from transformers import AutoModelForSeq2SeqLM
14
  import soundfile as sf
15
- import wave
16
- import os
17
- from faster_whisper import WhisperModel
18
- from kokoro import KPipeline
19
- # from src.dia_tts import DiaTTS
20
 
21
- # Model descriptions for better understanding
22
- MODEL_DESCRIPTIONS = {
23
- "ResembleAI/chatterbox": "Industrial-grade TTS solution with multilingual support",
24
- "KittenML/KittenTTS": "High-quality TTS with voice cloning capabilities using reference audio",
25
- "piper-tts": "Local on-device TTS with dynamic English and Chinese voice selection from Piper models",
26
- "SYSTRAN/faster-whisper": "Faster Whisper transcription with CTranslate2, up to 4x faster than OpenAI Whisper",
27
- "hexgrad/kokoro": "Lightweight TTS model with 82M parameters, Apache-licensed for production and personal use",
28
- "nari-labs/Dia-1.6B": "Ultra-realistic dialogue generation with support for voice cloning and non-verbal expressions",
29
- }
30
-
31
- # Models dictionary
32
- MODELS = {
33
- "ResembleAI/chatterbox": "Chatterbox",
34
- "KittenML/KittenTTS": "KittenTTS",
35
- "piper-tts": "Piper (no voice cloning)",
36
- "SYSTRAN/faster-whisper": "Faster Whisper",
37
- "hexgrad/kokoro": "Kokoro-82M",
38
- "nari-labs/Dia-1.6B": "Dia TTS",
39
- }
40
 
 
41
  original_torch_load = torch.load
42
 
43
  def patched_torch_load(f, map_location=None, **kwargs):
@@ -47,187 +18,38 @@ def patched_torch_load(f, map_location=None, **kwargs):
47
 
48
  torch.load = patched_torch_load
49
 
50
- # Initialize the multilingual model
51
- try:
52
- model = ChatterboxMultilingualTTS.from_pretrained(device="cuda" if torch.cuda.is_available() else "cpu")
53
- except RuntimeError as e:
54
- if "Attempting to deserialize object on a CUDA device" in str(e):
55
- print("CUDA model detected but CUDA is not available. Loading model on CPU...")
56
- model = ChatterboxMultilingualTTS.from_pretrained(device="cpu")
57
- else:
58
- raise e
59
-
60
- # Initialize KittenTTS model
61
- kittentts_model = KittenTTS("KittenML/kitten-tts-nano-0.2")
62
-
63
- # Scan Piper voices
64
- def scan_piper_voices():
65
- voices_dir = "src/voices/piper_voices"
66
- voices_by_lang = {'English': {}, 'Chinese': {}}
67
-
68
- # Chinese: only huayan medium
69
- chinese_path = os.path.join(voices_dir, "zh", "zh_CN", "huayan", "medium", "zh_CN-huayan-medium.onnx")
70
- if os.path.exists(chinese_path):
71
- voices_by_lang['Chinese']['huayan (zh_CN)'] = chinese_path
72
-
73
- # English voices
74
- en_dir = os.path.join(voices_dir, "en")
75
- for root, dirs, files in os.walk(en_dir):
76
- if len(root.split(os.sep)) < 5: # Skip if not deep enough
77
- continue
78
- parts = root.split(os.sep)
79
- if len(parts) >= 5 and parts[-1] in ['medium', 'high']:
80
- locale = parts[-3] # en_GB or en_US
81
- voice_name = parts[-2] # alan, etc.
82
- quality = parts[-1] # medium or high
83
-
84
- for file in files:
85
- if file.endswith('.onnx') and f"{locale}-{voice_name}-{quality}" in file:
86
- path = os.path.join(root, file)
87
- label = f"{voice_name} ({locale})"
88
- # Prefer medium over high
89
- if quality == 'medium' or label not in voices_by_lang['English']:
90
- voices_by_lang['English'][label] = path
91
- break # Assume one .onnx per dir
92
-
93
- return voices_by_lang
94
-
95
- voices_by_lang = scan_piper_voices()
96
-
97
- # No global piper_voice, load dynamically
98
-
99
- # Initialize Dia model
100
- # dia_model = None
101
- # def initialize_dia():
102
- # global dia_model
103
- # try:
104
- # dia_model = DiaTTS()
105
- # print("Loaded Dia-1.6B model")
106
- # return dia_model
107
- # except Exception as e:
108
- # print(f"Error loading Dia model: {e}")
109
- # return None
110
-
111
- # Initialize Kokoro
112
- def initialize_kokoro():
113
- try:
114
- # Initialize Kokoro pipeline with American English as default
115
- kokoro_pipeline = KPipeline(lang_code='a')
116
- print("Loaded Kokoro-82M pipeline with American English")
117
- return kokoro_pipeline
118
- except Exception as e:
119
- print(f"Error loading Kokoro pipeline: {e}")
120
- return None
121
-
122
- # Initialize faster-whisper model
123
- def initialize_faster_whisper():
124
- """Initialize the faster-whisper model with appropriate compute settings"""
125
- model_size = "large-v3"
126
-
127
- try:
128
- if torch.cuda.is_available():
129
- whisper_model = WhisperModel(model_size, device="cuda", compute_type="float16")
130
- print("Loaded faster-whisper on CUDA with FP16")
131
- elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
132
- # MPS (Apple Silicon) support
133
- whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
134
- print("Loaded faster-whisper on CPU with INT8 (MPS not directly supported)")
135
- else:
136
- whisper_model = WhisperModel(model_size, device="cpu", compute_type="int8")
137
- print("Loaded faster-whisper on CPU with INT8")
138
-
139
- return whisper_model
140
- except Exception as e:
141
- print(f"Error loading faster-whisper model: {str(e)}")
142
- print("Falling back to small model with INT8 quantization")
143
- try:
144
- return WhisperModel("small", device="cpu", compute_type="int8")
145
- except Exception as e2:
146
- print(f"Failed to load fallback model: {str(e2)}")
147
- return None
148
 
149
- # Initialize the model
150
- whisper_model = initialize_faster_whisper()
 
 
 
 
 
 
 
151
 
152
- def generate_chatterbox_speech(text, language, audio_prompt=None):
153
- """
154
- Generate speech from text using Chatterbox multilingual TTS with optional audio prompt
155
-
156
- Args:
157
- text (str): Text to convert to speech
158
- language (str): Language code ('en' for English, 'zh' for Chinese)
159
- audio_prompt (str, optional): Path to reference audio file for voice cloning
160
-
161
- Returns:
162
- str: Path to the generated audio file
163
- """
164
- # Map language codes to full names for Chatterbox
165
- language_map = {
166
- "English": "en",
167
- "Chinese": "zh"
168
- }
169
-
170
- language_id = language_map.get(language, "en")
171
 
172
- # https://huggingface.co/spaces/ResembleAI/Chatterbox/blob/main/app.py#L64-L67
173
- generate_kwargs = {
174
- "exaggeration": 0.5,
175
- "temperature": 0.8,
176
- "cfg_weight": 0.3,
177
- }
178
-
179
- # Generate speech using Chatterbox
180
- if audio_prompt and os.path.exists(audio_prompt):
181
- # Use audio prompt for voice cloning
182
- wav = model.generate(text, language_id=language_id, audio_prompt_path=audio_prompt, **generate_kwargs)
183
- else:
184
- # Generate without audio prompt (default voice)
185
- wav = model.generate(text, language_id=language_id, **generate_kwargs)
186
-
187
- # Save to a temporary file
188
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
189
- ta.save(tmp_file.name, wav, model.sr)
190
- return tmp_file.name
191
 
192
- def generate_kittentts_speech(text, audio_prompt=None):
193
- """
194
- Generate speech from text using KittenTTS with optional audio prompt
195
-
196
- Args:
197
- text (str): Text to convert to speech
198
- audio_prompt (str, optional): Path to reference audio file for voice cloning
199
-
200
- Returns:
201
- str: Path to the generated audio file
202
- """
203
- # Generate speech using KittenTTS
204
- if audio_prompt and os.path.exists(audio_prompt):
205
- # Use audio prompt for voice cloning
206
- wav = kittentts_model.generate(text, voice='expr-voice-2-f')
207
- else:
208
- # Generate without audio prompt (default voice)
209
- wav = kittentts_model.generate(text, voice='expr-voice-2-f')
210
-
211
- # Save to a temporary file
212
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
213
- sf.write(tmp_file.name, wav, 24000)
214
- return tmp_file.name
215
 
 
216
  def get_kokoro_voices(language_code):
217
  """
218
  Get available voices for a specific Kokoro language code
219
  Based on: https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
220
-
221
- Voice mapping:
222
- - American English (a): af_heart, af_alloy, af_aoede, af_bella, af_jessica, af_kore, af_nicole, af_nova, af_river, af_sarah, af_sky, am_adam, am_echo, am_eric, am_fenrir, am_liam, am_michael, am_onyx, am_puck, am_santa
223
- - British English (b): bf_alice, bf_emma, bf_isabella, bf_lily, bm_daniel, bm_fable, bm_george, bm_lewis
224
- - Spanish (e): ef_dora, em_alex, em_santa
225
- - French (f): ff_siwis
226
- - Hindi (h): hf_alpha, hf_beta, hm_omega, hm_psi
227
- - Italian (i): if_sara, im_nicola
228
- - Japanese (j): jf_alpha, jf_gongitsune, jf_nezumi, jf_tebukuro, jm_kumo
229
- - Brazilian Portuguese (p): pt_heart, pt_sun, pt_moon, pt_star, pt_cloud
230
- - Mandarin Chinese (z): zf_xiaobei, zf_xiaoni, zf_xiaoxiao, zf_xiaoyi, zm_yunjian, zm_yunxi, zm_yunxia, zm_yunyang
231
  """
232
  voice_map = {
233
  # American English (a)
@@ -252,7 +74,7 @@ def get_kokoro_voices(language_code):
252
  "i": ["if_sara", "im_nicola"],
253
  # Japanese (j)
254
  "j": ["jf_alpha", "jf_gongitsune", "jf_nezumi", "jf_tebukuro", "jm_kumo"],
255
- # Brazilian Portuguese (p) - not explicitly listed in VOICES.md but keeping from original
256
  "p": ["pt_heart", "pt_sun", "pt_moon", "pt_star", "pt_cloud"],
257
  # Mandarin Chinese (z)
258
  "z": [
@@ -262,386 +84,325 @@ def get_kokoro_voices(language_code):
262
  }
263
  return voice_map.get(language_code, ["af_heart"]) # Default to American English voices
264
 
265
- def generate_kokoro_speech(text, language_code, voice_name):
266
- """
267
- Generate speech from text using Kokoro TTS with selected voice
268
-
269
- Args:
270
- text (str): Text to convert to speech
271
- language_code (str): Language code ('a' for American English, etc.)
272
- voice_name (str): Selected voice name
273
-
274
- Returns:
275
- tuple: (audio_path, error_msg) - path if success, None and error if fail
276
- """
277
- if not text.strip():
278
- return None, "Please enter text to synthesize."
279
 
280
  try:
281
- # Initialize Kokoro pipeline with the selected language code
282
- kokoro_pipeline = KPipeline(lang_code=language_code)
283
-
284
- # Generate speech
285
- audio_chunks = []
286
- for _, _, audio in kokoro_pipeline(text, voice=voice_name):
287
- audio_chunks.append(audio)
288
-
289
- # If we have multiple chunks, concatenate them
290
- if len(audio_chunks) > 1:
291
- final_audio = np.concatenate(audio_chunks)
292
- else:
293
- final_audio = audio_chunks[0] if audio_chunks else np.array([])
294
-
295
- # Save to a temporary file
296
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
297
- sf.write(tmp_file.name, final_audio, 24000) # Kokoro uses 24kHz sample rate
298
- return tmp_file.name, ""
299
  except Exception as e:
300
- return None, f"Error synthesizing speech: {str(e)}"
301
 
302
- # def generate_dia_speech(text, audio_prompt=None):
303
- # """
304
- # Generate speech from text using Dia TTS with optional audio prompt
305
- #
306
- # Args:
307
- # text (str): Text to convert to speech
308
- # audio_prompt (str, optional): Path to reference audio file for voice cloning
309
- #
310
- # Returns:
311
- # str: Path to the generated audio file
312
- # """
313
- # # Initialize Dia model if not already initialized
314
- # global dia_model
315
- # if dia_model is None:
316
- # dia_model = initialize_dia()
317
- #
318
- # # Generate speech using Dia
319
- # return dia_model.generate_to_file(text, audio_prompt)
320
-
321
- def generate_piper_speech(text, lang, voice):
322
- """
323
- Generate speech from text using Piper TTS with selected voice
324
-
325
- Args:
326
- text (str): Text to convert to speech
327
- lang (str): Language ('English' or 'Chinese')
328
- voice (str): Selected voice label
329
-
330
- Returns:
331
- tuple: (audio_path, error_msg) - path if success, None and error if fail
332
- """
333
- if not text.strip():
334
- return None, "Please enter text to synthesize."
335
-
336
- if voice not in voices_by_lang.get(lang, {}):
337
- return None, f"Invalid voice selection for {lang}."
338
-
339
- onnx_path = voices_by_lang[lang][voice]
340
 
341
  try:
342
- piper_voice = PiperVoice.load(onnx_path)
343
- with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
344
- with wave.open(tmp_file.name, "wb") as wav_file:
345
- piper_voice.synthesize_wav(text, wav_file)
346
- return tmp_file.name, ""
347
  except Exception as e:
348
- return None, f"Error synthesizing speech: {str(e)}"
349
-
350
- def update_piper_voices(lang):
351
- choices = list(voices_by_lang.get(lang, {}).keys())
352
- value = choices[0] if choices else None
353
- return gr.update(choices=choices, value=value)
354
 
355
- def generate_faster_whisper_speech(audio_file, beam_size=5, language=None):
356
- """
357
- Transcribe speech from audio file using Faster Whisper
358
-
359
- Args:
360
- audio_file (str): Path to audio file for transcription
361
- beam_size (int): Beam size for transcription (higher = more accurate but slower)
362
- language (str, optional): Language code to force for transcription
363
-
364
- Returns:
365
- tuple: (transcription_text, error_msg) - text if success, empty and error if fail
366
- """
367
- if not audio_file or not os.path.exists(audio_file):
368
- return "", "Please upload an audio file to transcribe."
369
-
370
- if whisper_model is None:
371
- return "", "Faster Whisper model failed to initialize."
372
 
373
  try:
374
- # Set up transcription parameters
375
- transcribe_options = {
376
- "beam_size": beam_size,
377
- "language": language if language else None,
378
- "task": "transcribe"
379
- }
380
-
381
- # Remove None values
382
- transcribe_options = {k: v for k, v in transcribe_options.items() if v is not None}
383
-
384
- # Perform transcription
385
- segments, info = whisper_model.transcribe(audio_file, **transcribe_options)
386
-
387
- # Collect all segments into a single text
388
- result = ""
389
- for segment in segments:
390
- result += segment.text + " "
391
-
392
- # Add language detection info
393
- detected_info = f"\n\nDetected language: {info.language} (probability: {info.language_probability:.2f})"
394
-
395
- return result.strip(), detected_info
396
  except Exception as e:
397
- return "", f"Error transcribing audio: {str(e)}"
398
 
399
- def create_model_card(repo: str) -> str:
400
- """Create a formatted model card with ratings and description."""
401
- display_name = MODELS[repo]
402
- description = MODEL_DESCRIPTIONS.get(repo, "High-quality TTS model")
 
403
 
404
- card_html = f"""
405
- <div class="model-card" style="border: 1px solid #ddd; border-radius: 12px; padding: 20px; margin: 10px 0; background: white;">
406
- <h3 style="color: #2c3e50; margin-top: 0;">🎤 {display_name}</h3>
407
- <p style="color: #34495e; margin: 10px 0;">{description}</p>
408
- </div>
409
- """
410
- return card_html
411
-
412
- # Custom CSS
413
- custom_css = """
414
- .model-card {
415
- background: white;
416
- color: #2c3e50 !important;
417
- border: 1px solid #ddd;
418
- border-radius: 12px;
419
- padding: 20px;
420
- margin: 10px 0;
421
- }
422
- """
423
 
424
- # Create Gradio interface
425
- with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.themes.Soft()) as demo:
426
- gr.HTML("""
427
- <div id="title">
428
- <h1>🎙️ Open-Source Text-to-Speech Model Gallery</h1>
429
- </div>
430
- """)
431
-
432
- gr.HTML("""
433
- <div id="intro-section">
434
- <h3>🔬 Our Exciting Quest</h3>
435
- <p>We're on a mission to help developers quickly find and compare the best open-source TTS models for their audio projects.</p>
436
- </div>
437
- """)
438
 
439
- gr.Markdown("## 🎧 Model Gallery")
440
-
441
- gr.Markdown("### Common Inputs")
442
-
443
- text_input = gr.Textbox(
444
- label="Input Text",
445
- placeholder="Enter text to convert to speech...",
446
- lines=3
447
- )
448
-
449
- audio_prompt = gr.Audio(
450
- label="Reference Voice (Optional)",
451
- type="filepath"
452
- )
453
-
454
- model_info = gr.HTML(create_model_card("ResembleAI/chatterbox"))
455
-
456
- with gr.Row():
457
- with gr.Column():
458
- language_selection = gr.Radio(
459
- choices=["English", "Chinese"],
460
- value="English",
461
- label="Language"
462
- )
463
- generate_btn = gr.Button("Generate Speech")
464
-
465
- with gr.Column():
466
- audio_output = gr.Audio(label="Generated Speech", type="filepath")
467
 
468
- kittentts_model_info = gr.HTML(create_model_card("KittenML/KittenTTS"))
469
 
470
- with gr.Row():
471
- with gr.Column():
472
- kittentts_generate_btn = gr.Button("Generate Speech")
473
-
474
- with gr.Column():
475
- kittentts_audio_output = gr.Audio(label="Generated Speech", type="filepath")
476
 
477
- piper_model_info = gr.HTML(create_model_card("piper-tts"))
 
 
 
 
478
 
479
- with gr.Row():
480
- with gr.Column():
481
- piper_language_selection = gr.Radio(
482
- choices=["English", "Chinese"],
483
- value="English",
484
- label="Language"
485
- )
486
- piper_voice_selection = gr.Dropdown(
487
- choices=list(voices_by_lang["English"].keys()),
488
- value=list(voices_by_lang["English"].keys())[0] if voices_by_lang["English"] else None,
489
- label="Voice"
490
- )
491
- piper_generate_btn = gr.Button("Generate Speech")
492
-
493
- with gr.Column():
494
- piper_audio_output = gr.Audio(label="Generated Speech", type="filepath")
495
- piper_status = gr.Textbox(label="Status", interactive=False)
496
 
497
- # Dia TTS UI (commented out for now)
498
- # dia_model_info = gr.HTML(create_model_card("nari-labs/Dia-1.6B"))
499
-
500
- # with gr.Row():
501
- # with gr.Column():
502
- # dia_text_format = gr.Markdown("""
503
- # **Tip:** For dialogue, use [S1] and [S2] tags. For non-verbal expressions, use (laughs), (sighs), etc.
504
- # Example: [S1] Hello there! (laughs) [S2] Hi, how are you doing today?
505
- # """)
506
- # dia_generate_btn = gr.Button("Generate Speech with Dia")
507
- #
508
- # with gr.Column():
509
- # dia_audio_output = gr.Audio(label="Generated Speech", type="filepath")
510
-
511
- # Faster Whisper section
512
- whisper_model_info = gr.HTML(create_model_card("SYSTRAN/faster-whisper"))
513
-
514
- with gr.Row():
515
- with gr.Column():
516
- whisper_audio_input = gr.Audio(
517
- label="Upload Audio for Transcription",
518
- type="filepath"
519
- )
520
- whisper_beam_size = gr.Slider(
521
- minimum=1,
522
- maximum=10,
523
- value=5,
524
- step=1,
525
- label="Beam Size (higher = more accurate but slower)"
526
- )
527
- whisper_language = gr.Dropdown(
528
- choices=["", "en", "zh", "fr", "de", "ja", "es", "ru", "ko", "it"],
529
- value="",
530
- label="Force Language (optional)"
531
- )
532
- whisper_transcribe_btn = gr.Button("Transcribe Audio")
533
 
534
- with gr.Column():
535
- whisper_text_output = gr.Textbox(
536
- label="Transcription Result",
537
- lines=5,
538
- interactive=False
539
- )
540
- whisper_status = gr.Textbox(
541
- label="Status",
542
- interactive=False
543
- )
544
-
545
- # Kokoro section
546
- kokoro_model_info = gr.HTML(create_model_card("hexgrad/kokoro"))
547
-
548
- with gr.Row():
549
- with gr.Column():
550
- kokoro_language_code = gr.Dropdown(
551
- choices=[
552
- ("American English", "a"),
553
- ("British English", "b"),
554
- ("Spanish", "e"),
555
- ("French", "f"),
556
- ("Hindi", "h"),
557
- ("Italian", "i"),
558
- ("Japanese", "j"),
559
- ("Brazilian Portuguese", "p"),
560
- ("Mandarin Chinese", "z")
561
- ],
562
- value="a",
563
- label="Language"
564
- )
565
- kokoro_voice = gr.Dropdown(
566
- choices=get_kokoro_voices("a"),
567
- value="af_heart",
568
- label="Voice"
569
- )
570
- kokoro_generate_btn = gr.Button("Generate Speech")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
571
 
572
- with gr.Column():
573
- kokoro_audio_output = gr.Audio(label="Generated Speech", type="filepath")
574
- kokoro_status = gr.Textbox(label="Status", interactive=False)
575
-
576
- # Examples for Chatterbox
577
- gr.Examples(
578
- examples=[
579
- ["Hello, welcome to the Chatterbox multilingual demo. This is an English example.", "English", None],
580
- ["你好,欢迎来到Chatterbox多语言演示。这是一个中文示例。", "Chinese", None]
581
- ],
582
- inputs=[text_input, language_selection, audio_prompt],
583
- outputs=audio_output,
584
- fn=generate_chatterbox_speech,
585
- cache_examples=False
586
- )
587
-
588
- # Connect the generate button to the function
589
- generate_btn.click(
590
- fn=generate_chatterbox_speech,
591
- inputs=[text_input, language_selection, audio_prompt],
592
- outputs=audio_output
593
- )
594
-
595
- # VibeVoice button connection removed
596
-
597
- # Connect the KittenTTS generate button to the function
598
- kittentts_generate_btn.click(
599
- fn=generate_kittentts_speech,
600
- inputs=[text_input, audio_prompt],
601
- outputs=kittentts_audio_output
602
- )
603
-
604
- # Connect the Dia TTS generate button to the function (commented out for now)
605
- # dia_generate_btn.click(
606
- # fn=generate_dia_speech,
607
- # inputs=[text_input, audio_prompt],
608
- # outputs=dia_audio_output
609
- # )
610
-
611
- # Connect the Piper generate button to the function
612
- piper_generate_btn.click(
613
- fn=generate_piper_speech,
614
- inputs=[text_input, piper_language_selection, piper_voice_selection],
615
- outputs=[piper_audio_output, piper_status]
616
- )
617
-
618
- # Connect the Faster Whisper transcribe button to the function
619
- whisper_transcribe_btn.click(
620
- fn=generate_faster_whisper_speech,
621
- inputs=[whisper_audio_input, whisper_beam_size, whisper_language],
622
- outputs=[whisper_text_output, whisper_status]
623
- )
624
-
625
- # Connect the Kokoro UI components to the generation function
626
- kokoro_generate_btn.click(
627
- fn=generate_kokoro_speech,
628
- inputs=[text_input, kokoro_language_code, kokoro_voice],
629
- outputs=[kokoro_audio_output, kokoro_status]
630
- )
631
-
632
- # Update voice dropdown when language changes
633
- piper_language_selection.change(
634
- fn=update_piper_voices,
635
- inputs=[piper_language_selection],
636
- outputs=[piper_voice_selection]
637
- )
638
 
639
- # Update Kokoro voice dropdown when language changes
640
- kokoro_language_code.change(
641
- fn=lambda lang: gr.update(choices=get_kokoro_voices(lang), value=get_kokoro_voices(lang)[0] if get_kokoro_voices(lang) else None),
642
- inputs=[kokoro_language_code],
643
- outputs=[kokoro_voice]
644
- )
645
 
 
646
  if __name__ == "__main__":
647
- demo.launch(ssr_mode=False)
 
 
1
  import gradio as gr
 
2
  import torch
3
  import tempfile
4
  import os
 
 
5
  import numpy as np
 
 
 
 
 
6
  import soundfile as sf
 
 
 
 
 
7
 
8
+ # Import our model factory
9
+ from src.models.factory import ModelFactory
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
10
 
11
+ # Patch torch.load to always use CPU
12
  original_torch_load = torch.load
13
 
14
  def patched_torch_load(f, map_location=None, **kwargs):
 
18
 
19
  torch.load = patched_torch_load
20
 
21
+ # Get model descriptions
22
+ MODEL_DESCRIPTIONS = ModelFactory.get_model_descriptions()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
23
 
24
# Models dictionary for UI display: maps Hugging Face repo id -> display name.
MODELS = {
    "ResembleAI/chatterbox": "Chatterbox",
    "KittenML/KittenTTS": "KittenTTS",
    "piper-tts": "Piper (no voice cloning)",
    "SYSTRAN/faster-whisper": "Faster Whisper",
    "hexgrad/kokoro": "Kokoro-82M",
    "nari-labs/Dia-1.6B": "Dia TTS",
}

# Initialize model instances (factory returns dicts keyed by the repo ids above)
tts_models = ModelFactory.get_tts_models()
stt_models = ModelFactory.get_stt_models()

# Initialize the models that need immediate initialization at import time;
# the remaining TTS models are initialized lazily by their UI functions.
for model_name in ["ResembleAI/chatterbox", "KittenML/KittenTTS"]:
    if model_name in tts_models:
        tts_models[model_name].initialize()

# Initialize the STT model (may be absent if the factory did not register it)
whisper_model = stt_models.get("SYSTRAN/faster-whisper")
if whisper_model:
    whisper_model.initialize()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
+ # Helper function to get Kokoro voices
49
  def get_kokoro_voices(language_code):
50
  """
51
  Get available voices for a specific Kokoro language code
52
  Based on: https://huggingface.co/hexgrad/Kokoro-82M/blob/main/VOICES.md
 
 
 
 
 
 
 
 
 
 
 
53
  """
54
  voice_map = {
55
  # American English (a)
 
74
  "i": ["if_sara", "im_nicola"],
75
  # Japanese (j)
76
  "j": ["jf_alpha", "jf_gongitsune", "jf_nezumi", "jf_tebukuro", "jm_kumo"],
77
+ # Brazilian Portuguese (p)
78
  "p": ["pt_heart", "pt_sun", "pt_moon", "pt_star", "pt_cloud"],
79
  # Mandarin Chinese (z)
80
  "z": [
 
84
  }
85
  return voice_map.get(language_code, ["af_heart"]) # Default to American English voices
86
 
87
+ # UI Functions for TTS Models
88
+
89
+ def tts_chatterbox(text, language, audio_prompt=None):
90
+ """UI function for Chatterbox TTS"""
91
+ model = tts_models.get("ResembleAI/chatterbox")
92
+ if not model:
93
+ return None, "Model not available"
 
 
 
 
 
 
 
94
 
95
  try:
96
+ audio_path = model.generate_speech(text, language=language, audio_prompt=audio_prompt)
97
+ return audio_path, ""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
98
  except Exception as e:
99
+ return None, f"Error: {str(e)}"
100
 
101
def tts_kittentts(text, audio_prompt=None):
    """UI function for KittenTTS: returns (audio_path, error_message)."""
    kitten = tts_models.get("KittenML/KittenTTS")
    if not kitten:
        return None, "Model not available"

    try:
        return kitten.generate_speech(text, audio_prompt=audio_prompt), ""
    except Exception as exc:
        return None, f"Error: {str(exc)}"
 
 
 
 
 
112
 
113
def tts_piper(text, language, voice):
    """UI function for Piper TTS: returns (audio_path, error_message)."""
    piper = tts_models.get("piper-tts")
    if not piper:
        return None, "Model not available"

    try:
        # Initialization scans the on-disk voice directory before synthesis.
        piper.initialize()
        return piper.generate_speech(text, language=language, voice=voice), ""
    except Exception as exc:
        return None, f"Error: {str(exc)}"
125
 
126
def tts_kokoro(text, language_code, voice_name):
    """UI function for Kokoro TTS.

    Args:
        text (str): Text to synthesize.
        language_code (str): Kokoro language code ('a', 'b', 'z', ...).
        voice_name (str): Voice identifier, e.g. 'af_heart'
            (one of the values from get_kokoro_voices()).

    Returns:
        tuple: (audio_path, error_message) — path on success, None + error on failure.
    """
    model = tts_models.get("hexgrad/kokoro")
    if not model:
        return None, "Model not available"

    try:
        # Fix: voice_name was previously accepted but never forwarded, so the
        # UI's voice dropdown had no effect; pass it through to the model.
        audio_path = model.generate_speech(text, lang_code=language_code, voice=voice_name)
        return audio_path, ""
    except Exception as e:
        return None, f"Error: {str(e)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
137
 
138
def tts_dia(text, audio_prompt=None):
    """UI function for Dia TTS: returns (audio_path, error_message)."""
    dia = tts_models.get("nari-labs/Dia-1.6B")
    if not dia:
        return None, "Model not available"

    try:
        # Dia is lazily loaded: initialize on first use.
        dia.initialize()
        return dia.generate_speech(text, audio_prompt=audio_prompt), ""
    except Exception as exc:
        return None, f"Error: {str(exc)}"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
150
 
151
+ # UI Function for STT Model
152
 
153
def stt_whisper(audio_path, language=None):
    """UI function for Faster Whisper STT: returns the transcription or an error string."""
    whisper = stt_models.get("SYSTRAN/faster-whisper")
    if not whisper:
        return "Model not available"

    try:
        # language=None lets the model auto-detect the spoken language.
        return whisper.transcribe(audio_path, language=language)
    except Exception as exc:
        return f"Error: {str(exc)}"
164
 
165
+ # Gradio UI Components
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
def create_tts_tab():
    """Create the TTS tab for the Gradio interface.

    Builds one sub-tab per TTS model (Chatterbox, KittenTTS, Piper, Kokoro,
    Dia) and wires each submit button to the matching tts_* UI function.

    Fix: the dropdown-refresh callbacks previously called
    ``gr.Dropdown.update(...)``, which was removed in Gradio 4; they now use
    ``gr.update(...)`` (the form the pre-refactor code already used).
    """
    with gr.Tab("Text-to-Speech"):
        gr.Markdown("## Text-to-Speech Models")

        with gr.Tabs():
            # Chatterbox Tab
            with gr.Tab("Chatterbox"):
                with gr.Row():
                    with gr.Column():
                        chatterbox_text = gr.Textbox(
                            label="Text to speak",
                            placeholder="Enter text here...",
                            lines=5
                        )
                        chatterbox_language = gr.Dropdown(
                            choices=["English", "Chinese"],
                            value="English",
                            label="Language"
                        )
                        chatterbox_audio_prompt = gr.Audio(
                            label="Voice reference (optional)",
                            type="filepath"
                        )
                        chatterbox_submit = gr.Button("Generate Speech")

                    with gr.Column():
                        chatterbox_output = gr.Audio(label="Generated Speech")
                        chatterbox_error = gr.Textbox(label="Error", visible=False)

                chatterbox_submit.click(
                    tts_chatterbox,
                    inputs=[chatterbox_text, chatterbox_language, chatterbox_audio_prompt],
                    outputs=[chatterbox_output, chatterbox_error]
                )

            # KittenTTS Tab
            with gr.Tab("KittenTTS"):
                with gr.Row():
                    with gr.Column():
                        kittentts_text = gr.Textbox(
                            label="Text to speak",
                            placeholder="Enter text here...",
                            lines=5
                        )
                        kittentts_audio_prompt = gr.Audio(
                            label="Voice reference (optional)",
                            type="filepath"
                        )
                        kittentts_submit = gr.Button("Generate Speech")

                    with gr.Column():
                        kittentts_output = gr.Audio(label="Generated Speech")
                        kittentts_error = gr.Textbox(label="Error", visible=False)

                kittentts_submit.click(
                    tts_kittentts,
                    inputs=[kittentts_text, kittentts_audio_prompt],
                    outputs=[kittentts_output, kittentts_error]
                )

            # Piper Tab
            with gr.Tab("Piper"):
                with gr.Row():
                    with gr.Column():
                        piper_text = gr.Textbox(
                            label="Text to speak",
                            placeholder="Enter text here...",
                            lines=5
                        )

                        # Initialize Piper at UI-build time so the language
                        # list reflects the voices found on disk.
                        piper_model = tts_models.get("piper-tts")
                        if piper_model:
                            piper_model.initialize()
                            languages = piper_model.get_supported_languages()
                        else:
                            languages = ["English"]

                        piper_language = gr.Dropdown(
                            choices=languages,
                            value="English",
                            label="Language"
                        )

                        def update_piper_voices(language):
                            # gr.update (not gr.Dropdown.update, removed in
                            # Gradio 4) refreshes the voice choices in place.
                            if piper_model:
                                voices = piper_model.get_available_voices(language)
                                return gr.update(choices=voices, value=voices[0] if voices else None)
                            return gr.update(choices=[], value=None)

                        piper_voice = gr.Dropdown(
                            label="Voice",
                            choices=[]
                        )

                        piper_language.change(
                            update_piper_voices,
                            inputs=[piper_language],
                            outputs=[piper_voice]
                        )

                        piper_submit = gr.Button("Generate Speech")

                    with gr.Column():
                        piper_output = gr.Audio(label="Generated Speech")
                        piper_error = gr.Textbox(label="Error", visible=False)

                piper_submit.click(
                    tts_piper,
                    inputs=[piper_text, piper_language, piper_voice],
                    outputs=[piper_output, piper_error]
                )

            # Kokoro Tab
            with gr.Tab("Kokoro"):
                with gr.Row():
                    with gr.Column():
                        kokoro_text = gr.Textbox(
                            label="Text to speak",
                            placeholder="Enter text here...",
                            lines=5
                        )

                        kokoro_language = gr.Dropdown(
                            choices=[
                                "American English (a)", "British English (b)",
                                "Spanish (e)", "French (f)", "Hindi (h)",
                                "Italian (i)", "Japanese (j)",
                                "Brazilian Portuguese (p)", "Mandarin Chinese (z)"
                            ],
                            value="American English (a)",
                            label="Language"
                        )

                        def get_lang_code(language):
                            # Extract the single-letter code from a label
                            # such as "American English (a)".
                            return language.split("(")[-1].split(")")[0].strip()

                        def update_kokoro_voices(language):
                            lang_code = get_lang_code(language)
                            voices = get_kokoro_voices(lang_code)
                            # gr.update instead of the removed gr.Dropdown.update
                            return gr.update(choices=voices, value=voices[0] if voices else None)

                        kokoro_voice = gr.Dropdown(
                            label="Voice",
                            choices=get_kokoro_voices("a"),
                            value="af_heart"
                        )

                        kokoro_language.change(
                            update_kokoro_voices,
                            inputs=[kokoro_language],
                            outputs=[kokoro_voice]
                        )

                        kokoro_submit = gr.Button("Generate Speech")

                    with gr.Column():
                        kokoro_output = gr.Audio(label="Generated Speech")
                        kokoro_error = gr.Textbox(label="Error", visible=False)

                kokoro_submit.click(
                    lambda text, lang, voice: tts_kokoro(text, get_lang_code(lang), voice),
                    inputs=[kokoro_text, kokoro_language, kokoro_voice],
                    outputs=[kokoro_output, kokoro_error]
                )

            # Dia Tab
            with gr.Tab("Dia"):
                with gr.Row():
                    with gr.Column():
                        dia_text = gr.Textbox(
                            label="Text to speak",
                            placeholder="Enter text here...",
                            lines=5
                        )
                        dia_audio_prompt = gr.Audio(
                            label="Voice reference (optional)",
                            type="filepath"
                        )
                        dia_submit = gr.Button("Generate Speech")

                    with gr.Column():
                        dia_output = gr.Audio(label="Generated Speech")
                        dia_error = gr.Textbox(label="Error", visible=False)

                dia_submit.click(
                    tts_dia,
                    inputs=[dia_text, dia_audio_prompt],
                    outputs=[dia_output, dia_error]
                )
358
+
359
def create_stt_tab():
    """Create the STT tab for the Gradio interface (Faster Whisper only)."""
    with gr.Tab("Speech-to-Text"):
        gr.Markdown("## Speech-to-Text Models")

        with gr.Tabs():
            # Faster Whisper Tab
            with gr.Tab("Faster Whisper"):
                with gr.Row():
                    with gr.Column():
                        whisper_audio = gr.Audio(
                            label="Audio to transcribe",
                            type="filepath"
                        )
                        # NOTE(review): choices are full language names
                        # ("English"); confirm the factory model maps these
                        # to Whisper language codes ("en") internally.
                        whisper_language = gr.Dropdown(
                            choices=["Auto-detect", "English", "Chinese", "Spanish", "French", "German", "Japanese"],
                            value="Auto-detect",
                            label="Language (optional)"
                        )
                        whisper_submit = gr.Button("Transcribe")

                    with gr.Column():
                        whisper_output = gr.Textbox(
                            label="Transcription",
                            lines=5
                        )

                # Map the "Auto-detect" sentinel to None so the model
                # performs automatic language detection.
                whisper_submit.click(
                    lambda audio, lang: stt_whisper(audio, None if lang == "Auto-detect" else lang),
                    inputs=[whisper_audio, whisper_language],
                    outputs=[whisper_output]
                )
391
+
392
+ # Create the Gradio interface
393
def create_interface():
    """Assemble the top-level Gradio Blocks app and return it."""
    with gr.Blocks(title="TTS & STT Gallery") as app:
        gr.Markdown("# TTS & STT Model Gallery")
        gr.Markdown("Explore different Text-to-Speech and Speech-to-Text models")

        with gr.Tabs():
            # One top-level tab per task family
            create_tts_tab()
            create_stt_tab()

    return app
 
 
 
 
 
404
 
405
# Launch the app when run as a script
if __name__ == "__main__":
    create_interface().launch()
src/models/__init__.py ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ from .base import TTSModel, STTModel
2
+ from .factory import ModelFactory
3
+
4
+ __all__ = ['TTSModel', 'STTModel', 'ModelFactory']
src/models/base.py ADDED
@@ -0,0 +1,77 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
from abc import ABC, abstractmethod


class BaseModel(ABC):
    """Common interface shared by every TTS/STT model wrapper.

    Concrete subclasses expose a human-readable ``name``/``description``
    and perform (lazy) loading in ``initialize``.
    """

    @property
    @abstractmethod
    def name(self):
        """Return the unique name of the model (e.g. its hub identifier)."""

    @property
    @abstractmethod
    def description(self):
        """Return a short human-readable description of the model."""

    @abstractmethod
    def initialize(self):
        """Load the underlying model; return True on success, False on failure."""


class TTSModel(BaseModel):
    """Abstract base class for Text-to-Speech models."""

    @abstractmethod
    def generate_speech(self, text, **kwargs):
        """
        Generate speech from text.

        Args:
            text (str): Text to convert to speech
            **kwargs: Additional model-specific parameters

        Returns:
            str: Path to the generated audio file
        """

    def supports_voice_cloning(self):
        """Whether the model supports voice cloning (default: no)."""
        return False

    def supports_multilingual(self):
        """Whether the model supports multiple languages (default: no)."""
        return False

    def get_supported_languages(self):
        """Get the list of supported language names (default: English only)."""
        return ["English"]


class STTModel(BaseModel):
    """Abstract base class for Speech-to-Text models."""

    @abstractmethod
    def transcribe(self, audio_path, **kwargs):
        """
        Transcribe speech to text.

        Args:
            audio_path (str): Path to the audio file
            **kwargs: Additional model-specific parameters

        Returns:
            str: Transcribed text
        """

    def supports_multilingual(self):
        """Whether the model supports multiple languages (default: no)."""
        return False

    def get_supported_languages(self):
        """Get the list of supported language names (default: English only)."""
        return ["English"]
src/models/factory.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .tts.chatterbox_model import ChatterboxTTSModel
2
+ from .tts.kitten_model import KittenTTSModel
3
+ from .tts.piper_model import PiperTTSModel
4
+ from .tts.kokoro_model import KokoroTTSModel
5
+ from .tts.dia_model import DiaTTSModel
6
+ from .stt.whisper_model import FasterWhisperSTTModel
7
+
8
+ class ModelFactory:
9
+ """Factory class for creating model instances"""
10
+
11
+ @staticmethod
12
+ def get_tts_models():
13
+ """Get all available TTS models"""
14
+ return {
15
+ "ResembleAI/chatterbox": ChatterboxTTSModel(),
16
+ "KittenML/KittenTTS": KittenTTSModel(),
17
+ "piper-tts": PiperTTSModel(),
18
+ "hexgrad/kokoro": KokoroTTSModel(),
19
+ "nari-labs/Dia-1.6B": DiaTTSModel()
20
+ }
21
+
22
+ @staticmethod
23
+ def get_stt_models():
24
+ """Get all available STT models"""
25
+ return {
26
+ "SYSTRAN/faster-whisper": FasterWhisperSTTModel()
27
+ }
28
+
29
+ @staticmethod
30
+ def get_tts_model(model_name):
31
+ """Get a specific TTS model by name"""
32
+ models = ModelFactory.get_tts_models()
33
+ return models.get(model_name)
34
+
35
+ @staticmethod
36
+ def get_stt_model(model_name):
37
+ """Get a specific STT model by name"""
38
+ models = ModelFactory.get_stt_models()
39
+ return models.get(model_name)
40
+
41
+ @staticmethod
42
+ def get_model_descriptions():
43
+ """Get descriptions for all models"""
44
+ descriptions = {}
45
+
46
+ # Add TTS model descriptions
47
+ for model_name, model in ModelFactory.get_tts_models().items():
48
+ descriptions[model_name] = model.description
49
+
50
+ # Add STT model descriptions
51
+ for model_name, model in ModelFactory.get_stt_models().items():
52
+ descriptions[model_name] = model.description
53
+
54
+ return descriptions
src/models/stt/__init__.py ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ from .whisper_model import FasterWhisperSTTModel
2
+
3
+ __all__ = ['FasterWhisperSTTModel']
src/models/stt/whisper_model.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ from faster_whisper import WhisperModel
3
+ from ..base import STTModel
4
+
5
class FasterWhisperSTTModel(STTModel):
    """Speech-to-text wrapper around SYSTRAN's faster-whisper (CTranslate2)."""

    def __init__(self):
        self._model = None
        self._initialized = False
        # Checkpoint used on first load; a "small" fallback is tried on failure.
        self._model_size = "large-v3"

    @property
    def name(self):
        return "SYSTRAN/faster-whisper"

    @property
    def description(self):
        return "Faster Whisper transcription with CTranslate2, up to 4x faster than OpenAI Whisper"

    def initialize(self):
        """Load the Whisper weights, picking device/precision for this machine.

        Returns:
            bool: True once a model (primary or fallback) is loaded, else False.
        """
        if self._initialized:
            return True

        try:
            if torch.cuda.is_available():
                self._model = WhisperModel(self._model_size, device="cuda", compute_type="float16")
                print("Loaded faster-whisper on CUDA with FP16")
            elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
                # MPS (Apple Silicon) has no direct CTranslate2 backend, so we
                # still run on CPU with INT8 there.
                self._model = WhisperModel(self._model_size, device="cpu", compute_type="int8")
                print("Loaded faster-whisper on CPU with INT8 (MPS not directly supported)")
            else:
                self._model = WhisperModel(self._model_size, device="cpu", compute_type="int8")
                print("Loaded faster-whisper on CPU with INT8")
        except Exception as e:
            print(f"Error initializing Faster Whisper model: {str(e)}")
            print("Falling back to small model with INT8 quantization")
            try:
                self._model = WhisperModel("small", device="cpu", compute_type="int8")
            except Exception as e2:
                print(f"Failed to load fallback model: {str(e2)}")
                return False

        self._initialized = True
        return True

    def transcribe(self, audio_path, language=None, **kwargs):
        """
        Transcribe speech to text.

        Args:
            audio_path (str): Path to the audio file
            language (str, optional): Language code for transcription
            **kwargs: Additional parameters for transcription

        Returns:
            str: Transcribed text

        Raises:
            RuntimeError: if the model cannot be loaded.
        """
        if not self._initialized and not self.initialize():
            raise RuntimeError("Failed to initialize Faster Whisper model")

        # Defaults first; any caller-supplied kwargs win.
        options = {
            "beam_size": 5,
            "language": language,
            "task": "transcribe",
        }
        options.update(kwargs)

        # faster-whisper yields segments lazily; join them into one string.
        segments, _info = self._model.transcribe(audio_path, **options)
        return " ".join(segment.text for segment in segments).strip()

    def supports_multilingual(self):
        return True

    def get_supported_languages(self):
        # Whisper supports many languages, but we'll return a subset of common ones
        return [
            "English", "Spanish", "French", "German", "Chinese", "Japanese",
            "Russian", "Portuguese", "Italian", "Dutch", "Arabic", "Korean"
        ]
src/models/tts/__init__.py ADDED
@@ -0,0 +1,13 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from .chatterbox_model import ChatterboxTTSModel
2
+ from .kitten_model import KittenTTSModel
3
+ from .piper_model import PiperTTSModel
4
+ from .kokoro_model import KokoroTTSModel
5
+ from .dia_model import DiaTTSModel
6
+
7
+ __all__ = [
8
+ 'ChatterboxTTSModel',
9
+ 'KittenTTSModel',
10
+ 'PiperTTSModel',
11
+ 'KokoroTTSModel',
12
+ 'DiaTTSModel'
13
+ ]
src/models/tts/chatterbox_model.py ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch
2
+ import torchaudio as ta
3
+ import tempfile
4
+ import os
5
+ from chatterbox.mtl_tts import ChatterboxMultilingualTTS
6
+ from ..base import TTSModel
7
+
8
class ChatterboxTTSModel(TTSModel):
    """Chatterbox multilingual TTS model implementation."""

    def __init__(self):
        self._model = None
        self._initialized = False

    @property
    def name(self):
        return "ResembleAI/chatterbox"

    @property
    def description(self):
        return "Industrial-grade TTS solution with multilingual support"

    def initialize(self):
        """Load Chatterbox on GPU when available, falling back to CPU.

        Returns:
            bool: True on success, False on any failure. The other wrappers
            never raise from initialize(); previously this method caught only
            RuntimeError (other exceptions escaped) and an error during the
            CPU fallback load also propagated — both now return False.
        """
        if self._initialized:
            return True

        try:
            self._model = ChatterboxMultilingualTTS.from_pretrained(
                device="cuda" if torch.cuda.is_available() else "cpu"
            )
        except Exception as e:
            # Checkpoints serialized on a GPU raise this when loaded without
            # CUDA; retry explicitly on CPU before giving up.
            if "Attempting to deserialize object on a CUDA device" in str(e):
                print("CUDA model detected but CUDA is not available. Loading model on CPU...")
                try:
                    self._model = ChatterboxMultilingualTTS.from_pretrained(device="cpu")
                except Exception as e2:
                    print(f"Error initializing Chatterbox model: {e2}")
                    return False
            else:
                print(f"Error initializing Chatterbox model: {e}")
                return False

        self._initialized = True
        return True

    def generate_speech(self, text, language="English", audio_prompt=None, **kwargs):
        """
        Generate speech from text using Chatterbox multilingual TTS.

        Args:
            text (str): Text to convert to speech
            language (str): Language name ('English' or 'Chinese')
            audio_prompt (str, optional): Path to reference audio file for voice cloning
            **kwargs: Additional parameters for generation

        Returns:
            str: Path to the generated audio file

        Raises:
            RuntimeError: if the model cannot be loaded.
        """
        if not self._initialized:
            if not self.initialize():
                raise RuntimeError("Failed to initialize Chatterbox model")

        # Map UI language names to Chatterbox language codes; unknown names
        # default to English.
        language_map = {
            "English": "en",
            "Chinese": "zh",
        }
        language_id = language_map.get(language, "en")

        # Defaults first; any caller-supplied kwargs win.
        generate_kwargs = {
            "exaggeration": 0.5,
            "temperature": 0.8,
            "cfg_weight": 0.3,
        }
        generate_kwargs.update(kwargs)

        if audio_prompt and os.path.exists(audio_prompt):
            # Use the reference audio for voice cloning.
            wav = self._model.generate(text, language_id=language_id,
                                       audio_prompt_path=audio_prompt, **generate_kwargs)
        else:
            # No prompt: generate with the default voice.
            wav = self._model.generate(text, language_id=language_id, **generate_kwargs)

        # delete=False so the file survives the context manager; Gradio serves
        # the audio from this path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            ta.save(tmp_file.name, wav, self._model.sr)
            return tmp_file.name

    def supports_voice_cloning(self):
        return True

    def supports_multilingual(self):
        return True

    def get_supported_languages(self):
        return ["English", "Chinese"]
src/models/tts/dia_model.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import os
3
+ from ..base import TTSModel
4
+
5
class DiaTTSModel(TTSModel):
    """Wrapper around the project's DiaTTS dialogue-generation backend."""

    def __init__(self):
        self._model = None
        self._initialized = False

    @property
    def name(self):
        return "nari-labs/Dia-1.6B"

    @property
    def description(self):
        return "Ultra-realistic dialogue generation with support for voice cloning and non-verbal expressions"

    def initialize(self):
        """Lazily construct the DiaTTS backend; returns True on success."""
        if self._initialized:
            return True

        try:
            # Import here to avoid circular imports
            from src.dia_tts import DiaTTS
            self._model = DiaTTS()
        except Exception as e:
            print(f"Error initializing Dia model: {e}")
            return False

        self._initialized = True
        return True

    def generate_speech(self, text, audio_prompt=None, **kwargs):
        """
        Generate speech from text using Dia TTS.

        Args:
            text (str): Text to convert to speech
            audio_prompt (str, optional): Path to reference audio file for voice cloning
            **kwargs: Additional parameters for generation

        Returns:
            str: Path to the generated audio file

        Raises:
            RuntimeError: if the backend cannot be loaded.
        """
        if not self._initialized and not self.initialize():
            raise RuntimeError("Failed to initialize Dia model")

        # DiaTTS writes the audio itself and hands back the file path.
        return self._model.generate(text, reference_audio=audio_prompt, **kwargs)

    def supports_voice_cloning(self):
        return True
src/models/tts/kitten_model.py ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import os
3
+ import soundfile as sf
4
+ import numpy as np
5
+ from kittentts import KittenTTS
6
+ from ..base import TTSModel
7
+
8
class KittenTTSModel(TTSModel):
    """KittenTTS model implementation.

    Wraps the KittenML nano checkpoint; the model is loaded lazily on the
    first generate_speech()/initialize() call.
    """

    def __init__(self):
        # Underlying KittenTTS instance, created in initialize().
        self._model = None
        # Guards against re-loading the checkpoint on every call.
        self._initialized = False
        # Hugging Face identifier of the checkpoint to load.
        self._model_path = "KittenML/kitten-tts-nano-0.2"

    @property
    def name(self):
        return "KittenML/KittenTTS"

    @property
    def description(self):
        return "High-quality TTS with voice cloning capabilities using reference audio"

    def initialize(self):
        """Initialize the KittenTTS model.

        Returns:
            bool: True if the model is loaded (or already was), False on error.
        """
        if self._initialized:
            return True

        try:
            self._model = KittenTTS(self._model_path)
            self._initialized = True
            return True
        except Exception as e:
            # Loading can fail e.g. when the checkpoint cannot be fetched;
            # callers treat False as "model unavailable".
            print(f"Error initializing KittenTTS model: {e}")
            return False

    def generate_speech(self, text, audio_prompt=None, **kwargs):
        """
        Generate speech from text using KittenTTS

        Args:
            text (str): Text to convert to speech
            audio_prompt (str, optional): Path to reference audio file for voice cloning
            **kwargs: Additional parameters for generation

        Returns:
            str: Path to the generated audio file
        """
        if not self._initialized:
            if not self.initialize():
                raise RuntimeError("Failed to initialize KittenTTS model")

        # Generate speech using KittenTTS
        # NOTE(review): relies on the installed kittentts exposing
        # generate_with_voice(text, path), generate(text) returning an array
        # sf.write can consume, and a .sample_rate attribute — confirm against
        # the pinned kittentts version.
        if audio_prompt and os.path.exists(audio_prompt):
            # Use audio prompt for voice cloning
            audio_array = self._model.generate_with_voice(text, audio_prompt)
        else:
            # Generate with default voice
            audio_array = self._model.generate(text)

        # Save to a temporary file
        # delete=False keeps the file on disk so Gradio can serve it afterwards.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            sf.write(tmp_file.name, audio_array, self._model.sample_rate)
            return tmp_file.name

    def supports_voice_cloning(self):
        return True
src/models/tts/kokoro_model.py ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import tempfile
2
+ import os
3
+ from kokoro import KPipeline
4
+ from ..base import TTSModel
5
+
6
class KokoroTTSModel(TTSModel):
    """Kokoro TTS model implementation.

    Wraps hexgrad's Kokoro KPipeline; the pipeline is rebuilt whenever the
    requested language code changes.
    """

    def __init__(self):
        # Underlying KPipeline instance, created in initialize().
        self._model = None
        self._initialized = False
        # 'a' = American English, 'b' = British English (see get_language_codes()).
        self._lang_code = 'a'  # Default to American English

    @property
    def name(self):
        return "hexgrad/kokoro"

    @property
    def description(self):
        return "Lightweight TTS model with 82M parameters, Apache-licensed for production and personal use"

    def initialize(self):
        """Initialize the Kokoro model.

        Returns:
            bool: True if the pipeline is ready, False on error.
        """
        if self._initialized:
            return True

        try:
            self._model = KPipeline(lang_code=self._lang_code)
            self._initialized = True
            return True
        except Exception as e:
            print(f"Error initializing Kokoro model: {e}")
            return False

    def generate_speech(self, text, lang_code=None, **kwargs):
        """
        Generate speech from text using Kokoro TTS

        Args:
            text (str): Text to convert to speech
            lang_code (str, optional): Language code ('a' for American English, 'b' for British English)
            **kwargs: Additional parameters for generation

        Returns:
            str: Path to the generated audio file
        """
        # Update language code if provided; switching languages needs a fresh
        # KPipeline, hence the forced re-initialization below.
        if lang_code and lang_code != self._lang_code:
            self._lang_code = lang_code
            self._initialized = False

        if not self._initialized:
            if not self.initialize():
                raise RuntimeError("Failed to initialize Kokoro model")

        # Generate speech
        # NOTE(review): KPipeline is normally used as a callable generator
        # (e.g. `for _, _, audio in pipeline(text, voice=...)`); confirm the
        # installed kokoro version actually provides tts_to_file(), otherwise
        # this call raises AttributeError at runtime.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            self._model.tts_to_file(text, tmp_file.name)
            return tmp_file.name

    def get_supported_languages(self):
        return ["American English", "British English"]

    def get_language_codes(self):
        """Get mapping of language names to language codes"""
        return {
            "American English": "a",
            "British English": "b"
        }
src/models/tts/piper_model.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import tempfile
3
+ from piper import PiperVoice
4
+ from ..base import TTSModel
5
+
6
class PiperTTSModel(TTSModel):
    """Piper TTS model implementation (local on-device ONNX voices)."""

    def __init__(self):
        self._voices_by_lang = None
        self._initialized = False
        # model path -> loaded PiperVoice. Previously a fresh PiperVoice was
        # constructed (ONNX model re-loaded from disk) on every request.
        self._voice_cache = {}

    @property
    def name(self):
        return "piper-tts"

    @property
    def description(self):
        return "Local on-device TTS with dynamic English and Chinese voice selection from Piper models"

    def initialize(self):
        """Initialize the Piper model by scanning available voices.

        Returns:
            bool: True on success, False if the voice scan fails.
        """
        if self._initialized:
            return True

        try:
            self._voices_by_lang = self._scan_piper_voices()
            self._initialized = True
            return True
        except Exception as e:
            print(f"Error initializing Piper model: {e}")
            return False

    def _scan_piper_voices(self):
        """Scan the on-disk voice tree and return {language: {label: onnx path}}."""
        voices_dir = "src/voices/piper_voices"
        voices_by_lang = {'English': {}, 'Chinese': {}}

        # Chinese: only huayan medium
        chinese_path = os.path.join(voices_dir, "zh", "zh_CN", "huayan", "medium", "zh_CN-huayan-medium.onnx")
        if os.path.exists(chinese_path):
            voices_by_lang['Chinese']['huayan (zh_CN)'] = chinese_path

        # English voices live under <voices_dir>/en/<locale>/<voice>/<quality>/
        en_dir = os.path.join(voices_dir, "en")
        for root, dirs, files in os.walk(en_dir):
            # Hoisted: split once per directory (was computed twice per iteration).
            parts = root.split(os.sep)
            if len(parts) < 5 or parts[-1] not in ('medium', 'high'):
                continue
            locale = parts[-3]      # en_GB or en_US
            voice_name = parts[-2]  # alan, etc.
            quality = parts[-1]     # medium or high

            for file in files:
                if file.endswith('.onnx') and f"{locale}-{voice_name}-{quality}" in file:
                    label = f"{voice_name} ({locale})"
                    # Prefer medium over high
                    if quality == 'medium' or label not in voices_by_lang['English']:
                        voices_by_lang['English'][label] = os.path.join(root, file)
                    break  # Assume one .onnx per dir

        return voices_by_lang

    def _load_voice(self, model_path):
        """Return a cached PiperVoice for model_path, loading it on first use."""
        voice = self._voice_cache.get(model_path)
        if voice is None:
            # NOTE(review): assumes PiperVoice(model_path=...) is the right
            # constructor and synthesize(text, path) writes a WAV — confirm
            # against the installed piper-tts version (newer releases use
            # PiperVoice.load(path) and synthesize into an open wave file).
            voice = PiperVoice(model_path=model_path)
            self._voice_cache[model_path] = voice
        return voice

    def generate_speech(self, text, language="English", voice=None, **kwargs):
        """
        Generate speech from text using Piper TTS.

        Args:
            text (str): Text to convert to speech
            language (str): Language name ('English' or 'Chinese')
            voice (str, optional): Voice name to use
            **kwargs: Additional parameters for generation

        Returns:
            str: Path to the generated audio file

        Raises:
            RuntimeError: if the voice scan failed.
            ValueError: if no voices are installed for `language`.
        """
        if not self._initialized:
            if not self.initialize():
                raise RuntimeError("Failed to initialize Piper model")

        # Get available voices for the selected language
        available_voices = self._voices_by_lang.get(language, {})
        if not available_voices:
            raise ValueError(f"No voices available for language: {language}")

        # Fall back to the first available voice when none (or an unknown one)
        # was requested.
        if not voice or voice not in available_voices:
            voice = next(iter(available_voices))

        # Reuse a cached PiperVoice for this model path.
        piper_voice = self._load_voice(available_voices[voice])

        # delete=False so the file survives the context manager; Gradio serves
        # the audio from this path.
        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp_file:
            piper_voice.synthesize(text, tmp_file.name)
            return tmp_file.name

    def supports_multilingual(self):
        return True

    def get_supported_languages(self):
        if not self._initialized:
            self.initialize()
        return list(self._voices_by_lang.keys())

    def get_available_voices(self, language="English"):
        """Get available voices for a specific language"""
        if not self._initialized:
            self.initialize()
        return list(self._voices_by_lang.get(language, {}).keys())