Spaces:

DroolingPanda
/

tts_gallery

Build error

App Files Files Community

Michael Hu committed on Sep 29, 2025

Commit

b68dcac

1 Parent(s): 8829e6c

feat: add support for Dia-1.6B TTS model in TTS Gallery

Browse files

Files changed (3) hide show

app.py +55 -0
requirements.txt +4 -2
src/dia_tts.py +82 -0

app.py CHANGED Viewed

@@ -16,6 +16,7 @@ import wave
 import os
 from faster_whisper import WhisperModel
 from kokoro import KPipeline
 # Model descriptions for better understanding
 MODEL_DESCRIPTIONS = {
@@ -24,6 +25,7 @@ MODEL_DESCRIPTIONS = {
     "piper-tts": "Local on-device TTS with dynamic English and Chinese voice selection from Piper models",
     "SYSTRAN/faster-whisper": "Faster Whisper transcription with CTranslate2, up to 4x faster than OpenAI Whisper",
     "hexgrad/kokoro": "Lightweight TTS model with 82M parameters, Apache-licensed for production and personal use",
 }
 # Models dictionary
@@ -33,6 +35,7 @@ MODELS = {
     "piper-tts": "Piper (no voice cloning)",
     "SYSTRAN/faster-whisper": "Faster Whisper",
     "hexgrad/kokoro": "Kokoro-82M",
 }
 original_torch_load = torch.load
@@ -93,6 +96,18 @@ voices_by_lang = scan_piper_voices()
 # No global piper_voice, load dynamically
 # Initialize Kokoro
 def initialize_kokoro():
     try:
@@ -235,6 +250,25 @@ def generate_kokoro_speech(text, language_code, voice_name):
     except Exception as e:
         return None, f"Error synthesizing speech: {str(e)}"
 def generate_piper_speech(text, lang, voice):
     """
     Generate speech from text using Piper TTS with selected voice
@@ -398,6 +432,20 @@ with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.theme
             piper_language_selection = gr.Radio(
                 choices=["English", "Chinese"],
                 value="English",
                 label="Language"
             )
             piper_voice_selection = gr.Dropdown(
@@ -504,6 +552,13 @@ with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.theme
         outputs=kittentts_audio_output
     )
     # Connect the Piper generate button to the function
     piper_generate_btn.click(
         fn=generate_piper_speech,

 import os
 from faster_whisper import WhisperModel
 from kokoro import KPipeline
+from src.dia_tts import DiaTTS
 # Model descriptions for better understanding
 MODEL_DESCRIPTIONS = {
     "piper-tts": "Local on-device TTS with dynamic English and Chinese voice selection from Piper models",
     "SYSTRAN/faster-whisper": "Faster Whisper transcription with CTranslate2, up to 4x faster than OpenAI Whisper",
     "hexgrad/kokoro": "Lightweight TTS model with 82M parameters, Apache-licensed for production and personal use",
+    "nari-labs/Dia-1.6B": "Ultra-realistic dialogue generation with support for voice cloning and non-verbal expressions",
 }
 # Models dictionary
     "piper-tts": "Piper (no voice cloning)",
     "SYSTRAN/faster-whisper": "Faster Whisper",
     "hexgrad/kokoro": "Kokoro-82M",
+    "nari-labs/Dia-1.6B": "Dia TTS",
 }
 original_torch_load = torch.load
 # No global piper_voice, load dynamically
+# Initialize Dia model
+dia_model = None
+def initialize_dia():
+    global dia_model
+    try:
+        dia_model = DiaTTS()
+        print("Loaded Dia-1.6B model")
+        return dia_model
+    except Exception as e:
+        print(f"Error loading Dia model: {e}")
+        return None
 # Initialize Kokoro
 def initialize_kokoro():
     try:
     except Exception as e:
         return None, f"Error synthesizing speech: {str(e)}"
+def generate_dia_speech(text, audio_prompt=None):
+    """
+    Generate speech from text using Dia TTS with optional audio prompt
+    Args:
+        text (str): Text to convert to speech
+        audio_prompt (str, optional): Path to reference audio file for voice cloning
+    Returns:
+        str: Path to the generated audio file
+    """
+    # Initialize Dia model if not already initialized
+    global dia_model
+    if dia_model is None:
+        dia_model = initialize_dia()
+    # Generate speech using Dia
+    return dia_model.generate_to_file(text, audio_prompt)
 def generate_piper_speech(text, lang, voice):
     """
     Generate speech from text using Piper TTS with selected voice
             piper_language_selection = gr.Radio(
                 choices=["English", "Chinese"],
                 value="English",
+    # Dia TTS UI
+    dia_model_info = gr.HTML(create_model_card("nari-labs/Dia-1.6B"))
+    with gr.Row():
+        with gr.Column():
+            dia_text_format = gr.Markdown("""
+            **Tip:** For dialogue, use [S1] and [S2] tags. For non-verbal expressions, use (laughs), (sighs), etc.
+            Example: [S1] Hello there! (laughs) [S2] Hi, how are you doing today?
+            """)
+            dia_generate_btn = gr.Button("Generate Speech with Dia")
+        with gr.Column():
+            dia_audio_output = gr.Audio(label="Generated Speech", type="filepath")
                 label="Language"
             )
             piper_voice_selection = gr.Dropdown(
         outputs=kittentts_audio_output
     )
+    # Connect the Dia TTS generate button to the function
+    dia_generate_btn.click(
+        fn=generate_dia_speech,
+        inputs=[text_input, audio_prompt],
+        outputs=dia_audio_output
+    )
     # Connect the Piper generate button to the function
     piper_generate_btn.click(
         fn=generate_piper_speech,

requirements.txt CHANGED Viewed

@@ -5,8 +5,10 @@ torch
 soundfile
 https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl
 piper-tts
-transformers
 accelerate
 faster-whisper
 librosa
-kokoro==0.7.16

 soundfile
 https://github.com/KittenML/KittenTTS/releases/download/0.1/kittentts-0.1.0-py3-none-any.whl
 piper-tts
+transformers>=4.38.0
 accelerate
 faster-whisper
 librosa
+kokoro==0.7.16
+# For Dia TTS model
+git+https://github.com/huggingface/transformers.git

src/dia_tts.py ADDED Viewed

	@@ -0,0 +1,82 @@

+"""
+Dia TTS model integration for TTS Gallery
+Based on: https://github.com/nari-labs/dia/blob/main/hf.py
+"""
+import tempfile
+import torch
+import soundfile as sf
+from transformers import AutoProcessor, DiaForConditionalGeneration
+class DiaTTS:
+    """
+    Wrapper for the Dia TTS model from Nari Labs
+    """
+    def __init__(self, model_checkpoint="nari-labs/Dia-1.6B"):
+        """
+        Initialize the Dia TTS model
+        Args:
+            model_checkpoint (str): HuggingFace model checkpoint to use
+        """
+        self.model_checkpoint = model_checkpoint
+        self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        # Load processor and model
+        self.processor = AutoProcessor.from_pretrained(model_checkpoint)
+        self.model = DiaForConditionalGeneration.from_pretrained(model_checkpoint).to(self.device)
+        # Default generation parameters
+        self.generation_params = {
+            "max_new_tokens": 3072,
+            "guidance_scale": 3.0,
+            "temperature": 1.8,
+            "top_p": 0.90,
+            "top_k": 45
+        }
+    def generate(self, text, audio_prompt=None):
+        """
+        Generate speech from text using Dia
+        Args:
+            text (str): Text to convert to speech. Should use [S1] and [S2] tags for dialogue.
+            audio_prompt (str, optional): Path to reference audio file for voice cloning
+        Returns:
+            numpy.ndarray: Generated audio as a numpy array
+            int: Sample rate (44100)
+        """
+        # Format text with speaker tags if not already present
+        if not text.startswith("[S1]") and not text.startswith("[S2]"):
+            text = f"[S1] {text}"
+        # Prepare inputs
+        inputs = self.processor(text=[text], padding=True, return_tensors="pt").to(self.device)
+        # Generate audio
+        outputs = self.model.generate(**inputs, **self.generation_params)
+        # Decode outputs
+        audio_data = self.processor.batch_decode(outputs)
+        # Return audio data (assuming it's a numpy array) and sample rate
+        return audio_data[0], 44100  # Dia uses 44.1kHz sample rate
+    def generate_to_file(self, text, audio_prompt=None):
+        """
+        Generate speech from text and save to a temporary file
+        Args:
+            text (str): Text to convert to speech
+            audio_prompt (str, optional): Path to reference audio file for voice cloning
+        Returns:
+            str: Path to the generated audio file
+        """
+        audio_data, sample_rate = self.generate(text, audio_prompt)
+        # Save to a temporary file
+        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp_file:
+            sf.write(tmp_file.name, audio_data, sample_rate)
+            return tmp_file.name