Spaces:

DroolingPanda
/

tts_gallery

Sleeping

App Files Community

Michael Hu committed on Sep 29, 2025

Commit

3d5e706

1 Parent(s): 51e5e89

refactor: remove DiaTTS integration and related UI elements

Browse files

Files changed (2) hide show

app.py +47 -47
src/dia_tts.py +73 -73

app.py CHANGED Viewed

@@ -16,7 +16,7 @@ import wave
 import os
 from faster_whisper import WhisperModel
 from kokoro import KPipeline
-from src.dia_tts import DiaTTS
 # Model descriptions for better understanding
 MODEL_DESCRIPTIONS = {
@@ -97,16 +97,16 @@ voices_by_lang = scan_piper_voices()
 # No global piper_voice, load dynamically
 # Initialize Dia model
-dia_model = None
-def initialize_dia():
-    global dia_model
-    try:
-        dia_model = DiaTTS()
-        print("Loaded Dia-1.6B model")
-        return dia_model
-    except Exception as e:
-        print(f"Error loading Dia model: {e}")
-        return None
 # Initialize Kokoro
 def initialize_kokoro():
@@ -250,24 +250,24 @@ def generate_kokoro_speech(text, language_code, voice_name):
     except Exception as e:
         return None, f"Error synthesizing speech: {str(e)}"
-def generate_dia_speech(text, audio_prompt=None):
-    """
-    Generate speech from text using Dia TTS with optional audio prompt
-    Args:
-        text (str): Text to convert to speech
-        audio_prompt (str, optional): Path to reference audio file for voice cloning
-    Returns:
-        str: Path to the generated audio file
-    """
-    # Initialize Dia model if not already initialized
-    global dia_model
-    if dia_model is None:
-        dia_model = initialize_dia()
-    # Generate speech using Dia
-    return dia_model.generate_to_file(text, audio_prompt)
 def generate_piper_speech(text, lang, voice):
     """
@@ -445,19 +445,19 @@ with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.theme
             piper_audio_output = gr.Audio(label="Generated Speech", type="filepath")
             piper_status = gr.Textbox(label="Status", interactive=False)
-    # Dia TTS UI
-    dia_model_info = gr.HTML(create_model_card("nari-labs/Dia-1.6B"))
-    with gr.Row():
-        with gr.Column():
-            dia_text_format = gr.Markdown("""
-            **Tip:** For dialogue, use [S1] and [S2] tags. For non-verbal expressions, use (laughs), (sighs), etc.
-            Example: [S1] Hello there! (laughs) [S2] Hi, how are you doing today?
-            """)
-            dia_generate_btn = gr.Button("Generate Speech with Dia")
-        with gr.Column():
-            dia_audio_output = gr.Audio(label="Generated Speech", type="filepath")
     # Faster Whisper section
     whisper_model_info = gr.HTML(create_model_card("SYSTRAN/faster-whisper"))
@@ -552,12 +552,12 @@ with gr.Blocks(css=custom_css, title="🎙️ TTS Model Gallery", theme=gr.theme
         outputs=kittentts_audio_output
     )
-    # Connect the Dia TTS generate button to the function
-    dia_generate_btn.click(
-        fn=generate_dia_speech,
-        inputs=[text_input, audio_prompt],
-        outputs=dia_audio_output
-    )
     # Connect the Piper generate button to the function
     piper_generate_btn.click(

 import os
 from faster_whisper import WhisperModel
 from kokoro import KPipeline
+# from src.dia_tts import DiaTTS
 # Model descriptions for better understanding
 MODEL_DESCRIPTIONS = {
 # No global piper_voice, load dynamically
 # Initialize Dia model
+# dia_model = None
+# def initialize_dia():
+#     global dia_model
+#     try:
+#         dia_model = DiaTTS()
+#         print("Loaded Dia-1.6B model")
+#         return dia_model
+#     except Exception as e:
+#         print(f"Error loading Dia model: {e}")
+#         return None
 # Initialize Kokoro
 def initialize_kokoro():
     except Exception as e:
         return None, f"Error synthesizing speech: {str(e)}"
+# def generate_dia_speech(text, audio_prompt=None):
+#     """
+#     Generate speech from text using Dia TTS with optional audio prompt
+#
+#     Args:
+#         text (str): Text to convert to speech
+#         audio_prompt (str, optional): Path to reference audio file for voice cloning
+#
+#     Returns:
+#         str: Path to the generated audio file
+#     """
+#     # Initialize Dia model if not already initialized
+#     global dia_model
+#     if dia_model is None:
+#         dia_model = initialize_dia()
+#
+#     # Generate speech using Dia
+#     return dia_model.generate_to_file(text, audio_prompt)
 def generate_piper_speech(text, lang, voice):
     """
             piper_audio_output = gr.Audio(label="Generated Speech", type="filepath")
             piper_status = gr.Textbox(label="Status", interactive=False)
+    # Dia TTS UI (commented out for now)
+    # dia_model_info = gr.HTML(create_model_card("nari-labs/Dia-1.6B"))
+    # with gr.Row():
+    #     with gr.Column():
+    #         dia_text_format = gr.Markdown("""
+    #         **Tip:** For dialogue, use [S1] and [S2] tags. For non-verbal expressions, use (laughs), (sighs), etc.
+    #         Example: [S1] Hello there! (laughs) [S2] Hi, how are you doing today?
+    #         """)
+    #         dia_generate_btn = gr.Button("Generate Speech with Dia")
+    #
+    #     with gr.Column():
+    #         dia_audio_output = gr.Audio(label="Generated Speech", type="filepath")
     # Faster Whisper section
     whisper_model_info = gr.HTML(create_model_card("SYSTRAN/faster-whisper"))
         outputs=kittentts_audio_output
     )
+    # Connect the Dia TTS generate button to the function (commented out for now)
+    # dia_generate_btn.click(
+    #     fn=generate_dia_speech,
+    #     inputs=[text_input, audio_prompt],
+    #     outputs=dia_audio_output
+    # )
     # Connect the Piper generate button to the function
     piper_generate_btn.click(

src/dia_tts.py CHANGED Viewed

@@ -6,77 +6,77 @@ Based on: https://github.com/nari-labs/dia/blob/main/hf.py
 import tempfile
 import torch
 import soundfile as sf
-from transformers import AutoProcessor, DiaForConditionalGeneration
-class DiaTTS:
-    """
-    Wrapper for the Dia TTS model from Nari Labs
-    """
-    def __init__(self, model_checkpoint="nari-labs/Dia-1.6B"):
-        """
-        Initialize the Dia TTS model
-        Args:
-            model_checkpoint (str): HuggingFace model checkpoint to use
-        """
-        self.model_checkpoint = model_checkpoint
-        self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        # Load processor and model
-        self.processor = AutoProcessor.from_pretrained(model_checkpoint)
-        self.model = DiaForConditionalGeneration.from_pretrained(model_checkpoint).to(self.device)
-        # Default generation parameters
-        self.generation_params = {
-            "max_new_tokens": 3072,
-            "guidance_scale": 3.0,
-            "temperature": 1.8,
-            "top_p": 0.90,
-            "top_k": 45
-        }
-    def generate(self, text, audio_prompt=None):
-        """
-        Generate speech from text using Dia
-        Args:
-            text (str): Text to convert to speech. Should use [S1] and [S2] tags for dialogue.
-            audio_prompt (str, optional): Path to reference audio file for voice cloning
-        Returns:
-            numpy.ndarray: Generated audio as a numpy array
-            int: Sample rate (44100)
-        """
-        # Format text with speaker tags if not already present
-        if not text.startswith("[S1]") and not text.startswith("[S2]"):
-            text = f"[S1] {text}"
-        # Prepare inputs
-        inputs = self.processor(text=[text], padding=True, return_tensors="pt").to(self.device)
-        # Generate audio
-        outputs = self.model.generate(**inputs, **self.generation_params)
-        # Decode outputs
-        audio_data = self.processor.batch_decode(outputs)
-        # Return audio data (assuming it's a numpy array) and sample rate
-        return audio_data[0], 44100  # Dia uses 44.1kHz sample rate
-    def generate_to_file(self, text, audio_prompt=None):
-        """
-        Generate speech from text and save to a temporary file
-        Args:
-            text (str): Text to convert to speech
-            audio_prompt (str, optional): Path to reference audio file for voice cloning
-        Returns:
-            str: Path to the generated audio file
-        """
-        audio_data, sample_rate = self.generate(text, audio_prompt)
-        # Save to a temporary file
-        with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp_file:
-            sf.write(tmp_file.name, audio_data, sample_rate)
-            return tmp_file.name

 import tempfile
 import torch
 import soundfile as sf
+# from transformers import AutoProcessor, DiaForConditionalGeneration
+# class DiaTTS:
+#     """
+#     Wrapper for the Dia TTS model from Nari Labs
+#     """
+#     def __init__(self, model_checkpoint="nari-labs/Dia-1.6B"):
+#         """
+#         Initialize the Dia TTS model
+#
+#         Args:
+#             model_checkpoint (str): HuggingFace model checkpoint to use
+#         """
+#         self.model_checkpoint = model_checkpoint
+#         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+#
+#         # Load processor and model
+#         self.processor = AutoProcessor.from_pretrained(model_checkpoint)
+#         self.model = DiaForConditionalGeneration.from_pretrained(model_checkpoint).to(self.device)
+#
+#         # Default generation parameters
+#         self.generation_params = {
+#             "max_new_tokens": 3072,
+#             "guidance_scale": 3.0,
+#             "temperature": 1.8,
+#             "top_p": 0.90,
+#             "top_k": 45
+#         }
+#
+#     def generate(self, text, audio_prompt=None):
+#         """
+#         Generate speech from text using Dia
+#
+#         Args:
+#             text (str): Text to convert to speech. Should use [S1] and [S2] tags for dialogue.
+#             audio_prompt (str, optional): Path to reference audio file for voice cloning
+#
+#         Returns:
+#             numpy.ndarray: Generated audio as a numpy array
+#             int: Sample rate (44100)
+#         """
+#         # Format text with speaker tags if not already present
+#         if not text.startswith("[S1]") and not text.startswith("[S2]"):
+#             text = f"[S1] {text}"
+#
+#         # Prepare inputs
+#         inputs = self.processor(text=[text], padding=True, return_tensors="pt").to(self.device)
+#
+#         # Generate audio
+#         outputs = self.model.generate(**inputs, **self.generation_params)
+#
+#         # Decode outputs
+#         audio_data = self.processor.batch_decode(outputs)
+#
+#         # Return audio data (assuming it's a numpy array) and sample rate
+#         return audio_data[0], 44100  # Dia uses 44.1kHz sample rate
+#
+#     def generate_to_file(self, text, audio_prompt=None):
+#         """
+#         Generate speech from text and save to a temporary file
+#
+#         Args:
+#             text (str): Text to convert to speech
+#             audio_prompt (str, optional): Path to reference audio file for voice cloning
+#
+#         Returns:
+#             str: Path to the generated audio file
+#         """
+#         audio_data, sample_rate = self.generate(text, audio_prompt)
+#
+#         # Save to a temporary file
+#         with tempfile.NamedTemporaryFile(suffix=".mp3", delete=False) as tmp_file:
+#             sf.write(tmp_file.name, audio_data, sample_rate)
+#             return tmp_file.name