Spaces:

Borio047
/

DG-TTS

Sleeping

App Files Community

Borio047 committed on Dec 1, 2025

Commit

c23ecfc

verified ·

1 Parent(s): c8ea599

Update app.py

Browse files

Files changed (1) hide show

app.py +49 -27

app.py CHANGED Viewed

@@ -1,64 +1,87 @@
 import gradio as gr
-from transformers import pipeline
 import numpy as np
 import soundfile as sf
 import os
 import uuid
-# Load TTS pipeline once at startup
-TTS_MODEL_ID = "suno/bark-small"
-tts = pipeline("text-to-speech", model=TTS_MODEL_ID)
 def generate_speech(text: str) -> str:
     """
-    Takes input text and returns a filepath to a WAV file
-    for gr.Audio(type="filepath").
     """
     if not text or text.strip() == "":
-        raise gr.Error("Please enter some text to synthesize 🙂")
-    # Run the model
-    output = tts(text)
-    # Expecting {"audio": np.ndarray or list, "sampling_rate": int}
-    audio = np.asarray(output["audio"], dtype=np.float32)
-    sr = int(output["sampling_rate"])
-    # Ensure mono or stereo is fine; soundfile can handle it
-    if audio.ndim > 1:
-        audio = audio.squeeze()
-    # Create a unique temporary path
     tmp_dir = "/tmp"
     os.makedirs(tmp_dir, exist_ok=True)
     filename = f"tts_{uuid.uuid4().hex}.wav"
     filepath = os.path.join(tmp_dir, filename)
-    # Write WAV using soundfile (no pydub, no wave header issues)
-    sf.write(filepath, audio, sr)
-    # Return the path; gr.Audio(type="filepath") will use it directly
     return filepath
 with gr.Blocks() as demo:
-    gr.Markdown("# 🗣️ Simple Text-to-Speech Demo (Bark Small)")
     gr.Markdown(
-        "Type some English text, click **Generate speech**, and listen to the audio.\n"
-        "Model: `suno/bark-small` via 🤗 Transformers TTS pipeline."
     )
     with gr.Row():
         with gr.Column(scale=2):
             text_input = gr.Textbox(
-                label="Input text",
-                placeholder="Type something like: Hello, this is my first TTS Space!",
-                lines=4,
             )
             generate_button = gr.Button("Generate speech", variant="primary")
         with gr.Column(scale=1):
             audio_output = gr.Audio(
                 label="Generated audio",
-                type="filepath",  # we are returning a path string
             )
     generate_button.click(
@@ -68,5 +91,4 @@ with gr.Blocks() as demo:
     )
 if __name__ == "__main__":
-    # Disable SSR to avoid async quirks
     demo.launch(ssr_mode=False)

 import gradio as gr
 import numpy as np
 import soundfile as sf
 import os
 import uuid
+import torch
+from transformers import VitsModel, VitsTokenizer, set_seed
+# Load the MMS-TTS English tokenizer and model once at import time
+# (lighter than Bark) so every call to generate_speech reuses them.
+MODEL_ID = "facebook/mms-tts-eng"
+tokenizer = VitsTokenizer.from_pretrained(MODEL_ID)
+model = VitsModel.from_pretrained(MODEL_ID)
+# Use the GPU when one is available, otherwise fall back to CPU.
+device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+model = model.to(device)
+# Optional: seed the random number generators once, at import time.
+# NOTE(review): the seed is not reset per request, so repeated calls with
+# the same text may still differ — confirm whether that is intended.
+set_seed(555)
+# Inputs longer than this many characters are truncated in generate_speech.
+MAX_CHARS = 150  # keep text short for speed and stability
 def generate_speech(text: str) -> str:
     """
+    Synthesize speech for ``text`` with MMS-TTS and return a WAV file path.
+
+    The text is stripped, truncated to MAX_CHARS characters, and lowercased
+    before tokenization. The waveform is written to a uniquely named file
+    under /tmp; the returned path is meant for gr.Audio(type="filepath").
+
+    Raises:
+        gr.Error: if ``text`` is empty or contains only whitespace.
     """
     if not text or text.strip() == "":
+        raise gr.Error("Please enter some text 🙂")
+    text = text.strip()
+    if len(text) > MAX_CHARS:
+        text = text[:MAX_CHARS]
+        # NOTE: truncation is silent — the user is not told the text was cut.
+        # You could also show a warning text if you like.
+    # Only lowercasing is applied here; punctuation is not removed.
+    # NOTE(review): MMS-TTS is said to be trained on lowercased, unpunctuated
+    # text — confirm the tokenizer takes care of the remaining normalization.
+    normalized_text = text.lower()
+    # 1) Tokenize into PyTorch tensors and move them to the model's device
+    inputs = tokenizer(text=normalized_text, return_tensors="pt").to(device)
+    # 2) Forward pass (no gradients needed for inference)
+    with torch.no_grad():
+        outputs = model(**inputs)
+    # 3) Get waveform and sampling rate
+    # Take the first item of the output, moved to CPU as a float32 NumPy array.
+    waveform = outputs.waveform[0].cpu().numpy().astype(np.float32)
+    sr = model.config.sampling_rate  # from the model config (presumably 16000 — TODO confirm)
+    # 4) Save to /tmp as WAV under a random, collision-resistant name
+    # NOTE(review): files written here are never removed by this function.
     tmp_dir = "/tmp"
     os.makedirs(tmp_dir, exist_ok=True)
     filename = f"tts_{uuid.uuid4().hex}.wav"
     filepath = os.path.join(tmp_dir, filename)
+    sf.write(filepath, waveform, sr)
+    # 5) Return file path for gr.Audio(type="filepath")
     return filepath
 with gr.Blocks() as demo:
+    gr.Markdown("# 🗣️ Simple TTS with facebook/mms-tts-eng")
     gr.Markdown(
+        "Type a short English sentence, click **Generate speech**, and listen to the audio.\n\n"
+        "Model: `facebook/mms-tts-eng` (MMS-TTS, VITS-based)."
     )
     with gr.Row():
         with gr.Column(scale=2):
             text_input = gr.Textbox(
+                label="Text to synthesize",
+                placeholder="Example: hello, this is my text-to-speech demo",
+                lines=3,
             )
             generate_button = gr.Button("Generate speech", variant="primary")
         with gr.Column(scale=1):
             audio_output = gr.Audio(
                 label="Generated audio",
+                type="filepath",  # we return a path string
             )
     generate_button.click(
     )
 if __name__ == "__main__":
+    # Launch the Gradio app with server-side rendering (SSR) turned off.
     demo.launch(ssr_mode=False)