Spaces:

Borio047
/

DG-TTS

Sleeping

App Files Community

Borio047 committed on Dec 1, 2025

Commit

5e4386d

verified ·

1 Parent(s): 8663e56

Update app.py

Browse files

Files changed (1) hide show

app.py +51 -47

app.py CHANGED Viewed

@@ -1,68 +1,72 @@
 import gradio as gr
-import torch
 from transformers import pipeline
-# 1. Choose a TTS model from Hugging Face
-# This model is for English TTS. You can later swap it for another.
-TTS_MODEL_ID = "facebook/mms-tts-eng"
-# 2. Create the TTS pipeline
-device = 0 if torch.cuda.is_available() else -1
-print(f"Using device: {'cuda' if device == 0 else 'cpu'}")
-try:
-    tts = pipeline("text-to-speech", model=TTS_MODEL_ID, device=device)
-except Exception as e:
-    # If the model can't be loaded, fail early with a clear message
-    raise RuntimeError(f"Failed to load TTS pipeline: {e}")
-def synthesize_tts(text: str):
     """
-    Take text and return (sampling_rate, audio_numpy) for Gradio Audio output.
     """
     if not text or text.strip() == "":
-        raise gr.Error("Please enter some text to synthesize.")
-    try:
-        out = tts(text)
-    except Exception as e:
-        # Show any HF pipeline error nicely in the UI
-        raise gr.Error(f"TTS pipeline error: {e}")
-    # Expecting a dict with 'audio' (numpy array) and 'sampling_rate' (int)
-    if not isinstance(out, dict) or "audio" not in out or "sampling_rate" not in out:
-        raise gr.Error(f"Unexpected TTS output format: {out}")
-    audio = out["audio"]
-    sr = out["sampling_rate"]
-    return (sr, audio)
-title = "Simple Text-to-Speech (TTS) Space"
-description = (
-    "Enter some English text and generate speech using a Hugging Face TTS model. "
-    "Once this works, we can upgrade it to voice cloning (F5-TTS style)."
-)
 with gr.Blocks() as demo:
-    gr.Markdown(f"# {title}")
-    gr.Markdown(description)
     with gr.Row():
-        with gr.Column():
-            text_in = gr.Textbox(
                 lines=4,
-                label="Text to synthesize",
-                placeholder="Type some English text here..."
             )
-            btn = gr.Button("Generate Speech")
-        with gr.Column():
-            # type='numpy' means we can return (sr, numpy_array)
-            audio_out = gr.Audio(label="Generated audio", type="numpy")
-    btn.click(fn=synthesize_tts, inputs=text_in, outputs=audio_out)
-# On Spaces it's fine to launch unconditionally; disable SSR to avoid async quirks
-demo.launch(ssr_mode=False)

 import gradio as gr
 from transformers import pipeline
+import numpy as np
+import soundfile as sf
+import os
+import uuid
+# Load TTS pipeline once at startup
+TTS_MODEL_ID = "suno/bark-small"
+tts = pipeline("text-to-speech", model=TTS_MODEL_ID)
+def generate_speech(text: str) -> str:
     """
+    Takes input text and returns a filepath to a WAV file
+    for gr.Audio(type="filepath").
     """
     if not text or text.strip() == "":
+        raise gr.Error("Please enter some text to synthesize 🙂")
+    # Run the model
+    output = tts(text)
+    # Expecting {"audio": np.ndarray or list, "sampling_rate": int}
+    audio = np.asarray(output["audio"], dtype=np.float32)
+    sr = int(output["sampling_rate"])
+    # Ensure mono or stereo is fine; soundfile can handle it
+    if audio.ndim > 1:
+        audio = audio.squeeze()
+    # Create a unique temporary path
+    tmp_dir = "/tmp"
+    os.makedirs(tmp_dir, exist_ok=True)
+    filename = f"tts_{uuid.uuid4().hex}.wav"
+    filepath = os.path.join(tmp_dir, filename)
+    # Write WAV using soundfile (no pydub, no wave header issues)
+    sf.write(filepath, audio, sr)
+    # Return the path; gr.Audio(type="filepath") will use it directly
+    return filepath
 with gr.Blocks() as demo:
+    gr.Markdown("# 🗣️ Simple Text-to-Speech Demo (Bark Small)")
+    gr.Markdown(
+        "Type some English text, click **Generate speech**, and listen to the audio.\n"
+        "Model: `suno/bark-small` via 🤗 Transformers TTS pipeline."
+    )
     with gr.Row():
+        with gr.Column(scale=2):
+            text_input = gr.Textbox(
+                label="Input text",
+                placeholder="Type something like: Hello, this is my first TTS Space!",
                 lines=4,
             )
+            generate_button = gr.Button("Generate speech", variant="primary")
+        with gr.Column(scale=1):
+            audio_output = gr.Audio(
+                label="Generated audio",
+                type="filepath",  # we are returning a path string
+            )
+    generate_button.click(
+        fn=generate_speech,
+        inputs=text_input,
+        outputs=audio_output,
+    )
+if __name__ == "__main__":
+    # Disable SSR to avoid async quirks
+    demo.launch(ssr_mode=False)