ekwek committed on
Commit
ec2e6d6
·
verified ·
1 Parent(s): 6301f82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -53
app.py CHANGED
@@ -1,22 +1,20 @@
1
  import gradio as gr
2
  import torch
3
- import numpy as np
4
  from soprano import SopranoTTS
5
- from scipy.io.wavfile import write as wav_write
6
- import tempfile
7
- import os
8
  import spaces
9
 
10
- assert torch.cuda.is_available(), "Demo requires a GPU."
11
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
12
- print(DEVICE)
13
-
14
  model = None
15
 
 
 
16
  def load_model():
17
  global model
18
  if model is None:
19
- # Load model once
20
  model = SopranoTTS(
21
  backend="auto",
22
  device=DEVICE,
@@ -25,80 +23,169 @@ def load_model():
25
  )
26
  return model
27
 
28
-
29
  SAMPLE_RATE = 32000
30
 
31
  @spaces.GPU
32
- def tts(text: str, temperature: float = 0.3, top_p: float = 0.95, repetition_penalty: float = 1.2) -> tuple[int, np.ndarray]:
 
 
 
 
 
33
  """
34
  Runs Soprano text-to-speech model with the given input text and sampling parameters.
35
 
36
  Returns:
37
- (sr, audio) where sr is the sample rate (default 32000) and audio is the output audio as an np.ndarray.
38
  """
39
- model = load_model()
40
-
41
  if not text.strip():
42
- return None
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- out = model.infer(
45
- text,
46
- temperature=temperature,
47
- top_p=top_p,
48
- repetition_penalty=repetition_penalty,
49
- )
50
 
51
- audio_np = out.cpu().numpy()
52
- return (SAMPLE_RATE, audio_np)
53
 
 
 
54
 
55
- with gr.Blocks() as demo:
56
- with gr.Row():
57
- with gr.Column():
58
- gr.Markdown("# Soprano Demo\n\nSoprano is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.\n\nGithub: https://github.com/ekwek1/soprano\n\nModel Weights: https://huggingface.co/ekwek/Soprano-80M")
 
 
 
 
 
 
 
 
 
 
59
 
60
- text_in = gr.Textbox(
61
- label="Input Text",
62
- placeholder="Enter text to synthesize...",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
64
- lines=4,
 
65
  )
66
 
67
- with gr.Accordion("Advanced options", open=False):
68
  temperature = gr.Slider(
69
- 0.0, 1.0, value=0.3, step=0.05, label="Temperature"
 
 
 
 
70
  )
 
71
  top_p = gr.Slider(
72
- 0.0, 1.0, value=0.95, step=0.01, label="Top-p"
 
 
 
 
73
  )
 
74
  repetition_penalty = gr.Slider(
75
- 1.0, 2.0, value=1.2, step=0.05, label="Repetition penalty"
 
 
 
 
76
  )
77
 
78
- gen_btn = gr.Button("Generate")
79
 
80
- with gr.Column():
81
- audio_out = gr.Audio(
82
- label="Output Audio",
 
83
  autoplay=True,
84
- streaming=False,
85
  )
86
- #download_btn = gr.Button("Download")
87
- #file_out = gr.File(label="Download file")
88
- gr.Markdown(
89
- "Usage tips:\n\n"
90
- "- Soprano works best when each sentence is between 2 and 15 seconds long.\n"
91
- "- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. Best results can be achieved by converting these into their phonetic form. (1+1 -> one plus one, etc)\n"
92
- "- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. You may also change the sampling settings for more varied results.\n"
93
- "- Avoid improper grammar such as not using contractions, multiple spaces, etc."
94
  )
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
- gen_btn.click(
98
- fn=tts,
99
- inputs=[text_in, temperature, top_p, repetition_penalty],
100
- outputs=[audio_out],
 
 
 
 
 
 
 
 
101
  )
102
 
103
- demo.queue()
104
- demo.launch(mcp_server=True)
 
 
1
  import gradio as gr
2
  import torch
 
3
  from soprano import SopranoTTS
4
+ import numpy as np
5
+ import socket
6
+ import time
7
  import spaces
8
 
9
+ # Detect device
10
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
11
  model = None
12
 
13
+ # Initialize model
14
+ @spaces.GPU
15
  def load_model():
16
  global model
17
  if model is None:
 
18
  model = SopranoTTS(
19
  backend="auto",
20
  device=DEVICE,
 
23
  )
24
  return model
25
 
 
26
  SAMPLE_RATE = 32000
27
 
28
@spaces.GPU
def generate_speech(
    text: str,
    temperature: float = 0.3,
    top_p: float = 0.95,
    repetition_penalty: float = 1.2,
) -> tuple:
    """Run the Soprano text-to-speech model on the given text.

    Args:
        text: Input text to synthesize.
        temperature: Sampling temperature; higher values give more varied output.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens during sampling.

    Returns:
        ((sr, audio), status) where sr is the sample rate (default 32000),
        audio is the synthesized waveform as an int16 np.ndarray, and status
        is the text shown in the UI status box. Returns (None, message) for
        empty input or on error.
    """
    if not text.strip():
        return None, "Please enter some text to generate speech."
    try:
        model = load_model()
        start_time = time.perf_counter()

        audio = model.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )

        gen_time = time.perf_counter() - start_time

        audio_np = audio.cpu().numpy()
        # Clip to [-1, 1] before scaling: samples outside that range would
        # wrap around when cast to int16 and produce loud pops/artifacts.
        audio_int16 = (np.clip(audio_np, -1.0, 1.0) * 32767).astype(np.int16)

        # assumes a 1-D mono waveform — TODO confirm SopranoTTS output shape
        audio_seconds = len(audio_np) / SAMPLE_RATE
        rtf = audio_seconds / gen_time if gen_time > 0 else float("inf")

        status = (
            f"✓ Generated {audio_seconds:.2f} s audio | "
            f"Generation time: {gen_time:.3f} s "
            f"({rtf:.2f}x realtime)"
        )

        return (SAMPLE_RATE, audio_int16), status

    except Exception as e:
        # Surface the error in the status box instead of crashing the UI.
        return None, f"✗ Error: {str(e)}"
73
+
74
+
75
# Create Gradio interface.
# NOTE: theme and css are gr.Blocks constructor options — gr.Blocks.launch()
# does not accept them — so the look-and-feel is configured here.
with gr.Blocks(
    title="Soprano TTS",
    theme=gr.themes.Soft(primary_hue="green"),
    css="""
    a {
        color: var(--primary-600);
    }
    a:hover {
        color: var(--primary-700);
    }
    """,
) as demo:

    gr.Markdown(
        f"""
        # 🗣️ Soprano TTS

        **Running on: {DEVICE.upper()}**

        Soprano is an ultra-lightweight, open-source text-to-speech (TTS) model designed for real-time,
        high-fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency**
        and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.

        **GitHub:** https://github.com/ekwek1/soprano
        **Model Demo:** https://huggingface.co/spaces/ekwek/Soprano-TTS
        **Model Weights:** https://huggingface.co/ekwek/Soprano-80M
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here...",
                value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
                lines=5,
                max_lines=10,
            )

            with gr.Accordion("Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.3,
                    step=0.05,
                    label="Temperature",
                )

                top_p = gr.Slider(
                    minimum=0.5,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top P",
                )

                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.2,
                    step=0.1,
                    label="Repetition Penalty",
                )

            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy",
                autoplay=True,
            )

            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3,
                max_lines=10,
            )

    gr.Examples(
        examples=[
            ["Soprano is an extremely lightweight text to speech model.", 0.3, 0.95, 1.2],
            ["Hello! Welcome to Soprano text to speech.", 0.3, 0.95, 1.2],
            ["The quick brown fox jumps over the lazy dog.", 0.3, 0.95, 1.2],
            ["Artificial intelligence is transforming the world.", 0.5, 0.90, 1.2],
        ],
        inputs=[text_input, temperature, top_p, repetition_penalty],
        label="Example Prompts",
    )

    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, temperature, top_p, repetition_penalty],
        outputs=[audio_output, status_output],
    )

    # Plain string: this markdown has no placeholders, so no f-prefix needed.
    gr.Markdown(
        """
        ### Usage tips:

        - Soprano works best when each sentence is between 2 and 15 seconds long.
        - Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them.
          Best results can be achieved by converting these into their phonetic form.
          (1+1 -> one plus one, etc)
        - If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation.
          You may also change the sampling settings for more varied results.
        - Avoid improper grammar such as not using contractions, multiple spaces, etc.
        """
    )
174
 
175
def main():
    """Launch the Gradio demo with the MCP server enabled.

    Bug fix: the previous version passed theme= and css= to demo.launch(),
    but those are gr.Blocks constructor options — launch() has no such
    keyword arguments and would raise a TypeError at startup.
    """
    demo.launch(mcp_server=True)


if __name__ == "__main__":
    # Warm the model before serving so the first request doesn't pay the
    # model-loading latency.
    load_model()
    main()