Soprano-TTS

Sleeping

App Files Files Community

ekwek committed on Jan 7

Commit

fa7f144

verified ·

1 Parent(s): 46788cd

Update app.py

Browse files

Files changed (1) hide show

app.py +13 -25

app.py CHANGED Viewed

@@ -29,12 +29,17 @@ def load_model():
 SAMPLE_RATE = 32000
 @spaces.GPU
-def tts_stream(text, temperature, top_p, repetition_penalty, state):
     model = load_model()
     if not text.strip():
-        yield None, state
-        return
     out = model.infer(
         text,
@@ -44,21 +49,10 @@ def tts_stream(text, temperature, top_p, repetition_penalty, state):
     )
     audio_np = out.cpu().numpy()
-    yield (SAMPLE_RATE, audio_np), audio_np
-def save_audio(state):
-    if state is None or len(state) == 0:
-        return None
-    fd, path = tempfile.mkstemp(suffix=".wav")
-    os.close(fd)
-    wav_write(path, SAMPLE_RATE, state)
-    return path
 with gr.Blocks() as demo:
-    state_audio = gr.State(None)
     with gr.Row():
         with gr.Column():
             gr.Markdown("# Soprano Demo\n\nSoprano is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.\n\nGithub: https://github.com/ekwek1/soprano\n\nModel Weights: https://huggingface.co/ekwek/Soprano-80M")
@@ -101,16 +95,10 @@ with gr.Blocks() as demo:
     gen_btn.click(
-        fn=tts_stream,
-        inputs=[text_in, temperature, top_p, repetition_penalty, state_audio],
-        outputs=[audio_out, state_audio],
     )
-    #download_btn.click(
-    #    fn=save_audio,
-    #    inputs=[state_audio],
-    #    outputs=[file_out],
-    #)
 demo.queue()
-demo.launch()

 SAMPLE_RATE = 32000
 @spaces.GPU
+def tts(text: str, temperature: float = 0.3, top_p: float = 0.95, repetition_penalty: float = 1.2) -> Tuple:
+    """
+    Runs Soprano text-to-speech model with the given input text and sampling parameters.
+    Returns:
+        (sr, audio) where sr is the sample rate (default 32000) and audio is the output audio as an np.ndarray.
+    """
     model = load_model()
     if not text.strip():
+        return None
     out = model.infer(
         text,
     )
     audio_np = out.cpu().numpy()
+    return (SAMPLE_RATE, audio_np)
 with gr.Blocks() as demo:
     with gr.Row():
         with gr.Column():
             gr.Markdown("# Soprano Demo\n\nSoprano is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.\n\nGithub: https://github.com/ekwek1/soprano\n\nModel Weights: https://huggingface.co/ekwek/Soprano-80M")
     gen_btn.click(
+        fn=tts,
+        inputs=[text_in, temperature, top_p, repetition_penalty],
+        outputs=[audio_out],
     )
 demo.queue()
+demo.launch(mcp_server=True)