Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -29,12 +29,17 @@ def load_model():
|
|
| 29 |
SAMPLE_RATE = 32000
|
| 30 |
|
| 31 |
@spaces.GPU
|
| 32 |
-
def
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 33 |
model = load_model()
|
| 34 |
|
| 35 |
if not text.strip():
|
| 36 |
-
|
| 37 |
-
return
|
| 38 |
|
| 39 |
out = model.infer(
|
| 40 |
text,
|
|
@@ -44,21 +49,10 @@ def tts_stream(text, temperature, top_p, repetition_penalty, state):
|
|
| 44 |
)
|
| 45 |
|
| 46 |
audio_np = out.cpu().numpy()
|
| 47 |
-
|
| 48 |
-
|
| 49 |
-
|
| 50 |
-
def save_audio(state):
|
| 51 |
-
if state is None or len(state) == 0:
|
| 52 |
-
return None
|
| 53 |
-
fd, path = tempfile.mkstemp(suffix=".wav")
|
| 54 |
-
os.close(fd)
|
| 55 |
-
wav_write(path, SAMPLE_RATE, state)
|
| 56 |
-
return path
|
| 57 |
|
| 58 |
|
| 59 |
with gr.Blocks() as demo:
|
| 60 |
-
state_audio = gr.State(None)
|
| 61 |
-
|
| 62 |
with gr.Row():
|
| 63 |
with gr.Column():
|
| 64 |
gr.Markdown("# Soprano Demo\n\nSoprano is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.\n\nGithub: https://github.com/ekwek1/soprano\n\nModel Weights: https://huggingface.co/ekwek/Soprano-80M")
|
|
@@ -101,16 +95,10 @@ with gr.Blocks() as demo:
|
|
| 101 |
|
| 102 |
|
| 103 |
gen_btn.click(
|
| 104 |
-
fn=
|
| 105 |
-
inputs=[text_in, temperature, top_p, repetition_penalty
|
| 106 |
-
outputs=[audio_out
|
| 107 |
)
|
| 108 |
|
| 109 |
-
#download_btn.click(
|
| 110 |
-
# fn=save_audio,
|
| 111 |
-
# inputs=[state_audio],
|
| 112 |
-
# outputs=[file_out],
|
| 113 |
-
#)
|
| 114 |
-
|
| 115 |
demo.queue()
|
| 116 |
-
demo.launch()
|
|
|
|
| 29 |
SAMPLE_RATE = 32000
|
| 30 |
|
| 31 |
@spaces.GPU
|
| 32 |
+
def tts(text: str, temperature: float = 0.3, top_p: float = 0.95, repetition_penalty: float = 1.2) -> Tuple:
|
| 33 |
+
"""
|
| 34 |
+
Runs Soprano text-to-speech model with the given input text and sampling parameters.
|
| 35 |
+
|
| 36 |
+
Returns:
|
| 37 |
+
(sr, audio) where sr is the sample rate (default 32000) and audio is the output audio as an np.ndarray.
|
| 38 |
+
"""
|
| 39 |
model = load_model()
|
| 40 |
|
| 41 |
if not text.strip():
|
| 42 |
+
return None
|
|
|
|
| 43 |
|
| 44 |
out = model.infer(
|
| 45 |
text,
|
|
|
|
| 49 |
)
|
| 50 |
|
| 51 |
audio_np = out.cpu().numpy()
|
| 52 |
+
return (SAMPLE_RATE, audio_np)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 53 |
|
| 54 |
|
| 55 |
with gr.Blocks() as demo:
|
|
|
|
|
|
|
| 56 |
with gr.Row():
|
| 57 |
with gr.Column():
|
| 58 |
gr.Markdown("# Soprano Demo\n\nSoprano is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.\n\nGithub: https://github.com/ekwek1/soprano\n\nModel Weights: https://huggingface.co/ekwek/Soprano-80M")
|
|
|
|
| 95 |
|
| 96 |
|
| 97 |
gen_btn.click(
|
| 98 |
+
fn=tts,
|
| 99 |
+
inputs=[text_in, temperature, top_p, repetition_penalty],
|
| 100 |
+
outputs=[audio_out],
|
| 101 |
)
|
| 102 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 103 |
demo.queue()
|
| 104 |
+
demo.launch(mcp_server=True)
|