ekwek committed on
Commit
fa7f144
·
verified ·
1 Parent(s): 46788cd

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +13 -25
app.py CHANGED
@@ -29,12 +29,17 @@ def load_model():
29
  SAMPLE_RATE = 32000
30
 
31
  @spaces.GPU
32
- def tts_stream(text, temperature, top_p, repetition_penalty, state):
 
 
 
 
 
 
33
  model = load_model()
34
 
35
  if not text.strip():
36
- yield None, state
37
- return
38
 
39
  out = model.infer(
40
  text,
@@ -44,21 +49,10 @@ def tts_stream(text, temperature, top_p, repetition_penalty, state):
44
  )
45
 
46
  audio_np = out.cpu().numpy()
47
- yield (SAMPLE_RATE, audio_np), audio_np
48
-
49
-
50
- def save_audio(state):
51
- if state is None or len(state) == 0:
52
- return None
53
- fd, path = tempfile.mkstemp(suffix=".wav")
54
- os.close(fd)
55
- wav_write(path, SAMPLE_RATE, state)
56
- return path
57
 
58
 
59
  with gr.Blocks() as demo:
60
- state_audio = gr.State(None)
61
-
62
  with gr.Row():
63
  with gr.Column():
64
  gr.Markdown("# Soprano Demo\n\nSoprano is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.\n\nGithub: https://github.com/ekwek1/soprano\n\nModel Weights: https://huggingface.co/ekwek/Soprano-80M")
@@ -101,16 +95,10 @@ with gr.Blocks() as demo:
101
 
102
 
103
  gen_btn.click(
104
- fn=tts_stream,
105
- inputs=[text_in, temperature, top_p, repetition_penalty, state_audio],
106
- outputs=[audio_out, state_audio],
107
  )
108
 
109
- #download_btn.click(
110
- # fn=save_audio,
111
- # inputs=[state_audio],
112
- # outputs=[file_out],
113
- #)
114
-
115
  demo.queue()
116
- demo.launch()
 
29
  SAMPLE_RATE = 32000
30
 
31
  @spaces.GPU
32
+ def tts(text: str, temperature: float = 0.3, top_p: float = 0.95, repetition_penalty: float = 1.2) -> Tuple:
33
+ """
34
+ Runs Soprano text-to-speech model with the given input text and sampling parameters.
35
+
36
+ Returns:
37
+ (sr, audio) where sr is the sample rate (default 32000) and audio is the output audio as an np.ndarray.
38
+ """
39
  model = load_model()
40
 
41
  if not text.strip():
42
+ return None
 
43
 
44
  out = model.infer(
45
  text,
 
49
  )
50
 
51
  audio_np = out.cpu().numpy()
52
+ return (SAMPLE_RATE, audio_np)
 
 
 
 
 
 
 
 
 
53
 
54
 
55
  with gr.Blocks() as demo:
 
 
56
  with gr.Row():
57
  with gr.Column():
58
  gr.Markdown("# Soprano Demo\n\nSoprano is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.\n\nGithub: https://github.com/ekwek1/soprano\n\nModel Weights: https://huggingface.co/ekwek/Soprano-80M")
 
95
 
96
 
97
  gen_btn.click(
98
+ fn=tts,
99
+ inputs=[text_in, temperature, top_p, repetition_penalty],
100
+ outputs=[audio_out],
101
  )
102
 
 
 
 
 
 
 
103
  demo.queue()
104
+ demo.launch(mcp_server=True)