ekwek committed on
Commit
ec2e6d6
·
verified ·
1 Parent(s): 6301f82

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +140 -53
app.py CHANGED
@@ -1,22 +1,20 @@
1
  import gradio as gr
2
  import torch
3
- import numpy as np
4
  from soprano import SopranoTTS
5
- from scipy.io.wavfile import write as wav_write
6
- import tempfile
7
- import os
8
  import spaces
9
 
10
- assert torch.cuda.is_available(), "Demo requires a GPU."
11
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
12
- print(DEVICE)
13
-
14
  model = None
15
 
 
 
16
  def load_model():
17
  global model
18
  if model is None:
19
- # Load model once
20
  model = SopranoTTS(
21
  backend="auto",
22
  device=DEVICE,
@@ -25,80 +23,169 @@ def load_model():
25
  )
26
  return model
27
 
28
-
29
  SAMPLE_RATE = 32000
30
 
31
  @spaces.GPU
32
- def tts(text: str, temperature: float = 0.3, top_p: float = 0.95, repetition_penalty: float = 1.2) -> tuple[int, np.ndarray]:
 
 
 
 
 
33
  """
34
  Runs Soprano text-to-speech model with the given input text and sampling parameters.
35
 
36
  Returns:
37
- (sr, audio) where sr is the sample rate (default 32000) and audio is the output audio as an np.ndarray.
38
  """
39
- model = load_model()
40
-
41
  if not text.strip():
42
- return None
 
 
 
 
 
 
 
 
 
 
 
43
 
44
- out = model.infer(
45
- text,
46
- temperature=temperature,
47
- top_p=top_p,
48
- repetition_penalty=repetition_penalty,
49
- )
50
 
51
- audio_np = out.cpu().numpy()
52
- return (SAMPLE_RATE, audio_np)
53
 
 
 
54
 
55
- with gr.Blocks() as demo:
56
- with gr.Row():
57
- with gr.Column():
58
- gr.Markdown("# Soprano Demo\n\nSoprano is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.\n\nGithub: https://github.com/ekwek1/soprano\n\nModel Weights: https://huggingface.co/ekwek/Soprano-80M")
 
 
 
 
 
 
 
 
 
 
59
 
60
- text_in = gr.Textbox(
61
- label="Input Text",
62
- placeholder="Enter text to synthesize...",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
64
- lines=4,
 
65
  )
66
 
67
- with gr.Accordion("Advanced options", open=False):
68
  temperature = gr.Slider(
69
- 0.0, 1.0, value=0.3, step=0.05, label="Temperature"
 
 
 
 
70
  )
 
71
  top_p = gr.Slider(
72
- 0.0, 1.0, value=0.95, step=0.01, label="Top-p"
 
 
 
 
73
  )
 
74
  repetition_penalty = gr.Slider(
75
- 1.0, 2.0, value=1.2, step=0.05, label="Repetition penalty"
 
 
 
 
76
  )
77
 
78
- gen_btn = gr.Button("Generate")
79
 
80
- with gr.Column():
81
- audio_out = gr.Audio(
82
- label="Output Audio",
 
83
  autoplay=True,
84
- streaming=False,
85
  )
86
- #download_btn = gr.Button("Download")
87
- #file_out = gr.File(label="Download file")
88
- gr.Markdown(
89
- "Usage tips:\n\n"
90
- "- Soprano works best when each sentence is between 2 and 15 seconds long.\n"
91
- "- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. Best results can be achieved by converting these into their phonetic form. (1+1 -> one plus one, etc)\n"
92
- "- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. You may also change the sampling settings for more varied results.\n"
93
- "- Avoid improper grammar such as not using contractions, multiple spaces, etc."
94
  )
95
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
97
- gen_btn.click(
98
- fn=tts,
99
- inputs=[text_in, temperature, top_p, repetition_penalty],
100
- outputs=[audio_out],
 
 
 
 
 
 
 
 
101
  )
102
 
103
- demo.queue()
104
- demo.launch(mcp_server=True)
 
 
1
  import gradio as gr
2
  import torch
 
3
  from soprano import SopranoTTS
4
+ import numpy as np
5
+ import socket
6
+ import time
7
  import spaces
8
 
9
+ # Detect device
10
  DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
11
  model = None
12
 
13
+ # Initialize model
14
+ @spaces.GPU
15
  def load_model():
16
  global model
17
  if model is None:
 
18
  model = SopranoTTS(
19
  backend="auto",
20
  device=DEVICE,
 
23
  )
24
  return model
25
 
 
26
  SAMPLE_RATE = 32000
27
 
28
@spaces.GPU
def generate_speech(
    text: str,
    temperature: float = 0.3,
    top_p: float = 0.95,
    repetition_penalty: float = 1.2,
) -> tuple:
    """Run the Soprano text-to-speech model on the given text.

    Args:
        text: Input text to synthesize.
        temperature: Sampling temperature; higher values give more varied output.
        top_p: Nucleus-sampling probability mass.
        repetition_penalty: Penalty applied to repeated tokens during sampling.

    Returns:
        ((sr, audio), status) where sr is the sample rate (default 32000),
        audio is the synthesized waveform as an int16 np.ndarray, and status
        is the text shown in the UI status box. Returns (None, message) for
        empty input or on error.
    """
    if not text.strip():
        return None, "Please enter some text to generate speech."
    try:
        model = load_model()
        start_time = time.perf_counter()

        audio = model.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )

        gen_time = time.perf_counter() - start_time

        audio_np = audio.cpu().numpy()
        # Clip to [-1, 1] before scaling: samples outside that range would
        # wrap around when cast to int16 and produce loud pops/artifacts.
        audio_int16 = (np.clip(audio_np, -1.0, 1.0) * 32767).astype(np.int16)

        # assumes a 1-D mono waveform — TODO confirm SopranoTTS output shape
        audio_seconds = len(audio_np) / SAMPLE_RATE
        rtf = audio_seconds / gen_time if gen_time > 0 else float("inf")

        status = (
            f"✓ Generated {audio_seconds:.2f} s audio | "
            f"Generation time: {gen_time:.3f} s "
            f"({rtf:.2f}x realtime)"
        )

        return (SAMPLE_RATE, audio_int16), status

    except Exception as e:
        # Surface the error in the status box instead of crashing the UI.
        return None, f"✗ Error: {str(e)}"
73
+
74
+
75
# Create Gradio interface.
# NOTE: theme and css are gr.Blocks constructor options — gr.Blocks.launch()
# does not accept them — so the look-and-feel is configured here.
with gr.Blocks(
    title="Soprano TTS",
    theme=gr.themes.Soft(primary_hue="green"),
    css="""
    a {
        color: var(--primary-600);
    }
    a:hover {
        color: var(--primary-700);
    }
    """,
) as demo:

    gr.Markdown(
        f"""
        # 🗣️ Soprano TTS

        **Running on: {DEVICE.upper()}**

        Soprano is an ultra-lightweight, open-source text-to-speech (TTS) model designed for real-time,
        high-fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency**
        and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.

        **GitHub:** https://github.com/ekwek1/soprano
        **Model Demo:** https://huggingface.co/spaces/ekwek/Soprano-TTS
        **Model Weights:** https://huggingface.co/ekwek/Soprano-80M
        """
    )

    with gr.Row():
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here...",
                value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
                lines=5,
                max_lines=10,
            )

            with gr.Accordion("Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.3,
                    step=0.05,
                    label="Temperature",
                )

                top_p = gr.Slider(
                    minimum=0.5,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top P",
                )

                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.2,
                    step=0.1,
                    label="Repetition Penalty",
                )

            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")

        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy",
                autoplay=True,
            )

            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3,
                max_lines=10,
            )

    gr.Examples(
        examples=[
            ["Soprano is an extremely lightweight text to speech model.", 0.3, 0.95, 1.2],
            ["Hello! Welcome to Soprano text to speech.", 0.3, 0.95, 1.2],
            ["The quick brown fox jumps over the lazy dog.", 0.3, 0.95, 1.2],
            ["Artificial intelligence is transforming the world.", 0.5, 0.90, 1.2],
        ],
        inputs=[text_input, temperature, top_p, repetition_penalty],
        label="Example Prompts",
    )

    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, temperature, top_p, repetition_penalty],
        outputs=[audio_output, status_output],
    )

    # Plain string: this markdown has no placeholders, so no f-prefix needed.
    gr.Markdown(
        """
        ### Usage tips:

        - Soprano works best when each sentence is between 2 and 15 seconds long.
        - Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them.
          Best results can be achieved by converting these into their phonetic form.
          (1+1 -> one plus one, etc)
        - If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation.
          You may also change the sampling settings for more varied results.
        - Avoid improper grammar such as not using contractions, multiple spaces, etc.
        """
    )
174
 
175
def main():
    """Launch the Gradio demo with the MCP server enabled.

    Bug fix: the previous version passed theme= and css= to demo.launch(),
    but those are gr.Blocks constructor options — launch() has no such
    keyword arguments and would raise a TypeError at startup.
    """
    demo.launch(mcp_server=True)


if __name__ == "__main__":
    # Warm the model before serving so the first request doesn't pay the
    # model-loading latency.
    load_model()
    main()