ekwek committed on
Commit
c2c4056
·
verified ·
1 Parent(s): 7a89eff

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +24 -29
app.py CHANGED
@@ -18,8 +18,8 @@ def load_model():
18
  model = SopranoTTS(
19
  backend="auto",
20
  device=DEVICE,
21
- cache_size_mb=100,
22
- decoder_batch_size=1,
23
  )
24
  return model
25
 
@@ -31,18 +31,22 @@ def generate_speech(
31
  temperature: float = 0.3,
32
  top_p: float = 0.95,
33
  repetition_penalty: float = 1.2,
34
- ) -> tuple:
35
  """
36
  Runs Soprano text-to-speech model with the given input text and sampling parameters.
37
-
38
  Returns:
39
  ((sr, audio), status) where sr is the sample rate (default 32000), audio is the output audio as an np.ndarray, and status is the displayed output text.
40
  """
41
  if not text.strip():
42
- return None, "Please enter some text to generate speech."
43
- print(text)
 
 
 
44
  try:
 
45
  model = load_model()
 
46
  start_time = time.perf_counter()
47
 
48
  audio = model.infer(
@@ -66,31 +70,29 @@ def generate_speech(
66
  f"({rtf:.2f}x realtime)"
67
  )
68
 
69
- return (SAMPLE_RATE, audio_int16), status
 
70
 
71
  except Exception as e:
72
- return None, f"✗ Error: {str(e)}"
73
 
74
 
75
  # Create Gradio interface
76
  with gr.Blocks(title="Soprano TTS") as demo:
77
-
78
  gr.Markdown(
79
  f"""
80
  # 🗣️ Soprano TTS
 
 
 
81
 
82
  **Running on: {DEVICE.upper()}**
83
 
84
- Soprano is an ultra-lightweight, open-source text-to-speech (TTS) model designed for real-time,
85
- high-fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency**
86
- and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.
87
-
88
  **GitHub:** https://github.com/ekwek1/soprano
 
89
  **Model Demo:** https://huggingface.co/spaces/ekwek/Soprano-TTS
90
- **Model Weights:** https://huggingface.co/ekwek/Soprano-80M
91
  """
92
  )
93
-
94
  with gr.Row():
95
  with gr.Column(scale=2):
96
  text_input = gr.Textbox(
@@ -100,7 +102,6 @@ and up to **2000x real-time generation**, all while being easy to deploy at **<1
100
  lines=5,
101
  max_lines=10,
102
  )
103
-
104
  with gr.Accordion("Advanced Settings", open=False):
105
  temperature = gr.Slider(
106
  minimum=0.1,
@@ -109,7 +110,6 @@ and up to **2000x real-time generation**, all while being easy to deploy at **<1
109
  step=0.05,
110
  label="Temperature",
111
  )
112
-
113
  top_p = gr.Slider(
114
  minimum=0.5,
115
  maximum=1.0,
@@ -117,7 +117,6 @@ and up to **2000x real-time generation**, all while being easy to deploy at **<1
117
  step=0.05,
118
  label="Top P",
119
  )
120
-
121
  repetition_penalty = gr.Slider(
122
  minimum=1.0,
123
  maximum=2.0,
@@ -125,34 +124,29 @@ and up to **2000x real-time generation**, all while being easy to deploy at **<1
125
  step=0.1,
126
  label="Repetition Penalty",
127
  )
128
-
129
  generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
130
-
131
  with gr.Column(scale=1):
132
  audio_output = gr.Audio(
133
  label="Generated Speech",
134
  type="numpy",
135
  autoplay=True,
136
  )
137
-
138
  status_output = gr.Textbox(
139
  label="Status",
140
  interactive=False,
141
  lines=3,
142
  max_lines=10
143
  )
144
-
145
  gr.Examples(
146
  examples=[
147
  ["Soprano is an extremely lightweight text to speech model.", 0.3, 0.95, 1.2],
148
- ["Hello! Welcome to Soprano text to speech.", 0.3, 0.95, 1.2],
149
- ["The quick brown fox jumps over the lazy dog.", 0.3, 0.95, 1.2],
150
  ["Artificial intelligence is transforming the world.", 0.5, 0.90, 1.2],
 
 
151
  ],
152
  inputs=[text_input, temperature, top_p, repetition_penalty],
153
  label="Example Prompts",
154
  )
155
-
156
  generate_btn.click(
157
  fn=generate_speech,
158
  inputs=[text_input, temperature, top_p, repetition_penalty],
@@ -161,18 +155,19 @@ and up to **2000x real-time generation**, all while being easy to deploy at **<1
161
  gr.Markdown(
162
  f"""
163
  ### Usage tips:
164
-
165
  - Soprano works best when each sentence is between 2 and 15 seconds long.
166
  - Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them.
167
- Best results can be achieved by converting these into their phonetic form.
168
- (1+1 -> one plus one, etc)
169
  - If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation.
170
- You may also change the sampling settings for more varied results.
171
  - Avoid improper grammar such as not using contractions, multiple spaces, etc.
172
  """
173
  )
174
 
175
  def main():
 
176
  demo.launch(
177
  mcp_server=True,
178
  theme=gr.themes.Soft(primary_hue="green"),
 
18
  model = SopranoTTS(
19
  backend="auto",
20
  device=DEVICE,
21
+ cache_size_mb=10000,
22
+ decoder_batch_size=8,
23
  )
24
  return model
25
 
 
31
  temperature: float = 0.3,
32
  top_p: float = 0.95,
33
  repetition_penalty: float = 1.2,
34
+ ):
35
  """
36
  Runs Soprano text-to-speech model with the given input text and sampling parameters.
 
37
  Returns:
38
  ((sr, audio), status) where sr is the sample rate (default 32000), audio is the output audio as an np.ndarray, and status is the displayed output text.
39
  """
40
  if not text.strip():
41
+ yield None, "Please enter some text to generate speech."
42
+ return
43
+ try: print(text.split('\n')[0])
44
+ except: pass
45
+
46
  try:
47
+ yield None, "⏳ Loading model..."
48
  model = load_model()
49
+ yield None, "⏳ Generating audio..."
50
  start_time = time.perf_counter()
51
 
52
  audio = model.infer(
 
70
  f"({rtf:.2f}x realtime)"
71
  )
72
 
73
+ yield (SAMPLE_RATE, audio_int16), status
74
+ return
75
 
76
  except Exception as e:
77
+ yield None, f"✗ Error: {str(e)}"
78
 
79
 
80
  # Create Gradio interface
81
  with gr.Blocks(title="Soprano TTS") as demo:
 
82
  gr.Markdown(
83
  f"""
84
  # 🗣️ Soprano TTS
85
+ <div align="center">
86
+ <img width="300" height="300" alt="soprano-github" src="https://github.com/user-attachments/assets/4d612eac-23b8-44e6-8c59-d7ac14ebafd1" />
87
+ </div>
88
 
89
  **Running on: {DEVICE.upper()}**
90
 
 
 
 
 
91
  **GitHub:** https://github.com/ekwek1/soprano
92
+ **Model Weights:** https://huggingface.co/ekwek/Soprano-80M
93
  **Model Demo:** https://huggingface.co/spaces/ekwek/Soprano-TTS
 
94
  """
95
  )
 
96
  with gr.Row():
97
  with gr.Column(scale=2):
98
  text_input = gr.Textbox(
 
102
  lines=5,
103
  max_lines=10,
104
  )
 
105
  with gr.Accordion("Advanced Settings", open=False):
106
  temperature = gr.Slider(
107
  minimum=0.1,
 
110
  step=0.05,
111
  label="Temperature",
112
  )
 
113
  top_p = gr.Slider(
114
  minimum=0.5,
115
  maximum=1.0,
 
117
  step=0.05,
118
  label="Top P",
119
  )
 
120
  repetition_penalty = gr.Slider(
121
  minimum=1.0,
122
  maximum=2.0,
 
124
  step=0.1,
125
  label="Repetition Penalty",
126
  )
 
127
  generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
 
128
  with gr.Column(scale=1):
129
  audio_output = gr.Audio(
130
  label="Generated Speech",
131
  type="numpy",
132
  autoplay=True,
133
  )
 
134
  status_output = gr.Textbox(
135
  label="Status",
136
  interactive=False,
137
  lines=3,
138
  max_lines=10
139
  )
 
140
  gr.Examples(
141
  examples=[
142
  ["Soprano is an extremely lightweight text to speech model.", 0.3, 0.95, 1.2],
 
 
143
  ["Artificial intelligence is transforming the world.", 0.5, 0.90, 1.2],
144
+ ["I'm so excited, I can't even wait!", 0.3, 0.95, 1.2],
145
+ ["Why don't you go ahead and try it?", 0.3, 0.95, 1.2],
146
  ],
147
  inputs=[text_input, temperature, top_p, repetition_penalty],
148
  label="Example Prompts",
149
  )
 
150
  generate_btn.click(
151
  fn=generate_speech,
152
  inputs=[text_input, temperature, top_p, repetition_penalty],
 
155
  gr.Markdown(
156
  f"""
157
  ### Usage tips:
158
+ - Note: Soprano is currently **English-only**. Other languages are not guaranteed to work.
159
  - Soprano works best when each sentence is between 2 and 15 seconds long.
160
  - Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them.
161
+ Best results can be achieved by converting these into their phonetic form.
162
+ (1+1 -> one plus one, etc)
163
  - If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation.
164
+ You may also change the sampling settings for more varied results.
165
  - Avoid improper grammar such as not using contractions, multiple spaces, etc.
166
  """
167
  )
168
 
169
  def main():
170
+ # Start Gradio interface
171
  demo.launch(
172
  mcp_server=True,
173
  theme=gr.themes.Soft(primary_hue="green"),