Spaces:
Sleeping
Sleeping
Update app.py
Browse files
app.py
CHANGED
|
@@ -18,8 +18,8 @@ def load_model():
|
|
| 18 |
model = SopranoTTS(
|
| 19 |
backend="auto",
|
| 20 |
device=DEVICE,
|
| 21 |
-
cache_size_mb=
|
| 22 |
-
decoder_batch_size=
|
| 23 |
)
|
| 24 |
return model
|
| 25 |
|
|
@@ -31,18 +31,22 @@ def generate_speech(
|
|
| 31 |
temperature: float = 0.3,
|
| 32 |
top_p: float = 0.95,
|
| 33 |
repetition_penalty: float = 1.2,
|
| 34 |
-
)
|
| 35 |
"""
|
| 36 |
Runs Soprano text-to-speech model with the given input text and sampling parameters.
|
| 37 |
-
|
| 38 |
Returns:
|
| 39 |
((sr, audio), status) where sr is the sample rate (default 32000), audio is the output audio as an np.ndarray, and status is the displayed output text.
|
| 40 |
"""
|
| 41 |
if not text.strip():
|
| 42 |
-
|
| 43 |
-
|
|
|
|
|
|
|
|
|
|
| 44 |
try:
|
|
|
|
| 45 |
model = load_model()
|
|
|
|
| 46 |
start_time = time.perf_counter()
|
| 47 |
|
| 48 |
audio = model.infer(
|
|
@@ -66,31 +70,29 @@ def generate_speech(
|
|
| 66 |
f"({rtf:.2f}x realtime)"
|
| 67 |
)
|
| 68 |
|
| 69 |
-
|
|
|
|
| 70 |
|
| 71 |
except Exception as e:
|
| 72 |
-
|
| 73 |
|
| 74 |
|
| 75 |
# Create Gradio interface
|
| 76 |
with gr.Blocks(title="Soprano TTS") as demo:
|
| 77 |
-
|
| 78 |
gr.Markdown(
|
| 79 |
f"""
|
| 80 |
# 🗣️ Soprano TTS
|
|
|
|
|
|
|
|
|
|
| 81 |
|
| 82 |
**Running on: {DEVICE.upper()}**
|
| 83 |
|
| 84 |
-
Soprano is an ultra-lightweight, open-source text-to-speech (TTS) model designed for real-time,
|
| 85 |
-
high-fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency**
|
| 86 |
-
and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.
|
| 87 |
-
|
| 88 |
**GitHub:** https://github.com/ekwek1/soprano
|
|
|
|
| 89 |
**Model Demo:** https://huggingface.co/spaces/ekwek/Soprano-TTS
|
| 90 |
-
**Model Weights:** https://huggingface.co/ekwek/Soprano-80M
|
| 91 |
"""
|
| 92 |
)
|
| 93 |
-
|
| 94 |
with gr.Row():
|
| 95 |
with gr.Column(scale=2):
|
| 96 |
text_input = gr.Textbox(
|
|
@@ -100,7 +102,6 @@ and up to **2000x real-time generation**, all while being easy to deploy at **<1
|
|
| 100 |
lines=5,
|
| 101 |
max_lines=10,
|
| 102 |
)
|
| 103 |
-
|
| 104 |
with gr.Accordion("Advanced Settings", open=False):
|
| 105 |
temperature = gr.Slider(
|
| 106 |
minimum=0.1,
|
|
@@ -109,7 +110,6 @@ and up to **2000x real-time generation**, all while being easy to deploy at **<1
|
|
| 109 |
step=0.05,
|
| 110 |
label="Temperature",
|
| 111 |
)
|
| 112 |
-
|
| 113 |
top_p = gr.Slider(
|
| 114 |
minimum=0.5,
|
| 115 |
maximum=1.0,
|
|
@@ -117,7 +117,6 @@ and up to **2000x real-time generation**, all while being easy to deploy at **<1
|
|
| 117 |
step=0.05,
|
| 118 |
label="Top P",
|
| 119 |
)
|
| 120 |
-
|
| 121 |
repetition_penalty = gr.Slider(
|
| 122 |
minimum=1.0,
|
| 123 |
maximum=2.0,
|
|
@@ -125,34 +124,29 @@ and up to **2000x real-time generation**, all while being easy to deploy at **<1
|
|
| 125 |
step=0.1,
|
| 126 |
label="Repetition Penalty",
|
| 127 |
)
|
| 128 |
-
|
| 129 |
generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
|
| 130 |
-
|
| 131 |
with gr.Column(scale=1):
|
| 132 |
audio_output = gr.Audio(
|
| 133 |
label="Generated Speech",
|
| 134 |
type="numpy",
|
| 135 |
autoplay=True,
|
| 136 |
)
|
| 137 |
-
|
| 138 |
status_output = gr.Textbox(
|
| 139 |
label="Status",
|
| 140 |
interactive=False,
|
| 141 |
lines=3,
|
| 142 |
max_lines=10
|
| 143 |
)
|
| 144 |
-
|
| 145 |
gr.Examples(
|
| 146 |
examples=[
|
| 147 |
["Soprano is an extremely lightweight text to speech model.", 0.3, 0.95, 1.2],
|
| 148 |
-
["Hello! Welcome to Soprano text to speech.", 0.3, 0.95, 1.2],
|
| 149 |
-
["The quick brown fox jumps over the lazy dog.", 0.3, 0.95, 1.2],
|
| 150 |
["Artificial intelligence is transforming the world.", 0.5, 0.90, 1.2],
|
|
|
|
|
|
|
| 151 |
],
|
| 152 |
inputs=[text_input, temperature, top_p, repetition_penalty],
|
| 153 |
label="Example Prompts",
|
| 154 |
)
|
| 155 |
-
|
| 156 |
generate_btn.click(
|
| 157 |
fn=generate_speech,
|
| 158 |
inputs=[text_input, temperature, top_p, repetition_penalty],
|
|
@@ -161,18 +155,19 @@ and up to **2000x real-time generation**, all while being easy to deploy at **<1
|
|
| 161 |
gr.Markdown(
|
| 162 |
f"""
|
| 163 |
### Usage tips:
|
| 164 |
-
|
| 165 |
- Soprano works best when each sentence is between 2 and 15 seconds long.
|
| 166 |
- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them.
|
| 167 |
-
|
| 168 |
-
|
| 169 |
- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation.
|
| 170 |
-
|
| 171 |
- Avoid improper grammar such as not using contractions, multiple spaces, etc.
|
| 172 |
"""
|
| 173 |
)
|
| 174 |
|
| 175 |
def main():
|
|
|
|
| 176 |
demo.launch(
|
| 177 |
mcp_server=True,
|
| 178 |
theme=gr.themes.Soft(primary_hue="green"),
|
|
|
|
| 18 |
model = SopranoTTS(
|
| 19 |
backend="auto",
|
| 20 |
device=DEVICE,
|
| 21 |
+
cache_size_mb=10000,
|
| 22 |
+
decoder_batch_size=8,
|
| 23 |
)
|
| 24 |
return model
|
| 25 |
|
|
|
|
| 31 |
temperature: float = 0.3,
|
| 32 |
top_p: float = 0.95,
|
| 33 |
repetition_penalty: float = 1.2,
|
| 34 |
+
):
|
| 35 |
"""
|
| 36 |
Runs Soprano text-to-speech model with the given input text and sampling parameters.
|
|
|
|
| 37 |
Returns:
|
| 38 |
((sr, audio), status) where sr is the sample rate (default 32000), audio is the output audio as an np.ndarray, and status is the displayed output text.
|
| 39 |
"""
|
| 40 |
if not text.strip():
|
| 41 |
+
yield None, "Please enter some text to generate speech."
|
| 42 |
+
return
|
| 43 |
+
try: print(text.split('\n')[0])
|
| 44 |
+
except: pass
|
| 45 |
+
|
| 46 |
try:
|
| 47 |
+
yield None, "⏳ Loading model..."
|
| 48 |
model = load_model()
|
| 49 |
+
yield None, "⏳ Generating audio..."
|
| 50 |
start_time = time.perf_counter()
|
| 51 |
|
| 52 |
audio = model.infer(
|
|
|
|
| 70 |
f"({rtf:.2f}x realtime)"
|
| 71 |
)
|
| 72 |
|
| 73 |
+
yield (SAMPLE_RATE, audio_int16), status
|
| 74 |
+
return
|
| 75 |
|
| 76 |
except Exception as e:
|
| 77 |
+
yield None, f"✗ Error: {str(e)}"
|
| 78 |
|
| 79 |
|
| 80 |
# Create Gradio interface
|
| 81 |
with gr.Blocks(title="Soprano TTS") as demo:
|
|
|
|
| 82 |
gr.Markdown(
|
| 83 |
f"""
|
| 84 |
# 🗣️ Soprano TTS
|
| 85 |
+
<div align="center">
|
| 86 |
+
<img width="300" height="300" alt="soprano-github" src="https://github.com/user-attachments/assets/4d612eac-23b8-44e6-8c59-d7ac14ebafd1" />
|
| 87 |
+
</div>
|
| 88 |
|
| 89 |
**Running on: {DEVICE.upper()}**
|
| 90 |
|
|
|
|
|
|
|
|
|
|
|
|
|
| 91 |
**GitHub:** https://github.com/ekwek1/soprano
|
| 92 |
+
**Model Weights:** https://huggingface.co/ekwek/Soprano-80M
|
| 93 |
**Model Demo:** https://huggingface.co/spaces/ekwek/Soprano-TTS
|
|
|
|
| 94 |
"""
|
| 95 |
)
|
|
|
|
| 96 |
with gr.Row():
|
| 97 |
with gr.Column(scale=2):
|
| 98 |
text_input = gr.Textbox(
|
|
|
|
| 102 |
lines=5,
|
| 103 |
max_lines=10,
|
| 104 |
)
|
|
|
|
| 105 |
with gr.Accordion("Advanced Settings", open=False):
|
| 106 |
temperature = gr.Slider(
|
| 107 |
minimum=0.1,
|
|
|
|
| 110 |
step=0.05,
|
| 111 |
label="Temperature",
|
| 112 |
)
|
|
|
|
| 113 |
top_p = gr.Slider(
|
| 114 |
minimum=0.5,
|
| 115 |
maximum=1.0,
|
|
|
|
| 117 |
step=0.05,
|
| 118 |
label="Top P",
|
| 119 |
)
|
|
|
|
| 120 |
repetition_penalty = gr.Slider(
|
| 121 |
minimum=1.0,
|
| 122 |
maximum=2.0,
|
|
|
|
| 124 |
step=0.1,
|
| 125 |
label="Repetition Penalty",
|
| 126 |
)
|
|
|
|
| 127 |
generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
|
|
|
|
| 128 |
with gr.Column(scale=1):
|
| 129 |
audio_output = gr.Audio(
|
| 130 |
label="Generated Speech",
|
| 131 |
type="numpy",
|
| 132 |
autoplay=True,
|
| 133 |
)
|
|
|
|
| 134 |
status_output = gr.Textbox(
|
| 135 |
label="Status",
|
| 136 |
interactive=False,
|
| 137 |
lines=3,
|
| 138 |
max_lines=10
|
| 139 |
)
|
|
|
|
| 140 |
gr.Examples(
|
| 141 |
examples=[
|
| 142 |
["Soprano is an extremely lightweight text to speech model.", 0.3, 0.95, 1.2],
|
|
|
|
|
|
|
| 143 |
["Artificial intelligence is transforming the world.", 0.5, 0.90, 1.2],
|
| 144 |
+
["I'm so excited, I can't even wait!", 0.3, 0.95, 1.2],
|
| 145 |
+
["Why don't you go ahead and try it?", 0.3, 0.95, 1.2],
|
| 146 |
],
|
| 147 |
inputs=[text_input, temperature, top_p, repetition_penalty],
|
| 148 |
label="Example Prompts",
|
| 149 |
)
|
|
|
|
| 150 |
generate_btn.click(
|
| 151 |
fn=generate_speech,
|
| 152 |
inputs=[text_input, temperature, top_p, repetition_penalty],
|
|
|
|
| 155 |
gr.Markdown(
|
| 156 |
f"""
|
| 157 |
### Usage tips:
|
| 158 |
+
- Note: Soprano is currently **English-only**. Other languages are not guaranteed to work.
|
| 159 |
- Soprano works best when each sentence is between 2 and 15 seconds long.
|
| 160 |
- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them.
|
| 161 |
+
Best results can be achieved by converting these into their phonetic form.
|
| 162 |
+
(1+1 -> one plus one, etc)
|
| 163 |
- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation.
|
| 164 |
+
You may also change the sampling settings for more varied results.
|
| 165 |
- Avoid improper grammar such as not using contractions, multiple spaces, etc.
|
| 166 |
"""
|
| 167 |
)
|
| 168 |
|
| 169 |
def main():
|
| 170 |
+
# Start Gradio interface
|
| 171 |
demo.launch(
|
| 172 |
mcp_server=True,
|
| 173 |
theme=gr.themes.Soft(primary_hue="green"),
|