Spaces:
Sleeping
Sleeping
File size: 5,731 Bytes
7e66c78 ec2e6d6 6137274 7e66c78 ec2e6d6 7e66c78 278ac29 ec2e6d6 278ac29 c2c4056 278ac29 7e66c78 278ac29 7e66c78 6137274 ec2e6d6 c2c4056 fa7f144 ec2e6d6 fa7f144 7e66c78 c2c4056 ec2e6d6 c2c4056 ec2e6d6 c2c4056 ec2e6d6 7e66c78 ec2e6d6 7e66c78 ec2e6d6 7e66c78 ec2e6d6 7e66c78 ec2e6d6 c2c4056 ec2e6d6 c2c4056 ec2e6d6 c2c4056 ec2e6d6 2cf68bf ec2e6d6 bf7fc52 ec2e6d6 7e66c78 ec2e6d6 7e66c78 2cf68bf ec2e6d6 7e66c78 ec2e6d6 7e66c78 ec2e6d6 7e66c78 ec2e6d6 7e66c78 ec2e6d6 7e66c78 ec2e6d6 2cf68bf ec2e6d6 c2c4056 2cf68bf ec2e6d6 c2c4056 ec2e6d6 c2c4056 ec2e6d6 f0ac9ac ec2e6d6 c2c4056 ec2e6d6 7e66c78 ec2e6d6 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 |
import gradio as gr
import torch
from soprano import SopranoTTS
import numpy as np
import socket
import time
import spaces
# Detect device
# Prefer CUDA when available; the model is constructed on this device in load_model().
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
# Module-level singleton, populated lazily by load_model() on first request.
model = None
# Initialize model
@spaces.GPU
def load_model():
    """Return the shared SopranoTTS instance, constructing it on first call.

    The instance is cached in the module-level ``model`` global so repeated
    requests reuse the same loaded weights.
    """
    global model
    if model is not None:
        return model
    model = SopranoTTS(
        backend="auto",
        device=DEVICE,
        cache_size_mb=10000,
        decoder_batch_size=8,
    )
    return model
# Output sample rate in Hz — used both to compute audio duration and to label
# the (sr, waveform) tuple handed to gr.Audio. Assumed to match SopranoTTS
# output (the generate_speech docstring states 32000 as the default).
SAMPLE_RATE = 32000
@spaces.GPU
def generate_speech(
    text: str,
    temperature: float = 0.3,
    top_p: float = 0.95,
    repetition_penalty: float = 1.2,
):
    """
    Runs Soprano text-to-speech model with the given input text and sampling parameters.

    This is a generator: it yields intermediate (None, status) progress updates
    so the UI can show "loading"/"generating" states before the final result.

    Returns:
        ((sr, audio), status) where sr is the sample rate (default 32000), audio is the output audio as an np.ndarray, and status is the displayed output text.
    """
    # Guard: empty / whitespace-only input never reaches the model.
    if not text.strip():
        yield None, "Please enter some text to generate speech."
        return
    # Best-effort request logging (first line only). Narrowed from a bare
    # `except:` so SystemExit/KeyboardInterrupt are no longer swallowed;
    # logging failures still never break synthesis.
    try:
        print(text.split('\n')[0])
    except Exception:
        pass
    try:
        yield None, "⏳ Loading model..."
        model = load_model()
        yield None, "⏳ Generating audio..."
        start_time = time.perf_counter()
        audio = model.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        gen_time = time.perf_counter() - start_time
        # Convert the float waveform (presumably in [-1, 1] — per usual TTS
        # output conventions) to 16-bit PCM for gr.Audio's numpy mode.
        audio_np = audio.cpu().numpy()
        audio_int16 = (audio_np * 32767).astype(np.int16)
        audio_seconds = len(audio_np) / SAMPLE_RATE
        # Realtime factor: seconds of audio produced per second of wall time.
        rtf = audio_seconds / gen_time if gen_time > 0 else float("inf")
        status = (
            f"✓ Generated {audio_seconds:.2f} s audio | "
            f"Generation time: {gen_time:.3f} s "
            f"({rtf:.2f}x realtime)"
        )
        yield (SAMPLE_RATE, audio_int16), status
    except Exception as e:
        # Surface any synthesis failure in the status box instead of crashing
        # the Gradio worker.
        yield None, f"✗ Error: {str(e)}"
# Create Gradio interface
# Layout: two-column row (input + settings on the left, audio + status on the
# right), followed by example prompts and usage tips. `demo` is launched by
# main() at the bottom of the file.
with gr.Blocks(title="Soprano TTS") as demo:
    # Header: title, logo, runtime device banner, and project links.
    gr.Markdown(
        f"""
# 🗣️ Soprano TTS
<div align="center">
<img width="300" height="300" alt="soprano-github" src="https://github.com/user-attachments/assets/4d612eac-23b8-44e6-8c59-d7ac14ebafd1" />
</div>
**Running on: {DEVICE.upper()}**
**GitHub:** https://github.com/ekwek1/soprano
**Model Weights:** https://huggingface.co/ekwek/Soprano-1.1-80M
**Model Demo:** https://huggingface.co/spaces/ekwek/Soprano-TTS
"""
    )
    with gr.Row():
        with gr.Column(scale=2):
            # Main text input fed to generate_speech.
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here...",
                value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
                lines=5,
                max_lines=10,
            )
            # Sampling controls, collapsed by default. Slider defaults differ
            # from generate_speech's own defaults (e.g. temperature 0.0 here
            # vs 0.3 in the signature); the UI values win on button click.
            with gr.Accordion("Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.0,
                    step=0.05,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.5,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top P",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.2,
                    step=0.1,
                    label="Repetition Penalty",
                )
            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")
        with gr.Column(scale=1):
            # Output pane: numpy-mode audio player + status/progress textbox.
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy",
                autoplay=True,
            )
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3,
                max_lines=10
            )
    # Clickable example rows: [text, temperature, top_p, repetition_penalty].
    gr.Examples(
        examples=[
            ["Soprano is an extremely lightweight text to speech model.", 0.0, 0.95, 1.2],
            ["Artificial intelligence is transforming the world.", 0.0, 0.90, 1.2],
            ["I'm so excited, I can't even wait!", 0.0, 0.95, 1.2],
            ["Why don't you go ahead and try it?", 0.0, 0.95, 1.2],
        ],
        inputs=[text_input, temperature, top_p, repetition_penalty],
        label="Example Prompts",
    )
    # Wire the button to the generator; intermediate yields drive the status box.
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, temperature, top_p, repetition_penalty],
        outputs=[audio_output, status_output],
    )
    gr.Markdown(
        f"""
### Usage tips:
- Note: Soprano is currently **English-only**. Other languages are not guaranteed to work.
- Soprano works best when each sentence is between 2 and 30 seconds long.
- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them.
Best results can be achieved by converting these into their phonetic form.
(1+1 -> one plus one, etc)
- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation.
You may also change the sampling settings for more varied results.
- Avoid improper grammar such as not using contractions, multiple spaces, etc.
"""
    )
def main():
    """Entry point: launch the Gradio app (MCP server enabled)."""
    # Start Gradio interface
    # NOTE(review): `theme` and `css` are normally parameters of the
    # `gr.Blocks(...)` constructor, not of `launch()` — verify against the
    # pinned Gradio version that launch() accepts them; otherwise they may
    # raise a TypeError or be silently ignored, and should be moved to the
    # Blocks constructor above.
    demo.launch(
        mcp_server=True,
        theme=gr.themes.Soft(primary_hue="green"),
        css="""
a {
    color: var(--primary-600);
}
a:hover {
    color: var(--primary-700);
}
"""
    )
if __name__ == "__main__":
    main()