sopro-test / app.py
arjunbroepic's picture
Update app.py
1870bb0 verified
import gradio as gr
import tempfile
import torch
from sopro import SoproTTS
# Load the model at import time so a single instance is created when the
# Space starts and is shared by every request.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Loading Sopro TTS on device: {device}")
tts = SoproTTS.from_pretrained("samuel-vitorino/sopro", device=device)
def generate_speech(text, ref_audio_path, temperature, top_p, style_strength):
    """Synthesize ``text`` using a reference clip and return a WAV file path.

    Args:
        text: Text to synthesize; must contain at least one non-whitespace
            character.
        ref_audio_path: Filesystem path of the reference audio clip.
        temperature: Forwarded unchanged to ``tts.synthesize``.
        top_p: Forwarded unchanged to ``tts.synthesize``.
        style_strength: Forwarded unchanged to ``tts.synthesize``.

    Returns:
        Path of a newly created temporary ``.wav`` file containing the
        generated audio. The file is not deleted here; the path is handed
        back to Gradio to serve.

    Raises:
        gr.Error: If the text or reference audio is missing, or if synthesis
            or saving the result fails.
    """
    # Whitespace-only text has nothing to speak, so reject it together with
    # empty/None input before touching the model.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not ref_audio_path:
        raise gr.Error("Please upload or record a reference audio file.")

    try:
        # Generate the audio wave
        wav = tts.synthesize(
            text=text,
            ref_audio_path=ref_audio_path,
            temperature=temperature,
            top_p=top_p,
            style_strength=style_strength,
        )
        # Reserve a unique output path, then close our own handle before the
        # model writes to it: this avoids leaking a file descriptor per
        # request and keeps the path writable on platforms that forbid
        # opening a file twice.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_out:
            output_path = temp_out.name
        tts.save_wav(output_path, wav)
    except Exception as e:
        # Surface any failure in the UI while keeping the original traceback
        # attached as the cause for server-side debugging.
        raise gr.Error(f"Error during synthesis: {e}") from e
    return output_path
# Build the Gradio UI: inputs on the left, generated audio on the right.
with gr.Blocks(title="Sopro TTS - Voice Cloning") as demo:
    gr.Markdown("# 🌬️ Sopro TTS - Zero-Shot Voice Cloning")
    gr.Markdown(
        "A lightweight (135M parameter) text-to-speech model with zero-shot voice cloning by [Samuel Vitorino](https://github.com/samuel-vitorino/sopro). "
        "Upload a 3-12 second audio clip to clone a voice!"
    )

    with gr.Row():
        # Left column: text, reference clip, sampling controls, trigger.
        with gr.Column():
            text_input = gr.Textbox(
                placeholder="Enter text here... (Prefer words over abbreviations/symbols, e.g., '1 plus 2' instead of '1 + 2')",
                lines=4,
                label="Text to Synthesize",
            )

            # type="filepath" so the handler receives the uploaded or
            # recorded clip as a path.
            ref_audio_input = gr.Audio(
                sources=["upload", "microphone"],
                type="filepath",
                label="Reference Audio (3 to 12 seconds recommended)",
            )

            with gr.Accordion("Advanced Parameters", open=False):
                temp_slider = gr.Slider(
                    label="Temperature",
                    minimum=0.1,
                    maximum=2.0,
                    step=0.1,
                    value=0.8,
                )
                top_p_slider = gr.Slider(
                    label="Top P",
                    minimum=0.1,
                    maximum=1.0,
                    step=0.05,
                    value=0.95,
                )
                style_slider = gr.Slider(
                    label="Style Strength",
                    info="Controls the FiLM strength; increasing it can improve or reduce voice similarity.",
                    minimum=0.0,
                    maximum=3.0,
                    step=0.1,
                    value=1.2,
                )

            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Right column: the synthesized result.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", autoplay=False)

    # Wire the button to the synthesis handler; the input order matches the
    # handler's parameter order.
    generate_btn.click(
        fn=generate_speech,
        inputs=[
            text_input,
            ref_audio_input,
            temp_slider,
            top_p_slider,
            style_slider,
        ],
        outputs=[audio_output],
    )

    gr.Markdown(
        "\n".join(
            [
                "### ⚠️ Disclaimers",
                "- Sopro can be inconsistent. If the output sounds glitchy, try tweaking the Temperature and Style Strength.",
                "- Voice cloning quality is highly dependent on the microphone quality and ambient noise of the reference audio.",
                "- Generation length is currently capped at ~32 seconds to prevent hallucinations.",
            ]
        )
    )

# Only start the server when this file is run as a script.
if __name__ == "__main__":
    demo.launch()