"""Gradio front end for Sopro TTS zero-shot voice cloning."""

import tempfile

import gradio as gr
import torch
from sopro import SoproTTS

# Initialize the model globally so it only loads once when the Space starts
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Loading Sopro TTS on device: {device}")
tts = SoproTTS.from_pretrained("samuel-vitorino/sopro", device=device)


def generate_speech(text, ref_audio_path, temperature, top_p, style_strength):
    """Synthesize ``text`` in the voice of the reference clip.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        ref_audio_path: Filesystem path of the reference audio clip (the
            Audio input component is configured with ``type="filepath"``).
        temperature: Forwarded unchanged to ``tts.synthesize``.
        top_p: Forwarded unchanged to ``tts.synthesize``.
        style_strength: Forwarded unchanged to ``tts.synthesize``.

    Returns:
        Path of a temporary ``.wav`` file holding the generated audio.

    Raises:
        gr.Error: If the text or the reference audio is missing, or if
            synthesis or saving fails for any reason.
    """
    # Whitespace-only text carries nothing to synthesize, so it is rejected
    # together with empty/None input.
    if not text or not text.strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not ref_audio_path:
        raise gr.Error("Please upload or record a reference audio file.")

    try:
        # Generate the audio wave
        wav = tts.synthesize(
            text=text,
            ref_audio_path=ref_audio_path,
            temperature=temperature,
            top_p=top_p,
            style_strength=style_strength,
        )

        # Reserve a temporary file name for Gradio to serve. The handle is
        # closed before writing so no file descriptor leaks per request and
        # the path can be reopened on platforms that forbid a second open.
        # delete=False keeps the file on disk after the handle is closed.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_out:
            output_path = temp_out.name
        tts.save_wav(output_path, wav)
        return output_path
    except Exception as e:
        # Surface the failure in the UI while keeping the original traceback
        # chained for the server logs.
        raise gr.Error(f"Error during synthesis: {str(e)}") from e


# Define the Gradio Interface
with gr.Blocks(title="Sopro TTS - Voice Cloning") as demo:
    gr.Markdown("# 🌬️ Sopro TTS - Zero-Shot Voice Cloning")
    gr.Markdown(
        "A lightweight (135M parameter) text-to-speech model with zero-shot voice cloning by [Samuel Vitorino](https://github.com/samuel-vitorino/sopro). "
        "Upload a 3-12 second audio clip to clone a voice!"
    )

    with gr.Row():
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to Synthesize",
                lines=4,
                placeholder="Enter text here... \n(Prefer words over abbreviations/symbols, e.g., '1 plus 2' instead of '1 + 2')",
            )

            # Type 'filepath' passes the path of the uploaded file to our function
            ref_audio_input = gr.Audio(
                label="Reference Audio (3 to 12 seconds recommended)",
                type="filepath",
                sources=["upload", "microphone"],
            )

            with gr.Accordion("Advanced Parameters", open=False):
                temp_slider = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=0.8,
                    step=0.1,
                    label="Temperature",
                )
                top_p_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top P",
                )
                style_slider = gr.Slider(
                    minimum=0.0,
                    maximum=3.0,
                    value=1.2,
                    step=0.1,
                    label="Style Strength",
                    info="Controls the FiLM strength; increasing it can improve or reduce voice similarity.",
                )

            generate_btn = gr.Button("Generate Speech", variant="primary")

        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", autoplay=False)

    # Connect UI elements to the function
    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, ref_audio_input, temp_slider, top_p_slider, style_slider],
        outputs=[audio_output],
    )

    gr.Markdown(
        "### ⚠️ Disclaimers\n"
        "- Sopro can be inconsistent. If the output sounds glitchy, try tweaking the Temperature and Style Strength.\n"
        "- Voice cloning quality is highly dependent on the microphone quality and ambient noise of the reference audio.\n"
        "- Generation length is currently capped at ~32 seconds to prevent hallucinations."
    )

if __name__ == "__main__":
    demo.launch()