# Hugging Face Space (status: Running)
import os
import tempfile

import gradio as gr
import torch
from sopro import SoproTTS
# Load the model a single time at import so that every request served by the
# Space reuses the same instance instead of reloading the weights.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Loading Sopro TTS on device: {device}")
tts = SoproTTS.from_pretrained("samuel-vitorino/sopro", device=device)
def generate_speech(text, ref_audio_path, temperature, top_p, style_strength):
    """Synthesize ``text`` using the reference clip and return a WAV file path.

    Args:
        text: Text to synthesize. Must contain at least one non-whitespace
            character.
        ref_audio_path: Path of the reference audio clip (the UI supplies it
            from an audio component configured with ``type="filepath"``).
        temperature: Forwarded unchanged to ``tts.synthesize``.
        top_p: Forwarded unchanged to ``tts.synthesize``.
        style_strength: Forwarded unchanged to ``tts.synthesize``.

    Returns:
        Path of a newly created temporary ``.wav`` file holding the output.

    Raises:
        gr.Error: If ``text`` is empty or blank, if no reference audio was
            provided, or if synthesis or saving fails (the underlying
            exception is chained as the cause).
    """
    # Treat whitespace-only input the same as empty input.
    if not text or not str(text).strip():
        raise gr.Error("Please enter some text to synthesize.")
    if not ref_audio_path:
        raise gr.Error("Please upload or record a reference audio file.")

    out_path = None
    try:
        # Generate the audio wave
        wav = tts.synthesize(
            text=text,
            ref_audio_path=ref_audio_path,
            temperature=temperature,
            top_p=top_p,
            style_strength=style_strength,
        )
        # Reserve a unique output path, then close our own handle before the
        # file is written by name: this avoids leaking a file descriptor per
        # request and avoids writing to a file we still hold open.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_out:
            out_path = temp_out.name
        tts.save_wav(out_path, wav)
        return out_path
    except Exception as e:
        # Do not leave an orphaned temp file behind when saving failed.
        if out_path is not None:
            try:
                os.unlink(out_path)
            except OSError:
                # Best-effort cleanup; the synthesis error below is what matters.
                pass
        raise gr.Error(f"Error during synthesis: {str(e)}") from e
# Gradio interface definition. Long user-facing texts are kept in named
# constants so the layout code below stays easy to scan.
_INTRO_MARKDOWN = (
    "A lightweight (135M parameter) text-to-speech model with zero-shot voice cloning by [Samuel Vitorino](https://github.com/samuel-vitorino/sopro). "
    "Upload a 3-12 second audio clip to clone a voice!"
)
_TEXT_PLACEHOLDER = "Enter text here... (Prefer words over abbreviations/symbols, e.g., '1 plus 2' instead of '1 + 2')"
_STYLE_INFO = "Controls the FiLM strength; increasing it can improve or reduce voice similarity."
_DISCLAIMER_MARKDOWN = (
    "### ⚠️ Disclaimers\n"
    "- Sopro can be inconsistent. If the output sounds glitchy, try tweaking the Temperature and Style Strength.\n"
    "- Voice cloning quality is highly dependent on the microphone quality and ambient noise of the reference audio.\n"
    "- Generation length is currently capped at ~32 seconds to prevent hallucinations."
)

with gr.Blocks(title="Sopro TTS - Voice Cloning") as demo:
    gr.Markdown("# 🌬️ Sopro TTS - Zero-Shot Voice Cloning")
    gr.Markdown(_INTRO_MARKDOWN)

    with gr.Row():
        # First column: everything the user provides.
        with gr.Column():
            text_input = gr.Textbox(
                label="Text to Synthesize",
                lines=4,
                placeholder=_TEXT_PLACEHOLDER,
            )
            # type="filepath" hands the handler a path to the uploaded or
            # recorded clip.
            ref_audio_input = gr.Audio(
                label="Reference Audio (3 to 12 seconds recommended)",
                type="filepath",
                sources=["upload", "microphone"],
            )
            with gr.Accordion("Advanced Parameters", open=False):
                temp_slider = gr.Slider(
                    minimum=0.1,
                    maximum=2.0,
                    value=0.8,
                    step=0.1,
                    label="Temperature",
                )
                top_p_slider = gr.Slider(
                    minimum=0.1,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top P",
                )
                style_slider = gr.Slider(
                    minimum=0.0,
                    maximum=3.0,
                    value=1.2,
                    step=0.1,
                    label="Style Strength",
                    info=_STYLE_INFO,
                )
            generate_btn = gr.Button("Generate Speech", variant="primary")

        # Second column: the synthesized result.
        with gr.Column():
            audio_output = gr.Audio(label="Generated Audio", autoplay=False)

    # Wire the button to the synthesis handler; the order of the inputs
    # matches the handler's positional parameters.
    handler_inputs = [
        text_input,
        ref_audio_input,
        temp_slider,
        top_p_slider,
        style_slider,
    ]
    generate_btn.click(
        fn=generate_speech,
        inputs=handler_inputs,
        outputs=[audio_output],
    )

    gr.Markdown(_DISCLAIMER_MARKDOWN)

if __name__ == "__main__":
    demo.launch()