"""Gradio demo application for the Soprano text-to-speech model.

Builds a small web UI (also exposed as an MCP server) that feeds text and
sampling settings to ``SopranoTTS.infer`` and plays back the generated audio.
"""

import socket
import time
import traceback

# NOTE: the relative order of the third-party imports is kept exactly as in
# the original file, in case any of them depends on import order.
import gradio as gr
import torch
from soprano import SopranoTTS
import numpy as np
import spaces

# Detect device
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Sample rate (Hz) reported to the Gradio audio component.
SAMPLE_RATE = 32000

# Largest positive value of a signed 16-bit PCM sample.
_INT16_MAX = 32767

# Lazily created SopranoTTS instance; populated by load_model().
model = None


# Initialize model
@spaces.GPU
def load_model():
    """Return the shared SopranoTTS instance, creating it on first use.

    The instance is stored in the module-level ``model`` global so later
    calls in the same process reuse it instead of constructing a new one.

    Returns:
        The ``SopranoTTS`` object configured for ``DEVICE``.
    """
    # NOTE(review): this function is wrapped in @spaces.GPU and is also called
    # from inside generate_speech, which is itself @spaces.GPU-wrapped.
    # Whether the cached global survives between requests depends on how the
    # `spaces` runtime executes decorated functions - confirm on the target
    # hardware.
    global model
    if model is None:
        model = SopranoTTS(
            backend="auto",
            device=DEVICE,
            cache_size_mb=100,
            decoder_batch_size=1,
        )
    return model


def _to_int16_pcm(samples: np.ndarray) -> np.ndarray:
    """Convert floating-point audio samples to 16-bit PCM.

    Samples are expected to be nominally within [-1.0, 1.0]. Values outside
    that range are clipped rather than left to wrap around during the integer
    cast (which would turn a slight overshoot into a full-scale click), and
    non-finite values are replaced so the cast is always well defined.
    """
    finite = np.nan_to_num(samples, nan=0.0, posinf=1.0, neginf=-1.0)
    clipped = np.clip(finite, -1.0, 1.0)
    return (clipped * _INT16_MAX).astype(np.int16)


@spaces.GPU
def generate_speech(
    text: str,
    temperature: float = 0.3,
    top_p: float = 0.95,
    repetition_penalty: float = 1.2,
) -> tuple:
    """Synthesize speech from text with the Soprano text-to-speech model.

    Args:
        text: The text to synthesize. Empty or whitespace-only input is
            rejected with an explanatory status message.
        temperature: Sampling temperature forwarded to the model
            (the UI slider allows 0.1 to 1.5).
        top_p: Top-p sampling value forwarded to the model
            (the UI slider allows 0.5 to 1.0).
        repetition_penalty: Repetition penalty forwarded to the model
            (the UI slider allows 1.0 to 2.0).

    Returns:
        ((sr, audio), status) where sr is the sample rate (default 32000),
        audio is the output audio as an int16 np.ndarray, and status is the
        displayed output text. When the input is empty or generation fails,
        the first element is None and status describes the problem.
    """
    # `text` can be None when the function is invoked programmatically, so
    # guard before calling .strip().
    if not text or not text.strip():
        return None, "Please enter some text to generate speech."

    print(text)

    try:
        tts = load_model()

        # Only the inference call is timed; model loading is excluded.
        start_time = time.perf_counter()
        audio = tts.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        gen_time = time.perf_counter() - start_time

        # detach()/float() keep the NumPy conversion valid even if the tensor
        # tracks gradients or uses a reduced-precision dtype.
        audio_np = audio.detach().cpu().float().numpy()
        audio_int16 = _to_int16_pcm(audio_np)

        audio_seconds = len(audio_np) / SAMPLE_RATE
        rtf = audio_seconds / gen_time if gen_time > 0 else float("inf")

        status = (
            f"✓ Generated {audio_seconds:.2f} s audio | "
            f"Generation time: {gen_time:.3f} s "
            f"({rtf:.2f}x realtime)"
        )
        return (SAMPLE_RATE, audio_int16), status
    except Exception as e:
        # UI boundary: show a short message to the user, but keep the full
        # traceback in the server log so the failure can be diagnosed.
        traceback.print_exc()
        return None, f"✗ Error: {e}"


# Create Gradio interface
with gr.Blocks(title="Soprano TTS") as demo:
    # Markdown bodies are written flush-left so their rendering does not
    # depend on any dedenting performed by the component.
    gr.Markdown(
        f"""
# 🗣️ Soprano TTS

**Running on: {DEVICE.upper()}**

Soprano is an ultra-lightweight, open-source text-to-speech (TTS) model designed for real-time, high-fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.

**GitHub:** https://github.com/ekwek1/soprano

**Model Demo:** https://huggingface.co/spaces/ekwek/Soprano-TTS

**Model Weights:** https://huggingface.co/ekwek/Soprano-80M
"""
    )

    with gr.Row():
        # Left column: text input, sampling settings and the trigger button.
        with gr.Column(scale=2):
            text_input = gr.Textbox(
                label="Text to Synthesize",
                placeholder="Enter text here...",
                value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
                lines=5,
                max_lines=10,
            )

            with gr.Accordion("Advanced Settings", open=False):
                temperature = gr.Slider(
                    minimum=0.1,
                    maximum=1.5,
                    value=0.3,
                    step=0.05,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.5,
                    maximum=1.0,
                    value=0.95,
                    step=0.05,
                    label="Top P",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.2,
                    step=0.1,
                    label="Repetition Penalty",
                )

            generate_btn = gr.Button("Generate Speech", variant="primary", size="lg")

        # Right column: generated audio and the status line.
        with gr.Column(scale=1):
            audio_output = gr.Audio(
                label="Generated Speech",
                type="numpy",
                autoplay=True,
            )
            status_output = gr.Textbox(
                label="Status",
                interactive=False,
                lines=3,
                max_lines=10,
            )

    # Each example row is [text, temperature, top_p, repetition_penalty].
    gr.Examples(
        examples=[
            ["Soprano is an extremely lightweight text to speech model.", 0.3, 0.95, 1.2],
            ["Hello! Welcome to Soprano text to speech.", 0.3, 0.95, 1.2],
            ["The quick brown fox jumps over the lazy dog.", 0.3, 0.95, 1.2],
            ["Artificial intelligence is transforming the world.", 0.5, 0.90, 1.2],
        ],
        inputs=[text_input, temperature, top_p, repetition_penalty],
        label="Example Prompts",
    )

    generate_btn.click(
        fn=generate_speech,
        inputs=[text_input, temperature, top_p, repetition_penalty],
        outputs=[audio_output, status_output],
    )

    gr.Markdown(
        """
### Usage tips:

- Soprano works best when each sentence is between 2 and 15 seconds long.
- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. Best results can be achieved by converting these into their phonetic form.
  (1+1 -> one plus one, etc)
- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. You may also change the sampling settings for more varied results.
- Avoid improper grammar such as not using contractions, multiple spaces, etc.
"""
    )


def main():
    """Launch the Gradio demo with the MCP server enabled and the app theme."""
    demo.launch(
        mcp_server=True,
        theme=gr.themes.Soft(primary_hue="green"),
        css="""
a {
    color: var(--primary-600);
}
a:hover {
    color: var(--primary-700);
}
""",
    )


if __name__ == "__main__":
    main()