import gradio as gr
import io
import tempfile
import numpy as np

# Optional imports for Soprano TTS (lazy load): keep the app importable even
# when the heavy/optional dependencies are missing, and fail with a friendly
# gr.Error at request time instead.
try:
    import torch  # type: ignore
except Exception:  # pragma: no cover
    torch = None  # type: ignore

try:
    from soprano import SopranoTTS  # type: ignore
except Exception:  # pragma: no cover
    SopranoTTS = None  # type: ignore

try:
    from scipy.io.wavfile import write as wav_write  # type: ignore
except Exception:  # pragma: no cover
    wav_write = None  # type: ignore

# Lazily-populated, process-wide model state (the model is loaded once on the
# first request and reused afterwards).
_SOPRANO_STATE = {"initialized": False, "device": "cpu", "model": None}

# Soprano produces 32 kHz audio; this rate is handed to gr.Audio.
SAMPLE_RATE = 32000


def _init_soprano() -> None:
    """Initialize the Soprano model lazily. Requires a CUDA GPU.

    Populates ``_SOPRANO_STATE`` on first call; subsequent calls are no-ops.

    Raises:
        gr.Error: If ``soprano-tts`` is not installed, or no CUDA device
            is available.
    """
    if _SOPRANO_STATE["initialized"]:
        return
    if SopranoTTS is None:
        raise gr.Error(
            "Soprano is not installed. Please run: "
            "pip install soprano-tts --no-deps && pip install transformers unidecode"
        )
    # Explicit `is None` check — relying on module truthiness is fragile.
    if torch is None or not torch.cuda.is_available():
        raise gr.Error("Currently running on CPU. Soprano requires a GPU.")
    device = "cuda"
    print(f"Using device: {device}")
    # Use 'auto' backend: uses lmdeploy if available (faster), falls back to
    # transformers.
    model = SopranoTTS(
        backend="auto",
        device=device,
    )
    _SOPRANO_STATE.update({"initialized": True, "device": device, "model": model})


def soprano_tts(
    text: str,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
) -> tuple[int, np.ndarray] | None:
    """Generate speech from text using Soprano.

    Args:
        text: Text to synthesize; must be non-empty/non-whitespace.
        temperature: Sampling temperature (lower = more deterministic).
        top_p: Nucleus-sampling threshold.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        A ``(sample_rate, waveform)`` tuple consumable by ``gr.Audio``.

    Raises:
        gr.Error: On empty input, missing dependencies/GPU, or any failure
            during synthesis.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")

    _init_soprano()
    model = _SOPRANO_STATE["model"]
    try:
        audio = model.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        # Model returns a tensor; convert to numpy for Gradio.
        audio_np = audio.cpu().numpy()
        return (SAMPLE_RATE, audio_np)
    except gr.Error:
        raise
    except Exception as e:
        # Keep backend tracebacks short; add the ellipsis only when the
        # message was actually truncated (the original appended it always).
        msg = str(e)
        if len(msg) > 200:
            msg = msg[:200] + "..."
        raise gr.Error(f"Error during speech generation: {msg}")


# --- Gradio UI ---
# NOTE: `theme` must be passed to gr.Blocks(...), not to launch() — launch()
# has no `theme` parameter and would raise a TypeError.
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    # NOTE(review): the original HTML banner markup was garbled in this copy
    # of the file; the visible text is preserved below — confirm the exact
    # markup against the deployed app.
    gr.HTML(
        "<div style='text-align: center;'>"
        "Powered by Soprano-80M | 32kHz High-Fidelity Audio"
        "</div>"
    )

    with gr.Row(variant="panel"):
        temperature = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.3,
            step=0.05,
            label="Temperature",
            info="Controls randomness. Lower = more deterministic.",
        )
        top_p = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.95,
            step=0.01,
            label="Top-P",
            info="Nucleus sampling threshold.",
        )
        repetition_penalty = gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.2,
            step=0.05,
            label="Repetition Penalty",
            info="Penalizes repeated tokens.",
        )

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
        lines=5,
    )
    generate_btn = gr.Button(
        "Generate Speech",
        variant="primary",
    )
    audio_output = gr.Audio(
        label="Generated Speech",
        autoplay=True,
    )

    # Button click and textbox Enter share the same handler/inputs/outputs;
    # they only differ in api_name.
    generate_inputs = [text_input, temperature, top_p, repetition_penalty]
    generate_btn.click(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech",
    )
    text_input.submit(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter",
    )

if __name__ == "__main__":
    demo.queue().launch(debug=True)