Spaces:
Sleeping
Sleeping
import gradio as gr
import io
import tempfile
import numpy as np

# Optional imports for Soprano TTS (lazy load).
# Each heavy/optional dependency is probed individually so the UI can still
# start and surface a helpful gr.Error later instead of crashing at import.
try:
    import torch  # type: ignore
except Exception:  # pragma: no cover
    torch = None  # type: ignore
try:
    from soprano import SopranoTTS  # type: ignore
except Exception:  # pragma: no cover
    SopranoTTS = None  # type: ignore
try:
    from scipy.io.wavfile import write as wav_write  # type: ignore
except Exception:  # pragma: no cover
    wav_write = None  # type: ignore

# NOTE(review): io, tempfile and wav_write appear unused in this file —
# confirm against the full project before removing.

# Module-level cache for the lazily initialized model (see _init_soprano):
# "initialized" flips to True after the first successful load, "model" holds
# the shared SopranoTTS instance, "device" records where it was placed.
_SOPRANO_STATE = {"initialized": False, "device": "cpu", "model": None}
# Output sample rate in Hz; the UI header advertises 32kHz audio.
SAMPLE_RATE = 32000
def _init_soprano() -> None:
    """Lazily construct the shared Soprano model (CUDA only).

    The first successful call caches the model in ``_SOPRANO_STATE``;
    every later call returns immediately.

    Raises:
        gr.Error: if soprano-tts is not installed or no CUDA GPU is available.
    """
    if _SOPRANO_STATE["initialized"]:
        return  # already loaded once; reuse the cached model

    if SopranoTTS is None:
        raise gr.Error(
            "Soprano is not installed. Please run: pip install soprano-tts --no-deps && pip install transformers unidecode"
        )

    cuda_ok = bool(torch) and torch.cuda.is_available()
    if not cuda_ok:
        raise gr.Error("Currently running on CPU. Soprano requires a GPU.")

    target_device = "cuda"
    print(f"Using device: {target_device}")
    # 'auto' backend prefers lmdeploy when available (faster), otherwise
    # falls back to transformers.
    tts_model = SopranoTTS(backend="auto", device=target_device)
    _SOPRANO_STATE.update(
        {"initialized": True, "device": target_device, "model": tts_model}
    )
def soprano_tts(
    text: str,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
) -> tuple[int, np.ndarray] | None:
    """Generate speech from text using Soprano.

    Args:
        text: Text to synthesize; must contain non-whitespace characters.
        temperature: Sampling temperature (lower = more deterministic).
        top_p: Nucleus-sampling threshold.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        A ``(sample_rate, audio)`` tuple consumable by ``gr.Audio``.

    Raises:
        gr.Error: on empty input, missing install/GPU (via ``_init_soprano``),
            or any failure during generation.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")
    _init_soprano()
    model = _SOPRANO_STATE["model"]
    try:
        audio = model.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        # Model returns a tensor; detach first (numpy() raises on a tensor
        # that requires grad), then move to host memory for Gradio.
        audio_np = audio.detach().cpu().numpy()
        return (SAMPLE_RATE, audio_np)
    except gr.Error:
        raise  # propagate user-facing errors unchanged
    except Exception as e:
        # Chain the cause so the original traceback survives in server logs.
        raise gr.Error(f"Error during speech generation: {str(e)[:200]}...") from e
# --- Gradio UI ---
# BUG FIX: the theme was previously passed to demo.launch(), but
# Blocks.launch() has no `theme` parameter — themes belong on the
# gr.Blocks(...) constructor.
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    gr.HTML("<h1 style='text-align: center;'>Soprano-TTS</h1><p style='text-align: center;'>Powered by Soprano-80M | 32kHz High-Fidelity Audio</p>")
    with gr.Row(variant="panel"):
        temperature = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.3,
            step=0.05,
            label="Temperature",
            info="Controls randomness. Lower = more deterministic.",
        )
        top_p = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.95,
            step=0.01,
            label="Top-P",
            info="Nucleus sampling threshold.",
        )
        repetition_penalty = gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.2,
            step=0.05,
            label="Repetition Penalty",
            info="Penalizes repeated tokens.",
        )
    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
        lines=5,
    )
    generate_btn = gr.Button(
        "Generate Speech",
        variant="primary",
    )
    audio_output = gr.Audio(
        label="Generated Speech",
        autoplay=True,
    )

    # Both the button click and pressing Enter in the textbox trigger the
    # same synthesis function with the same inputs.
    generate_inputs = [text_input, temperature, top_p, repetition_penalty]
    generate_btn.click(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech",
    )
    text_input.submit(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter",
    )

if __name__ == "__main__":
    # queue() enables request queuing so long generations don't block others.
    demo.queue().launch(debug=True)