# Soprano-80M / app.py
# Nymbo's picture
# Update app.py
# 2164beb verified
import gradio as gr
import io
import tempfile
import numpy as np

# Optional imports for Soprano TTS (lazy load).
# Each heavy/optional dependency is imported defensively: if it is missing,
# the corresponding name is set to None and _init_soprano() reports a
# friendly gr.Error instead of the app crashing at import time.
try:
    import torch  # type: ignore
except Exception:  # pragma: no cover
    torch = None  # type: ignore
try:
    from soprano import SopranoTTS  # type: ignore
except Exception:  # pragma: no cover
    SopranoTTS = None  # type: ignore
try:
    from scipy.io.wavfile import write as wav_write  # type: ignore
except Exception:  # pragma: no cover
    wav_write = None  # type: ignore

# Module-level singleton tracking the lazily-initialized Soprano model:
#   initialized -- True once _init_soprano() has succeeded
#   device      -- device the model was loaded on ("cpu" until init)
#   model       -- the SopranoTTS instance (None until init)
_SOPRANO_STATE = {"initialized": False, "device": "cpu", "model": None}
# Output sample rate in Hz; matches the "32kHz" claim in the UI header below.
# NOTE(review): confirm against the Soprano model's documentation.
SAMPLE_RATE = 32000
def _init_soprano() -> None:
    """Lazily construct the Soprano TTS model; no-op after first success.

    On success the model is cached in the module-level ``_SOPRANO_STATE``
    dict, so every later call returns immediately.

    Raises:
        gr.Error: If the ``soprano`` package is not installed, or no
            CUDA-capable GPU is available.
    """
    # Fast path: already initialized on a previous call.
    if _SOPRANO_STATE["initialized"]:
        return

    # Guard: the soprano package is an optional dependency (see imports).
    if SopranoTTS is None:
        raise gr.Error("Soprano is not installed. Please run: pip install soprano-tts --no-deps && pip install transformers unidecode")

    # Guard: Soprano only runs on a CUDA device.
    if not torch or not torch.cuda.is_available():
        raise gr.Error(
            "Currently running on CPU. Soprano requires a GPU."
        )

    device = "cuda"
    print(f"Using device: {device}")
    # 'auto' backend prefers lmdeploy when installed (faster) and falls
    # back to the transformers backend otherwise.
    tts_model = SopranoTTS(backend="auto", device=device)
    _SOPRANO_STATE.update({"initialized": True, "device": device, "model": tts_model})
def soprano_tts(
    text: str,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
) -> tuple[int, np.ndarray] | None:
    """Synthesize speech for *text* with the Soprano model.

    Args:
        text: Text to convert to speech; must be non-empty.
        temperature: Sampling temperature (lower = more deterministic).
        top_p: Nucleus-sampling threshold.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        A ``(sample_rate, waveform)`` tuple suitable for ``gr.Audio``.

    Raises:
        gr.Error: On empty input, failed model init, or any inference error
            (wrapped with a truncated message).
    """
    # Reject empty / whitespace-only input up front.
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")

    _init_soprano()
    model = _SOPRANO_STATE["model"]

    try:
        waveform = model.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        # infer() returns a torch tensor; gr.Audio expects a numpy array.
        # Conversion stays inside the try so its failures are wrapped too.
        return (SAMPLE_RATE, waveform.cpu().numpy())
    except gr.Error:
        # Our own user-facing errors pass through untouched.
        raise
    except Exception as e:
        raise gr.Error(f"Error during speech generation: {str(e)[:200]}...")
# --- Gradio UI ---
# BUG FIX: `theme` is a gr.Blocks constructor argument, not a launch()
# argument. Passing it to launch() either silently does nothing or raises
# a TypeError on recent Gradio versions, so the custom theme was never
# applied. It now goes to gr.Blocks(theme=...) below.
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    gr.HTML("<h1 style='text-align: center;'>Soprano-TTS</h1><p style='text-align: center;'>Powered by Soprano-80M | 32kHz High-Fidelity Audio</p>")

    # Sampling controls, laid out side by side.
    with gr.Row(variant="panel"):
        temperature = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.3,
            step=0.05,
            label="Temperature",
            info="Controls randomness. Lower = more deterministic.",
        )
        top_p = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.95,
            step=0.01,
            label="Top-P",
            info="Nucleus sampling threshold.",
        )
        repetition_penalty = gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.2,
            step=0.05,
            label="Repetition Penalty",
            info="Penalizes repeated tokens.",
        )

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
        lines=5,
    )
    generate_btn = gr.Button(
        "Generate Speech",
        variant="primary",
    )
    audio_output = gr.Audio(
        label="Generated Speech",
        autoplay=True,
    )

    # Both the button click and pressing Enter in the textbox trigger the
    # same synthesis function; distinct api_name values expose two API
    # endpoints.
    generate_inputs = [text_input, temperature, top_p, repetition_penalty]
    generate_btn.click(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech",
    )
    text_input.submit(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter",
    )

if __name__ == "__main__":
    # queue() enables request queuing (required for ZeroGPU-style Spaces).
    demo.queue().launch(debug=True)