# Soprano-TTS Gradio demo (Hugging Face Space).
# Standard library.
import io
import tempfile

# Third-party.
import gradio as gr
import numpy as np

# Optional imports for Soprano TTS (lazy load): each dependency degrades to
# None so the UI can still start and report a helpful error at generate time.
try:
    import torch  # type: ignore
except Exception:  # pragma: no cover
    torch = None  # type: ignore

try:
    from soprano import SopranoTTS  # type: ignore
except Exception:  # pragma: no cover
    SopranoTTS = None  # type: ignore

try:
    from scipy.io.wavfile import write as wav_write  # type: ignore
except Exception:  # pragma: no cover
    wav_write = None  # type: ignore

# Lazy-initialization state shared across calls: the model is loaded once,
# on the first successful call to _init_soprano().
_SOPRANO_STATE = {"initialized": False, "device": "cpu", "model": None}

# Output sample rate of the Soprano model (32 kHz, per the UI banner).
SAMPLE_RATE = 32000
def _init_soprano() -> None:
    """Initialize the Soprano model lazily. Requires CUDA GPU."""
    # Already loaded on a previous call — nothing to do.
    if _SOPRANO_STATE["initialized"]:
        return

    # Guard: the optional import at module top may have left this as None.
    if SopranoTTS is None:
        raise gr.Error("Soprano is not installed. Please run: pip install soprano-tts --no-deps && pip install transformers unidecode")

    # Require both an importable torch AND a visible CUDA device.
    cuda_ok = bool(torch) and torch.cuda.is_available()
    if not cuda_ok:
        raise gr.Error(
            "Currently running on CPU. Soprano requires a GPU."
        )

    device = "cuda"
    print(f"Using device: {device}")

    # 'auto' backend: uses lmdeploy if available (faster), falls back to
    # transformers otherwise.
    engine = SopranoTTS(backend="auto", device=device)

    _SOPRANO_STATE.update(
        {"initialized": True, "device": device, "model": engine}
    )
def soprano_tts(
    text: str,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
) -> tuple[int, np.ndarray] | None:
    """Generate speech from text using Soprano.

    Args:
        text: Text to synthesize; must be non-empty/non-whitespace.
        temperature: Sampling temperature. Lower = more deterministic.
        top_p: Nucleus-sampling threshold.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        A ``(sample_rate, audio_array)`` tuple consumable by ``gr.Audio``.

    Raises:
        gr.Error: On empty input, missing dependency / GPU (via
            ``_init_soprano``), or any failure during inference.
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")

    # Loads the model on first use; raises gr.Error if prerequisites missing.
    _init_soprano()
    model = _SOPRANO_STATE["model"]
    try:
        audio = model.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        # Model returns a tensor; convert to numpy for Gradio.
        audio_np = audio.cpu().numpy()
        return (SAMPLE_RATE, audio_np)
    except gr.Error:
        # User-facing errors pass through unchanged.
        raise
    except Exception as e:
        # Chain the cause so the original traceback is preserved in logs
        # (the previous version discarded it).
        raise gr.Error(f"Error during speech generation: {str(e)[:200]}...") from e
# --- Gradio UI ---
# NOTE(review): `theme` was previously passed to `launch()`, which has no
# such parameter — the theme belongs on the `gr.Blocks(...)` constructor,
# so it has been moved there.
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    gr.HTML("<h1 style='text-align: center;'>Soprano-TTS</h1><p style='text-align: center;'>Powered by Soprano-80M | 32kHz High-Fidelity Audio</p>")

    # Sampling controls, shown side by side.
    with gr.Row(variant="panel"):
        temperature = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.3,
            step=0.05,
            label="Temperature",
            info="Controls randomness. Lower = more deterministic.",
        )
        top_p = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.95,
            step=0.01,
            label="Top-P",
            info="Nucleus sampling threshold.",
        )
        repetition_penalty = gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.2,
            step=0.05,
            label="Repetition Penalty",
            info="Penalizes repeated tokens.",
        )

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
        lines=5,
    )
    generate_btn = gr.Button(
        "Generate Speech",
        variant="primary",
    )
    audio_output = gr.Audio(
        label="Generated Speech",
        autoplay=True,
    )

    # Both the button click and pressing Enter in the textbox trigger
    # synthesis; each gets a distinct api_name for the REST endpoint.
    generate_inputs = [text_input, temperature, top_p, repetition_penalty]
    generate_btn.click(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech",
    )
    text_input.submit(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter",
    )

if __name__ == "__main__":
    demo.queue().launch(debug=True)