Soprano-80M

Sleeping

App Files Files Community

Nymbo committed on Dec 22, 2025

Commit

9610d29

verified ·

1 Parent(s): f54b39e

Create app.py

Browse files

Files changed (1) hide show

app.py +180 -0

app.py ADDED Viewed

	@@ -0,0 +1,180 @@

+import gradio as gr
+import io
+import tempfile
+import numpy as np
+# Optional imports for Soprano TTS (lazy load)
+try:
+    import torch  # type: ignore
+except Exception:  # pragma: no cover
+    torch = None  # type: ignore
+try:
+    from soprano import SopranoTTS  # type: ignore
+except Exception:  # pragma: no cover
+    SopranoTTS = None  # type: ignore
+try:
+    from scipy.io.wavfile import write as wav_write  # type: ignore
+except Exception:  # pragma: no cover
+    wav_write = None  # type: ignore
+# Module-level cache for the lazily created model; populated by _init_soprano().
+_SOPRANO_STATE = {"initialized": False, "device": "cpu", "model": None}
+# Sample rate (Hz) returned alongside generated audio and used when writing WAV files.
+SAMPLE_RATE = 32000
+def _init_soprano() -> None:
+    """Initialize the Soprano model lazily. Requires CUDA GPU."""
+    if _SOPRANO_STATE["initialized"]:
+        return
+    if SopranoTTS is None:
+        raise gr.Error("Soprano is not installed. Please run: pip install soprano-tts --no-deps && pip install transformers unidecode")
+    if not torch or not torch.cuda.is_available():
+        raise gr.Error(
+            "Soprano requires a CUDA GPU. PyTorch CUDA not detected. "
+            "Please install CUDA-enabled PyTorch: pip install torch --index-url https://download.pytorch.org/whl/cu121"
+        )
+    device = "cuda"
+    print(f"Using device: {device}")
+    # Use 'transformers' backend for compatibility (lmdeploy requires ray which isn't on Windows)
+    model = SopranoTTS(
+        backend="transformers",
+        device=device,
+    )
+    _SOPRANO_STATE.update({"initialized": True, "device": device, "model": model})
+def soprano_tts(
+    text: str,
+    temperature: float,
+    top_p: float,
+    repetition_penalty: float,
+) -> tuple[int, np.ndarray] | None:
+    """Generate speech from text using Soprano."""
+    if not text or not text.strip():
+        raise gr.Error("Please enter text to synthesize.")
+    _init_soprano()
+    model = _SOPRANO_STATE["model"]
+    try:
+        audio = model.infer(
+            text,
+            temperature=temperature,
+            top_p=top_p,
+            repetition_penalty=repetition_penalty,
+        )
+        # Model returns a tensor; convert to numpy
+        audio_np = audio.cpu().numpy()
+        return (SAMPLE_RATE, audio_np)
+    except gr.Error:
+        raise
+    except Exception as e:
+        raise gr.Error(f"Error during speech generation: {str(e)[:200]}...")
+def save_audio(audio_np: np.ndarray | None) -> str | None:
+    """Save audio to a temporary WAV file for download."""
+    if audio_np is None or len(audio_np) == 0:
+        return None
+    if wav_write is None:
+        raise gr.Error("scipy is not installed. Please run: pip install scipy")
+    import os
+    fd, path = tempfile.mkstemp(suffix=".wav")
+    os.close(fd)
+    wav_write(path, SAMPLE_RATE, audio_np)
+    return path
+# --- Gradio UI ---
+with gr.Blocks() as demo:
+    state_audio = gr.State(None)
+    gr.HTML("<h1 style='text-align: center;'>Soprano-TTS</h1><p style='text-align: center;'>Powered by Soprano-80M | 32kHz High-Fidelity Audio</p>")
+    gr.Markdown(
+        "**Usage tips:**\n"
+        "- Soprano works best when each sentence is between 2 and 15 seconds long.\n"
+        "- Convert numbers and special characters to phonetic form for best results (e.g., `1+1` → `one plus one`).\n"
+        "- If results are unsatisfactory, regenerate or adjust sampling settings.\n"
+        "- Avoid improper grammar such as missing contractions or multiple spaces."
+    )
+    with gr.Row(variant="panel"):
+        temperature = gr.Slider(
+            minimum=0.0,
+            maximum=1.0,
+            value=0.3,
+            step=0.05,
+            label="Temperature",
+            info="Controls randomness. Lower = more deterministic.",
+        )
+        top_p = gr.Slider(
+            minimum=0.0,
+            maximum=1.0,
+            value=0.95,
+            step=0.01,
+            label="Top-P",
+            info="Nucleus sampling threshold.",
+        )
+        repetition_penalty = gr.Slider(
+            minimum=1.0,
+            maximum=2.0,
+            value=1.2,
+            step=0.05,
+            label="Repetition Penalty",
+            info="Penalizes repeated tokens.",
+        )
+    text_input = gr.Textbox(
+        label="Input Text",
+        placeholder="Enter the text you want to convert to speech here...",
+        value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
+        lines=5,
+    )
+    generate_btn = gr.Button(
+        "Generate Speech",
+        variant="primary",
+    )
+    audio_output = gr.Audio(
+        label="Generated Speech",
+        autoplay=True,
+    )
+    download_btn = gr.Button("Download Audio")
+    download_file = gr.File(label="Download")
+    generate_inputs = [text_input, temperature, top_p, repetition_penalty]
+    def generate_and_store(text, temperature, top_p, repetition_penalty):
+        result = soprano_tts(text, temperature, top_p, repetition_penalty)
+        if result:
+            return result, result[1]  # Return audio tuple and numpy array for state
+        return None, None
+    generate_btn.click(
+        fn=generate_and_store,
+        inputs=generate_inputs,
+        outputs=[audio_output, state_audio],
+        api_name="generate_speech",
+    )
+    text_input.submit(
+        fn=generate_and_store,
+        inputs=generate_inputs,
+        outputs=[audio_output, state_audio],
+        api_name="generate_speech_enter",
+    )
+    download_btn.click(
+        fn=save_audio,
+        inputs=[state_audio],
+        outputs=[download_file],
+    )
+if __name__ == "__main__":
+    demo.queue().launch(debug=True, theme="Nymbo/Nymbo_Theme")