# (Removed non-source extraction residue: a file-size header, stray commit
# hashes, and a run of bare line numbers that are not part of the program.)
import gradio as gr
import io
import tempfile
import numpy as np

# NOTE(review): io, tempfile, and wav_write are imported but never referenced
# in this file — presumably leftovers from an earlier save-to-disk code path;
# confirm before removing.

# Optional imports for Soprano TTS (lazy load).  Each falls back to None so
# the UI can still start and show a friendly error instead of crashing on
# import when the dependency is absent.
try:
    import torch  # type: ignore
except Exception:  # pragma: no cover
    torch = None  # type: ignore
try:
    from soprano import SopranoTTS  # type: ignore
except Exception:  # pragma: no cover
    SopranoTTS = None  # type: ignore
try:
    from scipy.io.wavfile import write as wav_write  # type: ignore
except Exception:  # pragma: no cover
    wav_write = None  # type: ignore

# Module-level singleton for the lazily-created model.  "initialized" flips to
# True after the first successful _init_soprano() call; "model" then holds the
# SopranoTTS instance and "device" the torch device string.
_SOPRANO_STATE = {"initialized": False, "device": "cpu", "model": None}

# Output sample rate in Hz (the UI banner advertises 32 kHz audio).
SAMPLE_RATE = 32000


def _init_soprano() -> None:
    """Lazily construct the Soprano TTS model on first use.

    A no-op once initialization has succeeded.  Raises ``gr.Error`` when the
    soprano package is missing or when no CUDA device is available — the
    model is GPU-only.
    """
    # Fast path: already built on a previous call.
    if _SOPRANO_STATE["initialized"]:
        return

    # Guard 1: the optional import at module top may have failed.
    if SopranoTTS is None:
        raise gr.Error("Soprano is not installed. Please run: pip install soprano-tts --no-deps && pip install transformers unidecode")

    # Guard 2: CUDA is mandatory for inference.
    if not torch or not torch.cuda.is_available():
        raise gr.Error(
            "Currently running on CPU. Soprano requires a GPU."
        )

    device = "cuda"
    print(f"Using device: {device}")

    # 'auto' backend prefers lmdeploy when installed (faster) and otherwise
    # falls back to plain transformers.
    tts_model = SopranoTTS(backend="auto", device=device)

    _SOPRANO_STATE["model"] = tts_model
    _SOPRANO_STATE["device"] = device
    _SOPRANO_STATE["initialized"] = True


def soprano_tts(
    text: str,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
) -> tuple[int, np.ndarray] | None:
    """Generate speech from text using Soprano.

    Args:
        text: The text to synthesize; must be non-empty after stripping.
        temperature: Sampling temperature (lower = more deterministic).
        top_p: Nucleus-sampling threshold.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        A ``(sample_rate, waveform)`` tuple suitable for ``gr.Audio``.

    Raises:
        gr.Error: On empty input, missing/uninitializable model, or any
            failure during generation (with the cause chained).
    """
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")

    _init_soprano()
    model = _SOPRANO_STATE["model"]

    try:
        audio = model.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        # detach() first: .numpy() raises a RuntimeError on tensors that
        # still require grad, and we never need gradients here.
        audio_np = audio.detach().cpu().numpy()
        return (SAMPLE_RATE, audio_np)
    except gr.Error:
        # Re-raise our own user-facing errors untouched.
        raise
    except Exception as e:
        # Chain the cause so the full traceback is preserved in server logs
        # while the UI shows only a truncated message.
        raise gr.Error(f"Error during speech generation: {str(e)[:200]}...") from e


# --- Gradio UI ---
# BUG FIX: the theme was previously passed to launch(), but gradio's
# Blocks.launch() has no `theme` parameter (it would raise a TypeError).
# The theme belongs on the gr.Blocks constructor.
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    gr.HTML("<h1 style='text-align: center;'>Soprano-TTS</h1><p style='text-align: center;'>Powered by Soprano-80M | 32kHz High-Fidelity Audio</p>")

    # Sampling controls, shown side by side.
    with gr.Row(variant="panel"):
        temperature = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.3,
            step=0.05,
            label="Temperature",
            info="Controls randomness. Lower = more deterministic.",
        )
        top_p = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.95,
            step=0.01,
            label="Top-P",
            info="Nucleus sampling threshold.",
        )
        repetition_penalty = gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.2,
            step=0.05,
            label="Repetition Penalty",
            info="Penalizes repeated tokens.",
        )

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
        lines=5,
    )

    generate_btn = gr.Button(
        "Generate Speech",
        variant="primary",
    )

    audio_output = gr.Audio(
        label="Generated Speech",
        autoplay=True,
    )

    # Same handler is wired to both the button and Enter-in-textbox,
    # exposed under distinct API names.
    generate_inputs = [text_input, temperature, top_p, repetition_penalty]

    generate_btn.click(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech",
    )

    text_input.submit(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter",
    )

if __name__ == "__main__":
    # queue() enables request queuing so concurrent users don't contend for
    # the single GPU-backed model.
    demo.queue().launch(debug=True)