File size: 3,354 Bytes
7e66c78
 
 
 
 
 
 
6137274
7e66c78
bbea18f
7e66c78
 
 
 
 
 
 
 
 
 
 
 
 
 
6137274
7e66c78
 
 
 
 
96b0eae
7e66c78
 
 
 
 
 
96b0eae
 
7e66c78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bbea18f
7e66c78
 
 
 
 
 
 
 
 
bbea18f
7e66c78
 
 
 
 
 
 
 
 
 
 
 
 
 
96b0eae
7e66c78
bbea18f
 
7e66c78
7a56e38
f0ac9ac
 
 
7e66c78
 
f0ac9ac
7e66c78
 
 
 
 
 
bbea18f
 
 
 
 
7e66c78
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import gradio as gr
import torch
import numpy as np
from soprano import SopranoTTS
from scipy.io.wavfile import write as wav_write
import tempfile
import os
import spaces

# Fail fast if no GPU is present. A plain `assert` is stripped under
# `python -O`, which would silently let the demo fall back to CPU — raise
# an explicit error instead so the requirement always holds.
if not torch.cuda.is_available():
    raise RuntimeError("Demo requires a GPU.")
DEVICE = "cuda"  # GPU availability was verified above
print(DEVICE)

# Load model once at import time so every request reuses the same instance.
model = SopranoTTS(
    backend="auto",
    device=DEVICE,
    cache_size_mb=100,
    decoder_batch_size=1,
)

# Output sample rate of the Soprano decoder, in Hz.
SAMPLE_RATE = 32000


@spaces.GPU
def tts_stream(text, temperature, top_p, repetition_penalty, state):
    """Synthesize `text` and yield ((sample_rate, audio), audio) for Gradio.

    Yields `(None, state)` unchanged when the input is blank. The second
    yielded value is the raw numpy waveform, stored in `gr.State` so the
    (currently disabled) download handler can re-save it later.
    """
    # Guard clause: nothing to synthesize for whitespace-only input.
    if not text.strip():
        yield None, state
        return

    waveform = model.infer(
        text,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )

    # Move off the GPU before handing the samples to Gradio.
    samples = waveform.cpu().numpy()
    yield (SAMPLE_RATE, samples), samples


def save_audio(state):
    """Persist the buffered waveform to a temporary .wav file.

    Returns the file path, or None when no audio has been generated yet.
    The file is deliberately not deleted here — Gradio serves it to the
    user for download.
    """
    if state is None or len(state) == 0:
        return None

    # mkstemp hands back an open OS-level descriptor; close it immediately
    # since scipy writes via the path, not the descriptor.
    handle, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(handle)

    wav_write(wav_path, SAMPLE_RATE, state)
    return wav_path


# UI layout: two columns — inputs/controls on the left, audio output and
# usage tips on the right. Built once at import time.
with gr.Blocks() as demo:
    # Holds the last generated waveform (numpy array) between events so a
    # download handler could re-save it without regenerating.
    state_audio = gr.State(None)

    with gr.Row():
        with gr.Column():
            gr.Markdown("# Soprano Demo\n\nSoprano is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.")

            text_in = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to synthesize...",
                lines=4,
            )

            # Sampling controls; defaults mirror typical TTS decoding settings.
            with gr.Accordion("Advanced options", open=False):
                temperature = gr.Slider(
                    0.0, 1.5, value=0.8, step=0.05, label="Temperature"
                )
                top_p = gr.Slider(
                    0.0, 1.0, value=0.95, step=0.01, label="Top-p"
                )
                repetition_penalty = gr.Slider(
                    0.5, 2.0, value=1.2, step=0.05, label="Repetition penalty"
                )

            gen_btn = gr.Button("Generate")

        with gr.Column():
            # streaming=False: the handler yields one complete clip rather
            # than incremental chunks.
            audio_out = gr.Audio(
                label="Output Audio",
                autoplay=True,
                streaming=False,
            )
            #download_btn = gr.Button("Download")
            #file_out = gr.File(label="Download file")
            gr.Markdown(
                "Usage tips:\n\n"
                "- Soprano works best when each sentence is between 2 and 15 seconds long.\n"
                "- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. Best results can be achieved by converting these into their phonetic form. (1+1 -> one plus one, etc)\n"
                "- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. You may also change the sampling settings for more varied results."
            )


    # Wire the Generate button: tts_stream yields (audio, state) pairs.
    gen_btn.click(
        fn=tts_stream,
        inputs=[text_in, temperature, top_p, repetition_penalty, state_audio],
        outputs=[audio_out, state_audio],
    )

    # Download wiring kept for reference; re-enable together with the
    # commented-out button/file components above.
    #download_btn.click(
    #    fn=save_audio,
    #    inputs=[state_audio],
    #    outputs=[file_out],
    #)

# Enable request queuing (required for generator handlers), then serve.
demo.queue()
demo.launch()