File size: 2,914 Bytes
7e66c78
 
 
 
 
 
 
6137274
7e66c78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6137274
7e66c78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
import gradio as gr
import torch
import numpy as np
from soprano import SopranoTTS
from scipy.io.wavfile import write as wav_write
import tempfile
import os
import spaces

# Run on the GPU when torch reports CUDA as available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(DEVICE)

# Sample rate handed to the audio player and to the WAV writer.
SAMPLE_RATE = 32000

# Build the model a single time at import so every request reuses it.
model = SopranoTTS(
    backend="auto",
    device=DEVICE,
    cache_size_mb=100,
    decoder_batch_size=1,
)


@spaces.GPU
def tts_stream(text, temperature, top_p, repetition_penalty, state):
    """Synthesize ``text`` and stream the audio back chunk by chunk.

    Generator intended as a Gradio event handler. Every item it yields is a
    pair ``(audio, state)``.

    Args:
        text: Text to synthesize. ``None``, an empty string, or a
            whitespace-only string produces a single ``(None, state)`` item
            and no synthesis.
        temperature: Forwarded unchanged to ``model.infer_stream``.
        top_p: Forwarded unchanged to ``model.infer_stream``.
        repetition_penalty: Forwarded unchanged to ``model.infer_stream``.
        state: Audio accumulated by an earlier call; only passed back
            through untouched when there is nothing to synthesize.

    Yields:
        ``((SAMPLE_RATE, chunk), accumulated)`` for each tensor chunk the
        model produces, where ``chunk`` is that chunk as a float32 NumPy
        array and ``accumulated`` is the concatenation of all chunks so far.
    """
    # Guard against None as well as blank text so the handler never raises
    # AttributeError on a missing value.
    if not text or not text.strip():
        yield None, state
        return

    chunks = []
    stream = model.infer_stream(
        text,
        chunk_size=1,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )

    for chunk in stream:
        # Items that are not tensors are skipped.
        if not isinstance(chunk, torch.Tensor):
            continue
        audio_np = chunk.detach().cpu().numpy().astype(np.float32)
        chunks.append(audio_np)
        # Send only the new chunk to the player; the state gets everything
        # produced so far, so a download is possible at any point.
        yield (SAMPLE_RATE, audio_np), np.concatenate(chunks)

    # Deliberately no trailing yield of the full clip: the audio output is a
    # streaming component, so re-sending the concatenated audio would queue
    # the whole clip for playback a second time. The state already holds the
    # complete audio after the last chunk above.


def save_audio(state):
    """Write the accumulated audio to a temporary WAV file.

    Args:
        state: Accumulated audio samples, or ``None`` when nothing has been
            generated yet.

    Returns:
        The path of the newly created ``.wav`` file, or ``None`` when
        ``state`` is ``None`` or has length zero.
    """
    has_audio = state is not None and len(state) > 0
    if not has_audio:
        return None

    # mkstemp hands back an open descriptor; close it right away because the
    # WAV writer opens the file by path.
    handle, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(handle)

    wav_write(wav_path, SAMPLE_RATE, state)
    return wav_path


# Page layout: the first column holds the text input and generation controls,
# the second column holds playback and download widgets.
with gr.Blocks() as demo:
    # Carries the audio accumulated by the latest generation over to the
    # download handler.
    state_audio = gr.State(None)

    with gr.Row():
        with gr.Column():
            gr.Markdown("## Soprano Demo")

            text_in = gr.Textbox(
                lines=4,
                label="Input Text",
                placeholder="Enter text to synthesize...",
            )

            # Sampling controls, collapsed by default.
            with gr.Accordion("Advanced options", open=False):
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.3,
                    step=0.05,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.95,
                    step=0.01,
                    label="Top-p",
                )
                repetition_penalty = gr.Slider(
                    minimum=0.5,
                    maximum=2.0,
                    value=1.2,
                    step=0.05,
                    label="Repetition penalty",
                )

            gen_btn = gr.Button("Generate")

        with gr.Column():
            audio_out = gr.Audio(
                streaming=True,
                autoplay=True,
                label="Output Audio",
            )
            download_btn = gr.Button("Download")
            file_out = gr.File(label="Download file")
            gr.Markdown(
                "Usage tips: (placeholder)\n\n"
                "- Tip 1\n"
                "- Tip 2\n"
                "- Tip 3"
            )

    # "Generate" streams audio into the player and refreshes the stored clip.
    gen_btn.click(
        tts_stream,
        inputs=[text_in, temperature, top_p, repetition_penalty, state_audio],
        outputs=[audio_out, state_audio],
    )

    # "Download" writes the stored clip to a WAV file and offers it.
    download_btn.click(
        save_audio,
        inputs=[state_audio],
        outputs=[file_out],
    )

# Enable the request queue, then start the server.
demo.queue()
demo.launch()