File size: 3,354 Bytes
7e66c78
 
 
 
 
 
 
6137274
7e66c78
bbea18f
7e66c78
 
 
 
 
 
 
 
 
 
 
 
 
 
6137274
7e66c78
 
 
 
 
96b0eae
7e66c78
 
 
 
 
 
96b0eae
 
7e66c78
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bbea18f
7e66c78
 
 
 
 
 
 
 
 
bbea18f
7e66c78
 
 
 
 
 
 
 
 
 
 
 
 
 
96b0eae
7e66c78
bbea18f
 
7e66c78
7a56e38
f0ac9ac
 
 
7e66c78
 
f0ac9ac
7e66c78
 
 
 
 
 
bbea18f
 
 
 
 
7e66c78
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
import gradio as gr
import torch
import numpy as np
from soprano import SopranoTTS
from scipy.io.wavfile import write as wav_write
import tempfile
import os
import spaces

# Fail fast if no GPU is present. A plain `assert` is stripped under
# `python -O`, which would silently let the demo fall back to CPU — raise
# an explicit error instead so the requirement always holds.
if not torch.cuda.is_available():
    raise RuntimeError("Demo requires a GPU.")
DEVICE = "cuda"  # GPU availability was verified above
print(DEVICE)

# Load model once at import time so every request reuses the same instance.
model = SopranoTTS(
    backend="auto",
    device=DEVICE,
    cache_size_mb=100,
    decoder_batch_size=1,
)

# Output sample rate of the Soprano decoder, in Hz.
SAMPLE_RATE = 32000


@spaces.GPU
def tts_stream(text, temperature, top_p, repetition_penalty, state):
    """Synthesize `text` and yield ((sample_rate, audio), audio) for Gradio.

    Yields `(None, state)` unchanged when the input is blank. The second
    yielded value is the raw numpy waveform, stored in `gr.State` so the
    (currently disabled) download handler can re-save it later.
    """
    # Guard clause: nothing to synthesize for whitespace-only input.
    if not text.strip():
        yield None, state
        return

    waveform = model.infer(
        text,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )

    # Move off the GPU before handing the samples to Gradio.
    samples = waveform.cpu().numpy()
    yield (SAMPLE_RATE, samples), samples


def save_audio(state):
    """Persist the buffered waveform to a temporary .wav file.

    Returns the file path, or None when no audio has been generated yet.
    The file is deliberately not deleted here — Gradio serves it to the
    user for download.
    """
    if state is None or len(state) == 0:
        return None

    # mkstemp hands back an open OS-level descriptor; close it immediately
    # since scipy writes via the path, not the descriptor.
    handle, wav_path = tempfile.mkstemp(suffix=".wav")
    os.close(handle)

    wav_write(wav_path, SAMPLE_RATE, state)
    return wav_path


# UI layout: two columns — inputs/controls on the left, audio output and
# usage tips on the right. Built once at import time.
with gr.Blocks() as demo:
    # Holds the last generated waveform (numpy array) between events so a
    # download handler could re-save it without regenerating.
    state_audio = gr.State(None)

    with gr.Row():
        with gr.Column():
            gr.Markdown("# Soprano Demo\n\nSoprano is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed. Soprano can achieve **<15 ms streaming latency** and up to **2000x real-time generation**, all while being easy to deploy at **<1 GB VRAM usage**.")

            text_in = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to synthesize...",
                lines=4,
            )

            # Sampling controls; defaults mirror typical TTS decoding settings.
            with gr.Accordion("Advanced options", open=False):
                temperature = gr.Slider(
                    0.0, 1.5, value=0.8, step=0.05, label="Temperature"
                )
                top_p = gr.Slider(
                    0.0, 1.0, value=0.95, step=0.01, label="Top-p"
                )
                repetition_penalty = gr.Slider(
                    0.5, 2.0, value=1.2, step=0.05, label="Repetition penalty"
                )

            gen_btn = gr.Button("Generate")

        with gr.Column():
            # streaming=False: the handler yields one complete clip rather
            # than incremental chunks.
            audio_out = gr.Audio(
                label="Output Audio",
                autoplay=True,
                streaming=False,
            )
            #download_btn = gr.Button("Download")
            #file_out = gr.File(label="Download file")
            gr.Markdown(
                "Usage tips:\n\n"
                "- Soprano works best when each sentence is between 2 and 15 seconds long.\n"
                "- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. Best results can be achieved by converting these into their phonetic form. (1+1 -> one plus one, etc)\n"
                "- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. You may also change the sampling settings for more varied results."
            )


    # Wire the Generate button: tts_stream yields (audio, state) pairs.
    gen_btn.click(
        fn=tts_stream,
        inputs=[text_in, temperature, top_p, repetition_penalty, state_audio],
        outputs=[audio_out, state_audio],
    )

    # Download wiring kept for reference; re-enable together with the
    # commented-out button/file components above.
    #download_btn.click(
    #    fn=save_audio,
    #    inputs=[state_audio],
    #    outputs=[file_out],
    #)

# Enable request queuing (required for generator handlers), then serve.
demo.queue()
demo.launch()