SevenLabs

Running

File size: 2,901 Bytes

7e66c78
 
 
 
 
 
 
 
4cba748
7e66c78
4cba748
7e66c78
4cba748
7e66c78
4cba748
7e66c78
4cba748
7e66c78
 
 
 
 
4cba748
7e66c78
 
 
 
4cba748
96b0eae
7e66c78
 
 
 
 
4cba748
96b0eae
 
7e66c78
 
 
 
4cba748
7e66c78
 
 
 
 
 
 
4cba748
7e66c78
 
4cba748
8117494
ec73fc9
4cba748
 
7e66c78
 
 
15fba4b
7e66c78
 
4cba748
7e66c78
 
504450b
7e66c78
 
 
 
 
bf7fc52
7e66c78
4cba748
7e66c78
4cba748
7e66c78
 
 
 
96b0eae
7e66c78
4cba748
 
 
 
7e66c78
15fba4b
7e66c78
4cba748
7e66c78
 
 
 
 
4cba748
 
 
 
 
 
7e66c78

import gradio as gr
import torch
import numpy as np
from soprano import SopranoTTS
from scipy.io.wavfile import write as wav_write
import tempfile
import os

# Choose the compute device: CUDA when torch reports it as available,
# otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

# Instantiate the model a single time at import; the request handlers
# below all share this instance.
model = SopranoTTS(
    device=DEVICE,
    backend="auto",  # author's note: backend is picked automatically for the device
    decoder_batch_size=1,
    cache_size_mb=100,  # author's note: only relevant for CUDA
)

# Sample rate (Hz) reported to the audio player and written into WAV files.
SAMPLE_RATE = 32_000

# Intentionally no @spaces.GPU decorator - it is not needed for CPU support
def tts_stream(text, temperature, top_p, repetition_penalty, state):
    """Synthesize speech for ``text`` and yield it for playback.

    This is a generator that yields exactly one ``(audio, state)`` pair.

    Args:
        text: Text to synthesize. ``None``, an empty string, or a
            whitespace-only string means there is nothing to synthesize.
        temperature: Forwarded unchanged to ``model.infer``.
        top_p: Forwarded unchanged to ``model.infer``.
        repetition_penalty: Forwarded unchanged to ``model.infer``.
        state: Previously stored value; passed back untouched when there is
            nothing to synthesize, otherwise replaced by the new waveform.

    Yields:
        ``(None, state)`` when there is no usable text; otherwise
        ``((SAMPLE_RATE, audio), audio)`` where ``audio`` is the model
        output converted to a NumPy array.
    """
    # Guard against None as well as blank text: calling .strip() on None
    # would raise AttributeError instead of quietly producing no audio.
    if text is None or not text.strip():
        yield None, state
        return

    out = model.infer(
        text,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )

    # Presumably a torch tensor (TODO confirm against SopranoTTS.infer):
    # move it to host memory before converting to NumPy.
    audio_np = out.cpu().numpy()
    yield (SAMPLE_RATE, audio_np), audio_np

def save_audio(state):
    """Write the stored waveform to a temporary WAV file.

    Args:
        state: The waveform to save, or ``None`` / an empty sequence when
            nothing has been generated yet.

    Returns:
        Path of the newly written ``.wav`` file, or ``None`` when ``state``
        is ``None`` or has length zero.

    Raises:
        Exception: Whatever ``wav_write`` raises is propagated after the
            partially created temporary file has been removed.
    """
    # len() rather than truthiness: a multi-element NumPy array has no
    # unambiguous truth value.
    if state is None or len(state) == 0:
        return None

    # mkstemp returns an open descriptor; close it right away because the
    # writer opens the path itself.
    fd, path = tempfile.mkstemp(suffix=".wav")
    os.close(fd)

    try:
        wav_write(path, SAMPLE_RATE, state)
    except Exception:
        # Do not leave an empty or partial file behind when writing fails.
        try:
            os.remove(path)
        except OSError:
            # Best-effort cleanup; the original write error is what matters.
            pass
        raise
    return path

# Page layout: inputs and sampling controls in the left column, playback and
# download widgets in the right column.
with gr.Blocks() as demo:
    # Carries the most recent waveform from the generate handler to the
    # download handler.
    state_audio = gr.State(value=None)

    with gr.Row():
        # Left column: title, text input, sampling options, trigger button.
        with gr.Column():
            gr.Markdown("# SevenLabs")

            text_in = gr.Textbox(
                value="SevenLabs is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
                placeholder="Enter text to synthesize...",
                label="Input Text",
                lines=4,
            )

            with gr.Accordion("Advanced options", open=False):
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=1,
                    step=0.05,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.95,
                    step=0.01,
                    label="Top-p",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.2,
                    step=0.05,
                    label="Repetition penalty",
                )

            gen_btn = gr.Button("Generate")

        # Right column: audio player plus the download controls.
        with gr.Column():
            audio_out = gr.Audio(
                streaming=False,
                autoplay=True,
                label="Output Audio",
            )

            download_btn = gr.Button("Download")
            file_out = gr.File(label="Download file")

            # Empty Markdown component kept from the original layout.
            gr.Markdown()

    # "Generate" runs synthesis and updates both the player and the state.
    gen_btn.click(
        fn=tts_stream,
        inputs=[text_in, temperature, top_p, repetition_penalty, state_audio],
        outputs=[audio_out, state_audio],
    )

    # "Download" writes the stored waveform to a file offered by file_out.
    download_btn.click(
        fn=save_audio,
        inputs=[state_audio],
        outputs=[file_out],
    )

# Enable the request queue, then start the server.
demo.queue()
demo.launch()