File size: 2,901 Bytes
7e66c78 4cba748 7e66c78 4cba748 7e66c78 4cba748 7e66c78 4cba748 7e66c78 4cba748 7e66c78 4cba748 7e66c78 4cba748 96b0eae 7e66c78 4cba748 96b0eae 7e66c78 4cba748 7e66c78 4cba748 7e66c78 4cba748 8117494 ec73fc9 4cba748 7e66c78 15fba4b 7e66c78 4cba748 7e66c78 504450b 7e66c78 bf7fc52 7e66c78 4cba748 7e66c78 4cba748 7e66c78 96b0eae 7e66c78 4cba748 7e66c78 15fba4b 7e66c78 4cba748 7e66c78 4cba748 7e66c78 | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 | import gradio as gr
import torch
import numpy as np
from soprano import SopranoTTS
from scipy.io.wavfile import write as wav_write
import tempfile
import os
# Prefer the GPU when PyTorch reports one; otherwise run on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
print(f"Using device: {DEVICE}")

# One shared model instance, built at import time and reused by every request.
model = SopranoTTS(
    device=DEVICE,
    backend="auto",  # Will automatically choose best backend for device
    decoder_batch_size=1,
    cache_size_mb=100,  # Only relevant for CUDA
)

# Sample rate handed to the Gradio audio player and to the WAV writer.
SAMPLE_RATE = 32000
# The @spaces.GPU decorator is intentionally omitted - not needed for CPU support
def tts_stream(text, temperature, top_p, repetition_penalty, state):
    """Synthesize speech for *text* and yield it for playback and for storage.

    Despite the name, this generator yields exactly once: either a "nothing
    to play" result for blank input, or the complete synthesized clip.

    Args:
        text: Text to synthesize. ``None`` and whitespace-only strings are
            treated as "no input".
        temperature: Forwarded unchanged to ``model.infer``.
        top_p: Forwarded unchanged to ``model.infer``.
        repetition_penalty: Forwarded unchanged to ``model.infer``.
        state: Previously stored audio; passed through untouched when there
            is no input so an earlier result stays available.

    Yields:
        A 2-tuple ``(audio, new_state)``. ``audio`` is ``None`` for blank
        input, otherwise ``(SAMPLE_RATE, samples)``; ``new_state`` is the
        incoming ``state`` for blank input, otherwise ``samples``.
    """
    # Guard against None as well as blank strings: calling .strip() on None
    # would raise AttributeError instead of producing an empty result.
    if text is None or not text.strip():
        yield None, state
        return

    out = model.infer(
        text,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )
    # Move the result to host memory before converting it to a NumPy array.
    # NOTE(review): presumably a torch tensor, given .cpu()/.numpy() - confirm.
    audio_np = out.cpu().numpy()
    yield (SAMPLE_RATE, audio_np), audio_np
def save_audio(state):
    """Write the stored audio to a temporary ``.wav`` file and return its path.

    Args:
        state: The stored audio samples, or ``None`` when nothing has been
            generated yet.

    Returns:
        Path of the newly created WAV file, or ``None`` when ``state`` is
        ``None`` or empty.

    Raises:
        Exception: Whatever the WAV writer raises is propagated after the
            temporary file has been removed.
    """
    # len() rather than truthiness: truth-testing a multi-element NumPy array
    # raises ValueError.
    if state is None or len(state) == 0:
        return None

    fd, path = tempfile.mkstemp(suffix=".wav")
    # mkstemp hands back an open descriptor; close it because the writer
    # opens the file again by path.
    os.close(fd)
    try:
        wav_write(path, SAMPLE_RATE, state)
    except Exception:
        # Do not leave an empty or partial temp file behind on failure.
        try:
            os.remove(path)
        except OSError:
            # Best-effort cleanup only; the original write error is what matters.
            pass
        raise
    return path
# Two-column UI: inputs and sampling controls on the left, playback and
# download on the right.
with gr.Blocks() as demo:
    # Holds the most recently generated samples so "Download" can reuse them.
    state_audio = gr.State(None)

    with gr.Row():
        with gr.Column():
            gr.Markdown("# SevenLabs")
            text_in = gr.Textbox(
                value="SevenLabs is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
                label="Input Text",
                placeholder="Enter text to synthesize...",
                lines=4,
            )
            # Sampling parameters, collapsed by default.
            with gr.Accordion("Advanced options", open=False):
                temperature = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=1,
                    step=0.05,
                    label="Temperature",
                )
                top_p = gr.Slider(
                    minimum=0.0,
                    maximum=1.0,
                    value=0.95,
                    step=0.01,
                    label="Top-p",
                )
                repetition_penalty = gr.Slider(
                    minimum=1.0,
                    maximum=2.0,
                    value=1.2,
                    step=0.05,
                    label="Repetition penalty",
                )
            gen_btn = gr.Button("Generate")

        with gr.Column():
            audio_out = gr.Audio(
                label="Output Audio",
                streaming=False,
                autoplay=True,
            )
            download_btn = gr.Button("Download")
            file_out = gr.File(label="Download file")
            # Markdown component created without any content.
            gr.Markdown()

    # "Generate" synthesizes audio, plays it, and stores the samples in state.
    gen_btn.click(
        fn=tts_stream,
        inputs=[text_in, temperature, top_p, repetition_penalty, state_audio],
        outputs=[audio_out, state_audio],
    )
    # "Download" writes the stored samples to a WAV file and offers it.
    download_btn.click(
        fn=save_audio,
        inputs=[state_audio],
        outputs=[file_out],
    )

demo.queue()
demo.launch()