"""Gradio demo for Soprano TTS: streams synthesized speech and offers a WAV download."""

import os
import tempfile

import gradio as gr
import numpy as np
import spaces
import torch
from scipy.io.wavfile import write as wav_write
from soprano import SopranoTTS

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(DEVICE)

# Load the model once at import time so every request reuses the same instance.
model = SopranoTTS(
    backend="auto",
    device=DEVICE,
    cache_size_mb=100,
    decoder_batch_size=1,
)

# Sample rate passed to both the streaming audio output and the WAV writer.
SAMPLE_RATE = 32000


@spaces.GPU
def tts_stream(text, temperature, top_p, repetition_penalty, state):
    """Synthesize *text* and stream the audio chunk by chunk.

    This is a generator used as a Gradio event handler with two outputs
    (the audio player and the session state).

    Args:
        text: Text to synthesize.
        temperature: Sampling temperature forwarded to ``model.infer_stream``.
        top_p: Nucleus-sampling threshold forwarded to ``model.infer_stream``.
        repetition_penalty: Repetition penalty forwarded to
            ``model.infer_stream``.
        state: Audio accumulated by a previous run (or ``None``); handed back
            untouched when there is nothing to synthesize.

    Yields:
        ``(audio, full_audio)`` pairs. ``audio`` is ``(SAMPLE_RATE, chunk)``
        for the newest float32 chunk, and ``full_audio`` is the concatenation
        of every chunk produced so far. For empty input a single
        ``(None, state)`` pair is yielded.
    """
    # Treat None the same as empty/whitespace-only text; calling .strip() on
    # None would raise AttributeError instead of quietly doing nothing.
    if not text or not text.strip():
        yield None, state
        return

    chunks = []
    stream = model.infer_stream(
        text,
        chunk_size=20,
        temperature=temperature,
        top_p=top_p,
        repetition_penalty=repetition_penalty,
    )

    for chunk in stream:
        # Only tensor chunks are converted; anything else is skipped.
        if not isinstance(chunk, torch.Tensor):
            continue

        audio_np = chunk.detach().cpu().numpy().astype(np.float32)
        chunks.append(audio_np)

        # Stream the new chunk to the player and refresh the state with the
        # audio so far, so "Download" works even before generation finishes.
        yield (SAMPLE_RATE, audio_np), np.concatenate(chunks)


def save_audio(state):
    """Write the accumulated audio to a temporary WAV file.

    Args:
        state: Array of audio samples stored in the session state, or
            ``None`` if nothing has been generated yet.

    Returns:
        Path of the newly created ``.wav`` file, or ``None`` when there is no
        audio to save.
    """
    # len() rather than truthiness: the truth value of a multi-element numpy
    # array is ambiguous and raises ValueError.
    if state is None or len(state) == 0:
        return None

    fd, path = tempfile.mkstemp(suffix=".wav")
    # mkstemp returns an open descriptor; close it because wav_write opens
    # the file again by path.
    os.close(fd)
    wav_write(path, SAMPLE_RATE, state)
    return path


# NOTE(review): the component nesting below was reconstructed from a source
# whose indentation was lost — confirm the layout matches the intended UI.
with gr.Blocks() as demo:
    # Per-session holder for the audio generated so far (read by "Download").
    state_audio = gr.State(None)

    with gr.Row():
        with gr.Column():
            gr.Markdown("# Soprano Demo")

            text_in = gr.Textbox(
                label="Input Text",
                placeholder="Enter text to synthesize...",
                lines=4,
            )

            with gr.Accordion("Advanced options", open=False):
                temperature = gr.Slider(
                    0.0, 1.0, value=0.3, step=0.05, label="Temperature"
                )
                top_p = gr.Slider(
                    0.0, 1.0, value=0.95, step=0.01, label="Top-p"
                )
                repetition_penalty = gr.Slider(
                    0.5, 2.0, value=1.2, step=0.05, label="Repetition penalty"
                )

            gen_btn = gr.Button("Generate")

        with gr.Column():
            audio_out = gr.Audio(
                label="Output Audio",
                autoplay=True,
                streaming=True,
            )

            download_btn = gr.Button("Download")
            file_out = gr.File(label="Download file")

            gr.Markdown(
                "Usage tips:\n\n"
                "- Soprano works best when each sentence is between 2 and 15 seconds long.\n"
                "- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. Best results can be achieved by converting these into their phonetic form. (1+1 -> one plus one, etc)\n"
                "- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. You may also change the sampling settings for more varied results."
            )

    gen_btn.click(
        fn=tts_stream,
        inputs=[text_in, temperature, top_p, repetition_penalty, state_audio],
        outputs=[audio_out, state_audio],
    )

    download_btn.click(
        fn=save_audio,
        inputs=[state_audio],
        outputs=[file_out],
    )

demo.queue()
demo.launch()