Spaces:

ekwek
/

Soprano-TTS

Running on Zero

App Files Files Community

Soprano-TTS / app.py

ekwek

Update app.py

278ac29 verified 11 days ago

raw

history blame contribute delete

3.88 kB

	import gradio as gr
	import torch
	import numpy as np
	from soprano import SopranoTTS
	from scipy.io.wavfile import write as wav_write
	import tempfile
	import os
	import spaces

	# Fail fast when no CUDA device is visible. An explicit raise is used rather
	# than `assert` because assertions are stripped when Python runs with -O,
	# which would let the demo start without the GPU it requires.
	if not torch.cuda.is_available():
	    raise RuntimeError("Demo requires a GPU.")

	# CUDA availability was verified just above, so the device is always "cuda".
	DEVICE = "cuda"
	print(DEVICE)

	# Module-level cache for the synthesizer; stays None until load_model() runs.
	model = None


	def load_model():
	    """Return the shared SopranoTTS instance, constructing it on first use.

	    The instance is stored in the module-level ``model`` variable, so later
	    calls return the same object without constructing it again.
	    """
	    global model

	    # Already built by an earlier call: hand back the cached instance.
	    if model is not None:
	        return model

	    settings = {
	        "backend": "auto",
	        "device": DEVICE,
	        "cache_size_mb": 100,
	        "decoder_batch_size": 1,
	    }
	    model = SopranoTTS(**settings)
	    return model


	# Sample rate (Hz) passed to the audio output component and the WAV writer.
	# NOTE(review): presumably the rate of the model's output audio -- confirm.
	SAMPLE_RATE = 32000


	@spaces.GPU
	def tts_stream(text, temperature, top_p, repetition_penalty, state):
	    """Synthesize speech for ``text`` and yield it once for the UI.

	    This is a generator that yields exactly one ``(audio, state)`` pair.

	    Args:
	        text: Text to synthesize. ``None``, empty, or whitespace-only input
	            produces no audio.
	        temperature: Sampling temperature forwarded to ``model.infer``.
	        top_p: Top-p value forwarded to ``model.infer``.
	        repetition_penalty: Repetition penalty forwarded to ``model.infer``.
	        state: Previously stored audio; passed through unchanged when there
	            is nothing to synthesize.

	    Yields:
	        ``(None, state)`` for blank input, otherwise
	        ``((SAMPLE_RATE, audio), audio)`` where ``audio`` is the model output
	        moved to the CPU and converted to a NumPy array.
	    """
	    # Check the input before touching the model so a blank request neither
	    # triggers model construction nor fails on a None value.
	    if not text or not text.strip():
	        yield None, state
	        return

	    tts = load_model()
	    out = tts.infer(
	        text,
	        temperature=temperature,
	        top_p=top_p,
	        repetition_penalty=repetition_penalty,
	    )

	    audio_np = out.cpu().numpy()
	    # The same array feeds the audio player and is kept as the new state.
	    yield (SAMPLE_RATE, audio_np), audio_np


	def save_audio(state):
	    """Write the stored audio to a temporary WAV file and return its path.

	    Args:
	        state: Audio samples to write, or ``None``.

	    Returns:
	        Path of the newly created ``.wav`` file, or ``None`` when ``state`` is
	        ``None`` or empty. The caller owns the returned file.

	    Raises:
	        Exception: Whatever ``wav_write`` raises; the temporary file is
	            removed before the error propagates.
	    """
	    # len() is used instead of truthiness because the truth value of a
	    # multi-element NumPy array is ambiguous.
	    if state is None or len(state) == 0:
	        return None

	    fd, path = tempfile.mkstemp(suffix=".wav")
	    # Only the path is needed; wav_write opens the file itself.
	    os.close(fd)
	    try:
	        wav_write(path, SAMPLE_RATE, state)
	    except Exception:
	        # Do not leave an orphaned temp file behind when writing fails.
	        try:
	            os.remove(path)
	        except OSError:
	            pass
	        raise
	    return path


	# Text shown at the top of the left column.
	_INTRO_MD = (
	    "# Soprano Demo\n\n"
	    "Soprano is an ultra‑lightweight, open‑source text‑to‑speech (TTS) model designed for real‑time, high‑fidelity speech synthesis at unprecedented speed. Soprano can achieve <15 ms streaming latency and up to 2000x real-time generation, all while being easy to deploy at <1 GB VRAM usage.\n\n"
	    "Github: https://github.com/ekwek1/soprano\n\n"
	    "Model Weights: https://huggingface.co/ekwek/Soprano-80M"
	)

	# Text shown under the audio player in the right column.
	_USAGE_TIPS_MD = (
	    "Usage tips:\n\n"
	    "- Soprano works best when each sentence is between 2 and 15 seconds long.\n"
	    "- Although Soprano recognizes numbers and some special characters, it occasionally mispronounces them. Best results can be achieved by converting these into their phonetic form. (1+1 -> one plus one, etc)\n"
	    "- If Soprano produces unsatisfactory results, you can easily regenerate it for a new, potentially better generation. You may also change the sampling settings for more varied results.\n"
	    "- Avoid improper grammar such as not using contractions, multiple spaces, etc."
	)

	# Default text pre-filled in the input box.
	_DEFAULT_TEXT = "Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed."

	with gr.Blocks() as demo:
	    # Holds the most recently generated audio array between events.
	    state_audio = gr.State(None)

	    with gr.Row():
	        # Left column: description, text input, sampling options, trigger.
	        with gr.Column():
	            gr.Markdown(_INTRO_MD)

	            text_in = gr.Textbox(
	                label="Input Text",
	                placeholder="Enter text to synthesize...",
	                value=_DEFAULT_TEXT,
	                lines=4,
	            )

	            with gr.Accordion("Advanced options", open=False):
	                temperature = gr.Slider(
	                    minimum=0.0, maximum=1.0, value=0.3, step=0.05,
	                    label="Temperature",
	                )
	                top_p = gr.Slider(
	                    minimum=0.0, maximum=1.0, value=0.95, step=0.01,
	                    label="Top-p",
	                )
	                repetition_penalty = gr.Slider(
	                    minimum=1.0, maximum=2.0, value=1.2, step=0.05,
	                    label="Repetition penalty",
	                )

	            gen_btn = gr.Button("Generate")

	        # Right column: audio player and usage notes.
	        with gr.Column():
	            audio_out = gr.Audio(
	                label="Output Audio",
	                autoplay=True,
	                streaming=False,
	            )
	            # Download controls are currently disabled:
	            #download_btn = gr.Button("Download")
	            #file_out = gr.File(label="Download file")
	            gr.Markdown(_USAGE_TIPS_MD)

	    # Generate: tts_stream receives the text, the three sampling values and
	    # the stored audio, and fills the player plus the stored audio.
	    generation_inputs = [text_in, temperature, top_p, repetition_penalty, state_audio]
	    generation_outputs = [audio_out, state_audio]
	    gen_btn.click(
	        fn=tts_stream,
	        inputs=generation_inputs,
	        outputs=generation_outputs,
	    )

	    # Disabled wiring for the download button (handler: save_audio):
	    #download_btn.click(
	    # fn=save_audio,
	    # inputs=[state_audio],
	    # outputs=[file_out],
	    #)

	# Enable the request queue, then start the app.
	demo.queue().launch()