# Soprano-80M / app.py
# Nymbo's picture
# Update app.py
# 2164beb verified
import gradio as gr
import io
import tempfile
import numpy as np

# Optional imports for Soprano TTS (lazy load).
# Each heavy/optional dependency is imported defensively: if it is missing,
# the corresponding name is set to None and _init_soprano() reports a
# friendly gr.Error instead of the app crashing at import time.
try:
    import torch  # type: ignore
except Exception:  # pragma: no cover
    torch = None  # type: ignore
try:
    from soprano import SopranoTTS  # type: ignore
except Exception:  # pragma: no cover
    SopranoTTS = None  # type: ignore
try:
    from scipy.io.wavfile import write as wav_write  # type: ignore
except Exception:  # pragma: no cover
    wav_write = None  # type: ignore

# Module-level singleton tracking the lazily-initialized Soprano model:
#   initialized -- True once _init_soprano() has succeeded
#   device      -- device the model was loaded on ("cpu" until init)
#   model       -- the SopranoTTS instance (None until init)
_SOPRANO_STATE = {"initialized": False, "device": "cpu", "model": None}
# Output sample rate in Hz; matches the "32kHz" claim in the UI header below.
# NOTE(review): confirm against the Soprano model's documentation.
SAMPLE_RATE = 32000
def _init_soprano() -> None:
    """Lazily construct the Soprano TTS model; no-op after first success.

    On success the model is cached in the module-level ``_SOPRANO_STATE``
    dict, so every later call returns immediately.

    Raises:
        gr.Error: If the ``soprano`` package is not installed, or no
            CUDA-capable GPU is available.
    """
    # Fast path: already initialized on a previous call.
    if _SOPRANO_STATE["initialized"]:
        return

    # Guard: the soprano package is an optional dependency (see imports).
    if SopranoTTS is None:
        raise gr.Error("Soprano is not installed. Please run: pip install soprano-tts --no-deps && pip install transformers unidecode")

    # Guard: Soprano only runs on a CUDA device.
    if not torch or not torch.cuda.is_available():
        raise gr.Error(
            "Currently running on CPU. Soprano requires a GPU."
        )

    device = "cuda"
    print(f"Using device: {device}")
    # 'auto' backend prefers lmdeploy when installed (faster) and falls
    # back to the transformers backend otherwise.
    tts_model = SopranoTTS(backend="auto", device=device)
    _SOPRANO_STATE.update({"initialized": True, "device": device, "model": tts_model})
def soprano_tts(
    text: str,
    temperature: float,
    top_p: float,
    repetition_penalty: float,
) -> tuple[int, np.ndarray] | None:
    """Synthesize speech for *text* with the Soprano model.

    Args:
        text: Text to convert to speech; must be non-empty.
        temperature: Sampling temperature (lower = more deterministic).
        top_p: Nucleus-sampling threshold.
        repetition_penalty: Penalty applied to repeated tokens.

    Returns:
        A ``(sample_rate, waveform)`` tuple suitable for ``gr.Audio``.

    Raises:
        gr.Error: On empty input, failed model init, or any inference error
            (wrapped with a truncated message).
    """
    # Reject empty / whitespace-only input up front.
    if not text or not text.strip():
        raise gr.Error("Please enter text to synthesize.")

    _init_soprano()
    model = _SOPRANO_STATE["model"]

    try:
        waveform = model.infer(
            text,
            temperature=temperature,
            top_p=top_p,
            repetition_penalty=repetition_penalty,
        )
        # infer() returns a torch tensor; gr.Audio expects a numpy array.
        # Conversion stays inside the try so its failures are wrapped too.
        return (SAMPLE_RATE, waveform.cpu().numpy())
    except gr.Error:
        # Our own user-facing errors pass through untouched.
        raise
    except Exception as e:
        raise gr.Error(f"Error during speech generation: {str(e)[:200]}...")
# --- Gradio UI ---
# BUG FIX: `theme` is a gr.Blocks constructor argument, not a launch()
# argument. Passing it to launch() either silently does nothing or raises
# a TypeError on recent Gradio versions, so the custom theme was never
# applied. It now goes to gr.Blocks(theme=...) below.
with gr.Blocks(theme="Nymbo/Nymbo_Theme") as demo:
    gr.HTML("<h1 style='text-align: center;'>Soprano-TTS</h1><p style='text-align: center;'>Powered by Soprano-80M | 32kHz High-Fidelity Audio</p>")

    # Sampling controls, laid out side by side.
    with gr.Row(variant="panel"):
        temperature = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.3,
            step=0.05,
            label="Temperature",
            info="Controls randomness. Lower = more deterministic.",
        )
        top_p = gr.Slider(
            minimum=0.0,
            maximum=1.0,
            value=0.95,
            step=0.01,
            label="Top-P",
            info="Nucleus sampling threshold.",
        )
        repetition_penalty = gr.Slider(
            minimum=1.0,
            maximum=2.0,
            value=1.2,
            step=0.05,
            label="Repetition Penalty",
            info="Penalizes repeated tokens.",
        )

    text_input = gr.Textbox(
        label="Input Text",
        placeholder="Enter the text you want to convert to speech here...",
        value="Soprano is an extremely lightweight text to speech model designed to produce highly realistic speech at unprecedented speed.",
        lines=5,
    )
    generate_btn = gr.Button(
        "Generate Speech",
        variant="primary",
    )
    audio_output = gr.Audio(
        label="Generated Speech",
        autoplay=True,
    )

    # Both the button click and pressing Enter in the textbox trigger the
    # same synthesis function; distinct api_name values expose two API
    # endpoints.
    generate_inputs = [text_input, temperature, top_p, repetition_penalty]
    generate_btn.click(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech",
    )
    text_input.submit(
        fn=soprano_tts,
        inputs=generate_inputs,
        outputs=audio_output,
        api_name="generate_speech_enter",
    )

if __name__ == "__main__":
    # queue() enables request queuing (required for ZeroGPU-style Spaces).
    demo.queue().launch(debug=True)