Spaces:

arjunbroepic
/

sopro-test

Running

App Files Files Community

sopro-test / app.py

arjunbroepic

Update app.py

1870bb0 verified about 1 month ago

raw

history blame contribute delete

3.55 kB

	import gradio as gr
	import tempfile
	import torch
	from sopro import SoproTTS

	# Load the model once at import time so every request reuses the same
	# instance instead of reloading the weights per call.
	if torch.cuda.is_available():
	    device = "cuda"
	else:
	    device = "cpu"
	print(f"Loading Sopro TTS on device: {device}")
	tts = SoproTTS.from_pretrained(
	    "samuel-vitorino/sopro",
	    device=device,
	)

	def generate_speech(text, ref_audio_path, temperature, top_p, style_strength):
	    """Synthesize ``text`` using the reference clip and return a WAV file path.

	    Args:
	        text: Text to synthesize. Empty or whitespace-only input is rejected.
	        ref_audio_path: Filesystem path of the reference audio clip.
	        temperature: Forwarded unchanged to ``tts.synthesize``.
	        top_p: Forwarded unchanged to ``tts.synthesize``.
	        style_strength: Forwarded unchanged to ``tts.synthesize``.

	    Returns:
	        Path of a temporary ``.wav`` file containing the generated audio.

	    Raises:
	        gr.Error: If the text or reference audio is missing, or if synthesis
	            or saving the result fails (the original exception is chained).
	    """
	    if not text or not text.strip():
	        raise gr.Error("Please enter some text to synthesize.")
	    if not ref_audio_path:
	        raise gr.Error("Please upload or record a reference audio file.")

	    try:
	        # Generate the audio wave
	        wav = tts.synthesize(
	            text=text,
	            ref_audio_path=ref_audio_path,
	            temperature=temperature,
	            top_p=top_p,
	            style_strength=style_strength,
	        )

	        # Reserve a unique output path, then close the handle right away:
	        # leaving it open leaks a file descriptor on every request and can
	        # prevent the file from being re-opened for writing on Windows.
	        # delete=False keeps the file on disk so Gradio can serve it.
	        with tempfile.NamedTemporaryFile(delete=False, suffix=".wav") as temp_out:
	            out_path = temp_out.name
	        tts.save_wav(out_path, wav)

	        return out_path

	    except Exception as e:
	        # Surface the failure in the UI while keeping the original traceback.
	        raise gr.Error(f"Error during synthesis: {str(e)}") from e

	# Build the Gradio UI: inputs in the left column, generated audio on the right.
	with gr.Blocks(title="Sopro TTS - Voice Cloning") as demo:
	    gr.Markdown("# 🌬️ Sopro TTS - Zero-Shot Voice Cloning")
	    gr.Markdown(
	        "A lightweight (135M parameter) text-to-speech model with zero-shot voice cloning by [Samuel Vitorino](https://github.com/samuel-vitorino/sopro). "
	        "Upload a 3-12 second audio clip to clone a voice!"
	    )

	    with gr.Row():
	        # Left column: text, reference clip, sampling controls, trigger button.
	        with gr.Column():
	            text_input = gr.Textbox(
	                label="Text to Synthesize",
	                placeholder="Enter text here... (Prefer words over abbreviations/symbols, e.g., '1 plus 2' instead of '1 + 2')",
	                lines=4,
	            )

	            # type="filepath" hands the callback the path of the uploaded or
	            # recorded clip.
	            ref_audio_input = gr.Audio(
	                label="Reference Audio (3 to 12 seconds recommended)",
	                sources=["upload", "microphone"],
	                type="filepath",
	            )

	            with gr.Accordion("Advanced Parameters", open=False):
	                temp_slider = gr.Slider(
	                    label="Temperature",
	                    minimum=0.1,
	                    maximum=2.0,
	                    step=0.1,
	                    value=0.8,
	                )
	                top_p_slider = gr.Slider(
	                    label="Top P",
	                    minimum=0.1,
	                    maximum=1.0,
	                    step=0.05,
	                    value=0.95,
	                )
	                style_slider = gr.Slider(
	                    label="Style Strength",
	                    info="Controls the FiLM strength; increasing it can improve or reduce voice similarity.",
	                    minimum=0.0,
	                    maximum=3.0,
	                    step=0.1,
	                    value=1.2,
	                )

	            generate_btn = gr.Button("Generate Speech", variant="primary")

	        # Right column: playback of the synthesized result.
	        with gr.Column():
	            audio_output = gr.Audio(label="Generated Audio", autoplay=False)

	    # Wire the button to the synthesis callback; the input order matches the
	    # parameter order of generate_speech.
	    generate_btn.click(
	        fn=generate_speech,
	        inputs=[
	            text_input,
	            ref_audio_input,
	            temp_slider,
	            top_p_slider,
	            style_slider,
	        ],
	        outputs=[audio_output],
	    )

	    gr.Markdown(
	        "\n".join(
	            (
	                "### ⚠️ Disclaimers",
	                "- Sopro can be inconsistent. If the output sounds glitchy, try tweaking the Temperature and Style Strength.",
	                "- Voice cloning quality is highly dependent on the microphone quality and ambient noise of the reference audio.",
	                "- Generation length is currently capped at ~32 seconds to prevent hallucinations.",
	            )
	        )
	    )

	# Launch the app only when this file is executed directly, not when imported.
	if __name__ == "__main__":
	    demo.launch()