Spaces:

WJBSCUT
/

VoiceDemo

Runtime error

VoiceDemo / app.py

jerrybwang

ba70a88 24 days ago

4.12 kB

	import gradio as gr
	import torch
	from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
	import numpy as np

	# Load Microsoft SpeechT5 model
	def load_model():
	    """Fetch the pretrained SpeechT5 components used for synthesis.

	    Returns:
	        A ``(processor, model, vocoder)`` tuple: the text processor and the
	        text-to-speech model from ``microsoft/speecht5_tts``, and the HiFi-GAN
	        vocoder from ``microsoft/speecht5_hifigan``.
	    """
	    tts_checkpoint = "microsoft/speecht5_tts"
	    vocoder_checkpoint = "microsoft/speecht5_hifigan"

	    # Tuple elements are evaluated left to right, so the components are
	    # still loaded in processor -> model -> vocoder order.
	    return (
	        SpeechT5Processor.from_pretrained(tts_checkpoint),
	        SpeechT5ForTextToSpeech.from_pretrained(tts_checkpoint),
	        SpeechT5HifiGan.from_pretrained(vocoder_checkpoint),
	    )

	# Text-to-speech function
	def text_to_speech(text, processor, model, vocoder):
	    """Convert text to speech using the SpeechT5 model.

	    Args:
	        text: The text to synthesise.
	        processor: Callable invoked as ``processor(text=..., return_tensors="pt")``
	            whose result exposes ``"input_ids"``.
	        model: Object providing ``generate_speech(input_ids,
	            speaker_embeddings=..., vocoder=...)`` and returning a tensor.
	        vocoder: Vocoder passed straight through to ``model.generate_speech``.

	    Returns:
	        ``(speech, 16000)`` where ``speech`` is a squeezed NumPy array whose
	        peak absolute amplitude has been rescaled to 0.8 (left untouched when
	        the waveform is entirely zero), and 16000 is the sample rate in Hz.

	    Raises:
	        gr.Error: If any step of the synthesis fails; the original exception
	            is attached as the cause.
	    """
	    try:
	        # Process the input text
	        inputs = processor(text=text, return_tensors="pt")

	        # Create a simple default speaker embedding (zeros vector).
	        # This is a fallback when specific speaker embeddings are not available.
	        speaker_embeddings = torch.zeros((1, 512))  # Standard speaker embedding size

	        # Inference only: no gradients are needed.
	        with torch.no_grad():
	            # Passing the vocoder makes generate_speech return the waveform directly.
	            speech = model.generate_speech(
	                inputs["input_ids"],
	                speaker_embeddings=speaker_embeddings,
	                vocoder=vocoder,
	            )

	        # Convert to a NumPy array and normalise to prevent clipping.
	        speech = speech.cpu().numpy().squeeze()
	        peak = np.max(np.abs(speech))
	        # A silent (all-zero) waveform has peak 0; dividing by it would turn
	        # every sample into NaN, so leave such output as-is.
	        if peak > 0:
	            speech = speech / peak * 0.8

	        return speech, 16000  # Return audio data and sample rate
	    except Exception as e:
	        raise gr.Error(f"Error generating speech: {str(e)}") from e

	# Main function
	def main():
	    """Load the SpeechT5 components and build the Gradio demo.

	    Returns:
	        The constructed ``gr.Blocks`` app. It is not launched here; the
	        caller is expected to call ``launch()`` on it.
	    """
	    # Load model once at startup so every request reuses the same objects.
	    print("Loading Microsoft SpeechT5 model...")
	    processor, model, vocoder = load_model()
	    print("Model loaded successfully!")

	    def generate_speech(text):
	        """Gradio callback returning ``(audio, status)`` for ``text``.

	        ``audio`` is ``(sample_rate, samples)`` on success, or ``None`` when
	        the input is blank or synthesis fails. ``status`` is the message
	        shown in the status textbox.
	        """
	        # Guard against a missing (None) value as well as blank text;
	        # calling .strip() on None would raise AttributeError.
	        if not text or not text.strip():
	            return None, "Please enter some text to convert to speech."

	        # Only the synthesis call can fail; report the failure in the status
	        # box instead of propagating it.
	        try:
	            audio_data, sample_rate = text_to_speech(text, processor, model, vocoder)
	        except Exception as e:
	            return None, f"Error: {str(e)}"

	        return (sample_rate, audio_data), f"Successfully generated speech for: '{text}'"

	    # Create Gradio interface
	    with gr.Blocks(title="Microsoft SpeechT5 Text-to-Speech") as demo:
	        gr.Markdown("""
	        # 🎤 Microsoft SpeechT5 Text-to-Speech

	        Convert your text to natural-sounding speech using the Microsoft SpeechT5 model.
	        """)

	        with gr.Row():
	            with gr.Column():
	                text_input = gr.Textbox(
	                    label="Input Text",
	                    placeholder="Enter text you want to convert to speech...",
	                    lines=3,
	                    max_lines=10,
	                )
	                generate_btn = gr.Button("Generate Speech", variant="primary")

	            with gr.Column():
	                audio_output = gr.Audio(label="Generated Speech", type="numpy")
	                status_output = gr.Textbox(label="Status", interactive=False)

	        # Examples
	        # NOTE: the microsoft/speecht5_tts checkpoint is an English model, so
	        # all examples are English; non-English text is not synthesised
	        # meaningfully.
	        gr.Examples(
	            examples=[
	                "Hello, welcome to the Microsoft SpeechT5 text-to-speech demo!",
	                "The quick brown fox jumps over the lazy dog.",
	                "Artificial intelligence is transforming the way we interact with technology.",
	                "The weather is really nice today, perfect for going out for a walk.",
	            ],
	            inputs=text_input,
	        )

	        # Event handling: clicking the button and pressing Enter in the
	        # textbox run the same callback with the same inputs and outputs.
	        for register in (generate_btn.click, text_input.submit):
	            register(
	                fn=generate_speech,
	                inputs=text_input,
	                outputs=[audio_output, status_output],
	            )

	    return demo

	if __name__ == "__main__":
	    # Script entry point: build the app, then serve it without requesting
	    # a public share link.
	    demo = main()
	    demo.launch(share=False)