# Mynameisju's picture
# Update app.py
# f72ab49 verified
import torch
# Fix the global torch RNG so artificial-voice sampling from seeds is reproducible.
torch.manual_seed(160923)
import gradio as gr
from huggingface_hub import hf_hub_download
from InferenceInterfaces.ControllableInterface import ControllableInterface
from Utility.utils import float2pcm, load_json_from_path
import matplotlib.pyplot as plt
import librosa
import librosa.display
import numpy as np
import io
from PIL import Image
import threading
def generate_spectrogram_image(wav, sr):
    """Render a log-frequency spectrogram of *wav* and return it as a PIL image.

    Args:
        wav: 1-D float waveform array.
        sr: sample rate of the waveform in Hz.

    Returns:
        A ``PIL.Image.Image`` containing the rendered spectrogram
        (with a dB colorbar), decoded from an in-memory PNG.
    """
    fig, ax = plt.subplots(figsize=(4, 1.5))
    # STFT magnitude converted to dB, referenced to the loudest bin.
    spec_db = librosa.amplitude_to_db(librosa.stft(wav, n_fft=512), ref=np.max)
    img = librosa.display.specshow(spec_db, sr=sr, x_axis='time', y_axis='log', ax=ax)
    fig.colorbar(img, ax=ax)
    buf = io.BytesIO()
    # Save *this* figure explicitly. The original used plt.savefig, which saves
    # pyplot's global "current" figure — fragile if any other code touches
    # pyplot state between subplots() and savefig().
    fig.savefig(buf, format='png', bbox_inches='tight', pad_inches=0.05)
    plt.close(fig)
    buf.seek(0)
    return Image.open(buf)
class TTSWebUI:
    """Gradio front-end around the ToucanTTS ``ControllableInterface``.

    Builds the language-selection list from the model repo, wires the UI
    controls to ``read()``, and renders the synthesized audio plus a
    spectrogram visualization.
    """

    def __init__(self,
                 gpu_id="cpu",
                 title="Controllable Text-to-Speech for over 7000 Languages",
                 article="Thank you Hugging Face 🤗 for the GPU!<br>More: https://github.com/DigitalPhonetics/IMS-Toucan",
                 tts_model_path=None,
                 vocoder_model_path=None,
                 embedding_gan_path=None,
                 available_artificial_voices=10):
        """Download the language list, build the TTS backend, store UI config.

        Args:
            gpu_id: device identifier passed to the backend ("cpu" or a GPU id).
            title: page title shown by the Gradio interface.
            article: HTML description shown under the interface.
            tts_model_path / vocoder_model_path / embedding_gan_path:
                optional checkpoint overrides forwarded to the backend.
            available_artificial_voices: upper bound of the voice-seed slider.
        """
        path_to_iso_list = hf_hub_download(repo_id="Flux9665/ToucanTTS", filename="iso_to_fullname.json")
        iso_to_name = load_json_from_path(path_to_iso_list)
        # Dropdown entries look like "English (eng)"; read() parses the iso
        # code back out of this exact format.
        self.text_selection = [f"{iso_to_name[iso]} ({iso})" for iso in iso_to_name]
        self.controllable_ui = ControllableInterface(
            gpu_id=gpu_id,
            available_artificial_voices=available_artificial_voices,
            tts_model_path=tts_model_path,
            vocoder_model_path=vocoder_model_path,
            embedding_gan_path=embedding_gan_path
        )
        self.title = title
        self.article = article
        self.available_artificial_voices = available_artificial_voices

    def read(self,
             prompt,
             language,
             prosody_creativity,
             duration_scaling_factor,
             voice_seed,
             emb1,
             reference_audio):
        """Synthesize *prompt* and return playable audio plus a spectrogram.

        Args:
            prompt: text to synthesize.
            language: dropdown value of the form "English (eng)".
            prosody_creativity: sampling temperature for prosody prediction.
            duration_scaling_factor: global speaking-rate multiplier.
            voice_seed: seed selecting an artificial voice.
            emb1: gender-control embedding scalar from the UI slider.
            reference_audio: optional path of an audio file to clone.

        Returns:
            ``((sr, pcm_wav), spectrogram_image)`` on success, or
            ``(None, placeholder_spectrogram)`` when synthesis yields nothing.

        Raises:
            Exception: whatever the backend raised during synthesis.
        """
        # Dropdown values end in "(iso)" — the three-letter code sits at [-4:-1].
        lang_code = language[-4:-1]
        print("[INFO] Running TTS with prompt:", prompt)
        # NOTE: the original ran this in a worker thread and immediately
        # join()ed it with no timeout — behaviorally identical to a direct
        # call, and the post-join is_alive() timeout branch was unreachable.
        # The thread indirection is therefore removed.
        try:
            result = self.controllable_ui.read(
                prompt,
                reference_audio,
                lang_code,
                lang_code,
                voice_seed,
                prosody_creativity,
                duration_scaling_factor,
                1.0, 1.0, 1.0,
                emb1,
                0.0, 0.0, 0.0, 0.0, 0.0,
                -24.0
            )
        except Exception as e:
            print("[ERROR] Exception during TTS:", e)
            raise
        if result is None:
            print("[ERROR] TTS returned None — possible silent failure")
            # Return a 1-second silent placeholder so the UI still renders.
            return None, generate_spectrogram_image(np.zeros(16000), 16000)
        sr, wav, _ = result
        print("[INFO] TTS success — sample rate:", sr, " | waveform shape:", wav.shape)
        fig = generate_spectrogram_image(wav, sr)
        # float2pcm converts the float waveform into int PCM for gr.Audio.
        return (sr, float2pcm(wav)), fig

    def launch(self):
        """Build the Gradio Interface around ``read`` and start serving it."""
        gr.Interface(
            fn=self.read,
            inputs=[
                gr.Textbox(lines=2, placeholder="Type something...", value="What I cannot create, I do not understand.", label="Text input"),
                gr.Dropdown(self.text_selection, type="value", value='English (eng)', label="Select the Language"),
                gr.Slider(0.0, 0.8, step=0.1, value=0.5, label="Prosody Creativity"),
                gr.Slider(0.7, 1.3, step=0.1, value=1.0, label="Faster - Slower"),
                gr.Slider(0, self.available_artificial_voices, step=1, value=5, label="Random Voice Seed"),
                gr.Slider(-10.0, 10.0, step=0.1, value=0.0, label="Gender"),
                gr.Audio(type="filepath", show_label=True, container=True, label="[OPTIONAL] Voice to Clone"),
            ],
            outputs=[
                gr.Audio(type="numpy", label="Speech"),
                gr.Image(label="Visualization")
            ],
            title=self.title,
            allow_flagging="never",
            description=self.article,
            theme=gr.themes.Ocean(primary_hue="amber", secondary_hue="orange")
        ).launch()
if __name__ == '__main__':
    # Entry point: serve the demo locally on CPU.
    TTSWebUI(gpu_id="cpu").launch()