# Frysk-TTS / app.py
# (Hugging Face Space file header: commit c5ac27d "Update app.py" by phatdo — kept as a comment so the file remains valid Python.)
import spaces
import gradio as gr
import numpy as np
import os, subprocess, time, torch, yaml, re
import synthesize
from pandas import describe_option
from scipy.io.wavfile import read
from utils.model import get_model_infer, get_vocoder
gr.close_all()
title = "Open-source and open-access Frisian TTS<br>(from Phat Do's PhD research)"
description = """
<center>
<img src='/gradio_api/file=assets/Friesland.png' width=300px><br>
These are the prototype Frisian synthetic speech models intended to showcase the findings of my PhD research, which are described in Section 7.6 of the dissertation.<br>
The single-speaker model was trained on only <strong>20 minutes</strong> of data (recordings) using transfer learning from Dutch data. The multi-speaker model was trained from scratch on <strong>32 hours</strong> of Frisian data (from <a href='https://commonvoice.mozilla.org/fy-NL' target='_blank'>Mozilla Common Voice</a>).<br><br>
Please select the model's tab above, enter any Frisian text (or choose from the examples for convenience), select a speaker ID below (for the multi-speaker model), and click on Submit to synthesize the speech.<br>
</center>
"""
article = "<center>This is a basic demo version intended to accompany my PhD dissertation, showcasing the effects of several findings included in the research (please see the dissertation for details).<br> Due to hosting constraints, the model may be rather slow in synthesizing. Thank you for your patience!<br> Please reach out to <a href='https://www.rug.nl/staff/t.p.do/' target='_blank'>Phat Do</a> or email me at <a href='mailto:t.p.do@rug.nl' target='_blank'>t.p.do@rug.nl</a> if you are interested in knowing more!</center><br><br>To phonetically transcribe the input Frisian text, this work gratefully uses the G2P Frysk model and script kindly provided by the Fryske Akademy:<br> Heeringa, Wilbert & Drenth, Eduard & Van de Velde, Hans (2024). G2P Frysk [computer program]. Retrieved 4 July 2024 from <a href='https://www.fa.knaw.nl/fa-apps/graph2phon/' target='_blank'>https://www.fa.knaw.nl/fa-apps/graph2phon/</a>.<br><br>The Frisian training data is extracted from Mozilla Common Voice and the Dutch data from CSS10:<br>Ardila, R., Branson, M., Davis, K., Kohler, M., Meyer, J., Henretty, M., Morais, R., Saunders, L., Tyers, F., & Weber, G. (2020). <a href='https://commonvoice.mozilla.org/fy-NL' target='_blank'>Common Voice</a>: A Massively-Multilingual Speech Corpus. Proceedings of the 12th LREC, 4218–4222. <br>Park, K., & Mulc, T. (2019). CSS10: A collection of single speaker speech datasets for 10 languages. Proc. Interspeech 2019, 1566–1570. <a href='https://doi.org/10.21437/Interspeech.2019-1500' target='_blank'>https://doi.org/10.21437/Interspeech.2019-1500</a>."
css = """
h1 {
text-align: center;
display:block;
}
"""
def load_models(config_path, model_name, device = "cpu"):
    """Load the three YAML configs, the acoustic model checkpoint, and the vocoder.

    Args:
        config_path: directory (relative to the CWD, trailing slash included)
            holding ``preprocess.yaml``, ``model.yaml`` and ``train.yaml``.
        model_name: checkpoint filename under ``assets/model/``.
        device: torch device string the model/vocoder are placed on.

    Returns:
        ``(configs, model, vocoder)`` where ``configs`` is the
        (preprocess, model, train) tuple expected by the ``synthesize`` module.
    """
    base = os.getcwd()

    def _load_yaml(name):
        # Context manager so the config file handle is closed promptly
        # (the original left three file objects dangling).
        with open(os.path.join(base, config_path + name), "r") as f:
            return yaml.load(f, Loader=yaml.FullLoader)

    preprocess_config = _load_yaml("preprocess.yaml")
    model_config = _load_yaml("model.yaml")
    train_config = _load_yaml("train.yaml")
    configs = (preprocess_config, model_config, train_config)
    # Load the acoustic model from its checkpoint and the matching vocoder.
    model = get_model_infer(os.path.join(base, "assets/model/" + model_name), configs, device)
    vocoder = get_vocoder(model_config, device)
    return configs, model, vocoder
# Load both models once at startup (on CPU); infer() moves them to CUDA per request.
# Single-speaker model: transfer-learning checkpoint at step 350000.
configs_single, model_single, vocoder_single = load_models(config_path = "config/Frysk_CV_speaker_29/",
                                                           model_name = "Frysk_CV_speaker_29_350000.pth.tar")
# Multi-speaker model: trained-from-scratch checkpoint at step 300000.
configs_multi, model_multi, vocoder_multi = load_models(config_path = "config/Frysk_CV/",
                                                        model_name = "Frysk_CV_300000.pth.tar")
@spaces.GPU(duration=20)
def infer(text, speaker_ID, configs, model, vocoder):
    """Synthesize ``text`` and return ``(sample_rate, audio)``.

    Args:
        text: raw Frisian input text.
        speaker_ID: integer speaker index for the model's embedding table.
        configs: (preprocess, model, train) config tuple from load_models().
        model: acoustic model; moved to CUDA here (inside the GPU allocation).
        vocoder: neural vocoder; moved to CUDA here.

    Returns:
        ``(sr, audio)`` as produced by ``scipy.io.wavfile.read`` — the numpy
        format expected by ``gr.Audio(type="numpy")``.

    Raises:
        TimeoutError: if the synthesis output file never appears.
    """
    model.to('cuda')
    vocoder.to('cuda')
    speakers = np.array([int(speaker_ID)])
    texts = np.array([synthesize.preprocess_frysk(text, configs[0])])
    text_lens = np.array([len(texts[0])])
    # The (truncated) raw text doubles as the utterance ID for the output file.
    ids = raw_texts = [text[:100]]
    batchs = [(ids, raw_texts, speakers, texts, text_lens, max(text_lens))]
    # synthesize() writes its result to assets/audio/result.wav on disk
    # (no return value), so we poll for the file below.
    synthesize.synthesize(model, "step", configs, vocoder, batchs, (1.0, 1.0, 1.0))
    # Poll for the output, but give up after 60 s instead of spinning forever
    # if synthesis failed silently (the original loop had no exit on failure).
    result_path = os.path.join(os.getcwd(), 'assets/audio/result.wav')
    deadline = time.time() + 60
    while not os.path.exists(result_path):
        if time.time() > deadline:
            raise TimeoutError("Synthesis output never appeared: " + result_path)
        time.sleep(0.01)
    sr, audio = read(result_path)
    return sr, audio
# Main inference wrappers — one per tab of the demo.
@spaces.GPU(duration=20)
def run_single(text):
    """Synthesize ``text`` with the single-speaker (transfer-learning) model.

    Returns the ``(sample_rate, audio)`` tuple from infer().
    """
    # The single-speaker model has exactly one speaker: index 0.
    # (The original assigned a local speaker_ID that was never used.)
    return infer(text, 0, configs_single, model_single, vocoder_single)
@spaces.GPU(duration=20)
def run_multi(text, speaker_ID):
    """Synthesize ``text`` with the multi-speaker model.

    Args:
        text: raw Frisian input text.
        speaker_ID: 1-based speaker number (1..30) from the UI slider.

    Returns:
        The ``(sample_rate, audio)`` tuple from infer().
    """
    # Map the 1-based UI speaker number to the model's internal speaker index.
    ui_to_model = {1: 0, 2: 11, 3: 22, 4: 24, 5: 25, 6: 26, 7: 27, 8: 28, 9: 29, 10: 1, 11: 2, 12: 3, 13: 4, 14: 5, 15: 6, 16: 7, 17: 8, 18: 9, 19: 10, 20: 12, 21: 13, 22: 14, 23: 15, 24: 16, 25: 17, 26: 18, 27: 19, 28: 20, 29: 21, 30: 23}
    # Gradio sliders can deliver floats (e.g. 5.0); coerce to int so the
    # dictionary lookup does not raise KeyError.
    return infer(text, ui_to_model[int(speaker_ID)], configs_multi, model_multi, vocoder_multi)
# Tab 1: single-speaker (transfer-learning) model.
# NOTE: each examples row must have exactly as many values as there are input
# components; this interface has a single Textbox, so each row is just the
# text (the original rows carried a stray second value, a leftover speaker ID).
iface_single = gr.Interface(fn=run_single,
        inputs=[
            gr.components.Textbox(lines=3, placeholder="Please input Frisian text to synthesize", label='Text to synthesize'),
        ],
        outputs=gr.components.Audio(type="numpy", label='Synthesized speech'),
        title=title,
        description=description,
        article=article,
        css=css,
        theme='huggingface',
        examples=[
            ["Praat mar Frysk!"],
            ["Bûter, brea en griene tsiis, wa’t dat net sizze kin, is gjin oprjochte Fries!"],
            ["In lyk man is in ryk man."],
            ["As pake it net meitsje kin, dan slagget it gjinien!"],
            ["As it net kin sa’t it moat, dan moat it mar sa’t it kin."],
            ["Elk sprekt fan myn sûpen, mar nimmen wit fan myn toarst!"],
            ["Pikerje net it komt dochs oars."]
        ],
        )
# Tab 2: multi-speaker model. Inputs are the text plus a 1-based speaker
# number (1..30) that run_multi() maps to the model's internal speaker index.
# Each examples row matches the two input components: [text, speaker number].
iface_multi = gr.Interface(fn=run_multi,
        inputs=[
            gr.components.Textbox(lines=3, placeholder="Please input Frisian text to synthesize", label='Text to synthesize'),
            gr.components.Slider(minimum=1, maximum=30, step=1, label="Speaker ID (1 to 30)")
        ],
        outputs=gr.components.Audio(type="numpy", label='Synthesized speech'),
        title=title,
        description=description,
        article=article,
        css=css,
        theme='huggingface',
        examples=[
            ["Praat mar Frysk!", 1],
            ["Bûter, brea en griene tsiis, wa’t dat net sizze kin, is gjin oprjochte Fries!", 5],
            ["In lyk man is in ryk man.", 11],
            ["As pake it net meitsje kin, dan slagget it gjinien!", 17],
            ["As it net kin sa’t it moat, dan moat it mar sa’t it kin.", 20],
            ["Elk sprekt fan myn sûpen, mar nimmen wit fan myn toarst!", 25],
            ["Pikerje net it komt dochs oars.", 29]
        ],
        )
# Combine the two interfaces into one tabbed demo (single-speaker first).
demo = gr.TabbedInterface([iface_single, iface_multi], ["Single-speaker (transfer learning)", "Multi-speaker (trained from scratch)"])
if __name__ == "__main__":
    # Bind to 0.0.0.0:7860 (the standard HF Spaces port); allowed_paths lets
    # Gradio serve the static assets (e.g. the Friesland image) from disk.
    demo.launch(share=False, inline=False, server_name="0.0.0.0", server_port=7860, allowed_paths=["/", "./assets"], ssr_mode=False)