# Frysk-TTS / app.py
# (Hugging Face Space file header: commit c5ac27d "Update app.py" by phatdo — kept as a comment so the file remains valid Python.)
import spaces
import gradio as gr
import numpy as np
import os, subprocess, time, torch, yaml, re
import synthesize
from pandas import describe_option
from scipy.io.wavfile import read
from utils.model import get_model_infer, get_vocoder
gr.close_all()
title = "Open-source and open-access Frisian TTS<br>(from Phat Do's PhD research)"
description = """
<center>
<img src='/gradio_api/file=assets/Friesland.png' width=300px><br>
These are the prototype Frisian synthetic speech models intended to showcase the findings of my PhD research, which are described in Section 7.6 of the dissertation.<br>
The single-speaker model was trained on only <strong>20 minutes</strong> of data (recordings) using transfer learning from Dutch data. The multi-speaker model was trained from scratch on <strong>32 hours</strong> of Frisian data (from <a href='https://commonvoice.mozilla.org/fy-NL' target='_blank'>Mozilla Common Voice</a>).<br><br>
Please select the model's tab above, enter any Frisian text (or choose from the examples for convenience), select a speaker ID below (for the multi-speaker model), and click on Submit to synthesize the speech.<br>
</center>
"""
article = "<center>This is a basic demo version intended to accompany my PhD dissertation, showcasing the effects of several findings included in the research (please see the dissertation for details).<br> Due to hosting constraints, the model may be rather slow in synthesizing. Thank you for your patience!<br> Please reach out to <a href='https://www.rug.nl/staff/t.p.do/' target='_blank'>Phat Do</a> or email me at <a href='mailto:t.p.do@rug.nl' target='_blank'>t.p.do@rug.nl</a> if you are interested in knowing more!</center><br><br>To phonetically transcribe the input Frisian text, this work gratefully uses the G2P Frysk model and script kindly provided by the Fryske Akademy:<br> Heeringa, Wilbert & Drenth, Eduard & Van de Velde, Hans (2024). G2P Frysk [computer program]. Retrieved 4 July 2024 from <a href='https://www.fa.knaw.nl/fa-apps/graph2phon/' target='_blank'>https://www.fa.knaw.nl/fa-apps/graph2phon/</a>.<br><br>The Frisian training data is extracted from Mozilla Common Voice and the Dutch data from CSS10:<br>Ardila, R., Branson, M., Davis, K., Kohler, M., Meyer, J., Henretty, M., Morais, R., Saunders, L., Tyers, F., & Weber, G. (2020). <a href='https://commonvoice.mozilla.org/fy-NL' target='_blank'>Common Voice</a>: A Massively-Multilingual Speech Corpus. Proceedings of the 12th LREC, 4218–4222. <br>Park, K., & Mulc, T. (2019). CSS10: A collection of single speaker speech datasets for 10 languages. Proc. Interspeech 2019, 1566–1570. <a href='https://doi.org/10.21437/Interspeech.2019-1500' target='_blank'>https://doi.org/10.21437/Interspeech.2019-1500</a>."
css = """
h1 {
text-align: center;
display:block;
}
"""
def load_models(config_path, model_name, device = "cpu"):
    """Load the three YAML configs, the acoustic model checkpoint, and the vocoder.

    Args:
        config_path: directory (relative to the CWD, trailing slash included)
            holding ``preprocess.yaml``, ``model.yaml`` and ``train.yaml``.
        model_name: checkpoint filename under ``assets/model/``.
        device: torch device string the model/vocoder are placed on.

    Returns:
        ``(configs, model, vocoder)`` where ``configs`` is the
        (preprocess, model, train) tuple expected by the ``synthesize`` module.
    """
    base = os.getcwd()

    def _load_yaml(name):
        # Context manager so the config file handle is closed promptly
        # (the original left three file objects dangling).
        with open(os.path.join(base, config_path + name), "r") as f:
            return yaml.load(f, Loader=yaml.FullLoader)

    preprocess_config = _load_yaml("preprocess.yaml")
    model_config = _load_yaml("model.yaml")
    train_config = _load_yaml("train.yaml")
    configs = (preprocess_config, model_config, train_config)
    # Load the acoustic model from its checkpoint and the matching vocoder.
    model = get_model_infer(os.path.join(base, "assets/model/" + model_name), configs, device)
    vocoder = get_vocoder(model_config, device)
    return configs, model, vocoder
# Load both models once at startup (on CPU); infer() moves them to CUDA per request.
# Single-speaker model: transfer-learning checkpoint at step 350000.
configs_single, model_single, vocoder_single = load_models(config_path = "config/Frysk_CV_speaker_29/",
                                                           model_name = "Frysk_CV_speaker_29_350000.pth.tar")
# Multi-speaker model: trained-from-scratch checkpoint at step 300000.
configs_multi, model_multi, vocoder_multi = load_models(config_path = "config/Frysk_CV/",
                                                        model_name = "Frysk_CV_300000.pth.tar")
@spaces.GPU(duration=20)
def infer(text, speaker_ID, configs, model, vocoder):
    """Synthesize ``text`` and return ``(sample_rate, audio)``.

    Args:
        text: raw Frisian input text.
        speaker_ID: integer speaker index for the model's embedding table.
        configs: (preprocess, model, train) config tuple from load_models().
        model: acoustic model; moved to CUDA here (inside the GPU allocation).
        vocoder: neural vocoder; moved to CUDA here.

    Returns:
        ``(sr, audio)`` as produced by ``scipy.io.wavfile.read`` — the numpy
        format expected by ``gr.Audio(type="numpy")``.

    Raises:
        TimeoutError: if the synthesis output file never appears.
    """
    model.to('cuda')
    vocoder.to('cuda')
    speakers = np.array([int(speaker_ID)])
    texts = np.array([synthesize.preprocess_frysk(text, configs[0])])
    text_lens = np.array([len(texts[0])])
    # The (truncated) raw text doubles as the utterance ID for the output file.
    ids = raw_texts = [text[:100]]
    batchs = [(ids, raw_texts, speakers, texts, text_lens, max(text_lens))]
    # synthesize() writes its result to assets/audio/result.wav on disk
    # (no return value), so we poll for the file below.
    synthesize.synthesize(model, "step", configs, vocoder, batchs, (1.0, 1.0, 1.0))
    # Poll for the output, but give up after 60 s instead of spinning forever
    # if synthesis failed silently (the original loop had no exit on failure).
    result_path = os.path.join(os.getcwd(), 'assets/audio/result.wav')
    deadline = time.time() + 60
    while not os.path.exists(result_path):
        if time.time() > deadline:
            raise TimeoutError("Synthesis output never appeared: " + result_path)
        time.sleep(0.01)
    sr, audio = read(result_path)
    return sr, audio
# Main inference wrappers — one per tab of the demo.
@spaces.GPU(duration=20)
def run_single(text):
    """Synthesize ``text`` with the single-speaker (transfer-learning) model.

    Returns the ``(sample_rate, audio)`` tuple from infer().
    """
    # The single-speaker model has exactly one speaker: index 0.
    # (The original assigned a local speaker_ID that was never used.)
    return infer(text, 0, configs_single, model_single, vocoder_single)
@spaces.GPU(duration=20)
def run_multi(text, speaker_ID):
    """Synthesize ``text`` with the multi-speaker model.

    Args:
        text: raw Frisian input text.
        speaker_ID: 1-based speaker number (1..30) from the UI slider.

    Returns:
        The ``(sample_rate, audio)`` tuple from infer().
    """
    # Map the 1-based UI speaker number to the model's internal speaker index.
    ui_to_model = {1: 0, 2: 11, 3: 22, 4: 24, 5: 25, 6: 26, 7: 27, 8: 28, 9: 29, 10: 1, 11: 2, 12: 3, 13: 4, 14: 5, 15: 6, 16: 7, 17: 8, 18: 9, 19: 10, 20: 12, 21: 13, 22: 14, 23: 15, 24: 16, 25: 17, 26: 18, 27: 19, 28: 20, 29: 21, 30: 23}
    # Gradio sliders can deliver floats (e.g. 5.0); coerce to int so the
    # dictionary lookup does not raise KeyError.
    return infer(text, ui_to_model[int(speaker_ID)], configs_multi, model_multi, vocoder_multi)
# Tab 1: single-speaker (transfer-learning) model.
# NOTE: each examples row must have exactly as many values as there are input
# components; this interface has a single Textbox, so each row is just the
# text (the original rows carried a stray second value, a leftover speaker ID).
iface_single = gr.Interface(fn=run_single,
        inputs=[
            gr.components.Textbox(lines=3, placeholder="Please input Frisian text to synthesize", label='Text to synthesize'),
        ],
        outputs=gr.components.Audio(type="numpy", label='Synthesized speech'),
        title=title,
        description=description,
        article=article,
        css=css,
        theme='huggingface',
        examples=[
            ["Praat mar Frysk!"],
            ["Bûter, brea en griene tsiis, wa’t dat net sizze kin, is gjin oprjochte Fries!"],
            ["In lyk man is in ryk man."],
            ["As pake it net meitsje kin, dan slagget it gjinien!"],
            ["As it net kin sa’t it moat, dan moat it mar sa’t it kin."],
            ["Elk sprekt fan myn sûpen, mar nimmen wit fan myn toarst!"],
            ["Pikerje net it komt dochs oars."]
        ],
        )
# Tab 2: multi-speaker model. Inputs are the text plus a 1-based speaker
# number (1..30) that run_multi() maps to the model's internal speaker index.
# Each examples row matches the two input components: [text, speaker number].
iface_multi = gr.Interface(fn=run_multi,
        inputs=[
            gr.components.Textbox(lines=3, placeholder="Please input Frisian text to synthesize", label='Text to synthesize'),
            gr.components.Slider(minimum=1, maximum=30, step=1, label="Speaker ID (1 to 30)")
        ],
        outputs=gr.components.Audio(type="numpy", label='Synthesized speech'),
        title=title,
        description=description,
        article=article,
        css=css,
        theme='huggingface',
        examples=[
            ["Praat mar Frysk!", 1],
            ["Bûter, brea en griene tsiis, wa’t dat net sizze kin, is gjin oprjochte Fries!", 5],
            ["In lyk man is in ryk man.", 11],
            ["As pake it net meitsje kin, dan slagget it gjinien!", 17],
            ["As it net kin sa’t it moat, dan moat it mar sa’t it kin.", 20],
            ["Elk sprekt fan myn sûpen, mar nimmen wit fan myn toarst!", 25],
            ["Pikerje net it komt dochs oars.", 29]
        ],
        )
# Combine the two interfaces into one tabbed demo (single-speaker first).
demo = gr.TabbedInterface([iface_single, iface_multi], ["Single-speaker (transfer learning)", "Multi-speaker (trained from scratch)"])
if __name__ == "__main__":
    # Bind to 0.0.0.0:7860 (the standard HF Spaces port); allowed_paths lets
    # Gradio serve the static assets (e.g. the Friesland image) from disk.
    demo.launch(share=False, inline=False, server_name="0.0.0.0", server_port=7860, allowed_paths=["/", "./assets"], ssr_mode=False)