import spaces  # keep this import first (before torch), as recommended for ZeroGPU Spaces

import os
import time

import gradio as gr
import numpy as np
import torch
import yaml
from scipy.io.wavfile import read

# Project-local modules
import synthesize
from utils.model import get_model_infer, get_vocoder

gr.close_all()
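# Expected file layout, inferred from the paths used below (adjust if your checkout differs):
#   config/Frysk_CV_speaker_29/{preprocess,model,train}.yaml   - single-speaker configs
#   config/Frysk_CV/{preprocess,model,train}.yaml              - multi-speaker configs
#   assets/model/Frysk_CV_speaker_29_350000.pth.tar            - single-speaker checkpoint
#   assets/model/Frysk_CV_300000.pth.tar                       - multi-speaker checkpoint
#   assets/audio/result.wav                                    - written by synthesize.synthesize()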
title = "Open-source and open-access Frisian TTS (from Phat Do's PhD research)"

description = """

These are the prototype Frisian synthetic speech models intended to showcase the findings of my PhD research, which are described in Section 7.6 of the dissertation.
The single-speaker model was trained on only 20 minutes of data (recordings) using transfer learning from Dutch data. The multi-speaker model was trained from scratch on 32 hours of Frisian data (from Mozilla Common Voice).

Please select a model tab above, enter any Frisian text (or choose one of the examples for convenience), select a speaker ID (for the multi-speaker model only), and click Submit to synthesize the speech.
"""

article = """
This is a basic demo version intended to accompany my PhD dissertation, showcasing the effects of several findings included in the research (please see the dissertation for details).
Due to hosting constraints, the models may be rather slow to synthesize speech. Thank you for your patience!
Please reach out to me (Phat Do) at t.p.do@rug.nl if you are interested in knowing more!


To phonetically transcribe the input Frisian text, this work gratefully uses the G2P Frysk model and script kindly provided by the Fryske Akademy:
Heeringa, Wilbert & Drenth, Eduard & Van de Velde, Hans (2024). G2P Frysk [computer program]. Retrieved 4 July 2024 from https://www.fa.knaw.nl/fa-apps/graph2phon/.

The Frisian training data is extracted from Mozilla Common Voice and the Dutch data from CSS10:
Ardila, R., Branson, M., Davis, K., Kohler, M., Meyer, J., Henretty, M., Morais, R., Saunders, L., Tyers, F., & Weber, G. (2020). Common Voice: A Massively-Multilingual Speech Corpus. Proceedings of the 12th LREC, 4218–4222.
Park, K., & Mulc, T. (2019). CSS10: A collection of single speaker speech datasets for 10 languages. Proc. Interspeech 2019, 1566–1570. https://doi.org/10.21437/Interspeech.2019-1500.
"""

css = """
h1 {
    text-align: center;
    display: block;
}
"""


def load_models(config_path, model_name, device="cpu"):
    # Load the preprocessing, model, and training configs for this model variant.
    preprocess_config = yaml.load(open(os.path.join(os.getcwd(), config_path + "preprocess.yaml"), "r"), Loader=yaml.FullLoader)
    model_config = yaml.load(open(os.path.join(os.getcwd(), config_path + "model.yaml"), "r"), Loader=yaml.FullLoader)
    train_config = yaml.load(open(os.path.join(os.getcwd(), config_path + "train.yaml"), "r"), Loader=yaml.FullLoader)
    configs = (preprocess_config, model_config, train_config)

    # Load the acoustic model checkpoint and the vocoder.
    model = get_model_infer(os.path.join(os.getcwd(), "assets/model/" + model_name), configs, device)
    vocoder = get_vocoder(model_config, device)

    return configs, model, vocoder


configs_single, model_single, vocoder_single = load_models(config_path="config/Frysk_CV_speaker_29/", model_name="Frysk_CV_speaker_29_350000.pth.tar")
configs_multi, model_multi, vocoder_multi = load_models(config_path="config/Frysk_CV/", model_name="Frysk_CV_300000.pth.tar")


# Main inference function: phonetically preprocess the input text, synthesize,
# and read back the waveform written to disk by synthesize.synthesize().
@spaces.GPU(duration=20)
def infer(text, speaker_ID, configs, model, vocoder):
    model.to('cuda')
    vocoder.to('cuda')

    speakers = np.array([int(speaker_ID)])
    texts = np.array([synthesize.preprocess_frysk(text, configs[0])])
    text_lens = np.array([len(texts[0])])
    ids = raw_texts = [text[:100]]
    batchs = [(ids, raw_texts, speakers, texts, text_lens, max(text_lens))]

    synthesize.synthesize(model, "step", configs, vocoder, batchs, (1.0, 1.0, 1.0))

    # Wait until the synthesized audio has been written, then read it back.
    result_path = os.path.join(os.getcwd(), 'assets/audio/result.wav')
    while not os.path.exists(result_path):
        time.sleep(0.01)
    sr, audio = read(result_path)

    return sr, audio


@spaces.GPU(duration=20)
def run_single(text):
    # The single-speaker model has only one speaker (internal ID 0).
    return infer(text, 0, configs_single, model_single, vocoder_single)


@spaces.GPU(duration=20)
def run_multi(text, speaker_ID):
    # Map the speaker IDs shown in the UI (1 to 30) to the internal speaker
    # indices used during training.
    dic = {1: 0, 2: 11, 3: 22, 4: 24, 5: 25, 6: 26, 7: 27, 8: 28, 9: 29, 10: 1,
           11: 2, 12: 3, 13: 4, 14: 5, 15: 6, 16: 7, 17: 8, 18: 9, 19: 10, 20: 12,
           21: 13, 22: 14, 23: 15, 24: 16, 25: 17, 26: 18, 27: 19, 28: 20, 29: 21, 30: 23}
    return infer(text, dic[speaker_ID], configs_multi, model_multi, vocoder_multi)


iface_single = gr.Interface(
    fn=run_single,
    inputs=[
        gr.components.Textbox(lines=3, placeholder="Please input Frisian text to synthesize", label='Text to synthesize'),
    ],
    outputs=gr.components.Audio(type="numpy", label='Synthesized speech'),
    title=title,
    description=description,
    article=article,
    css=css,
    theme='huggingface',
    examples=[
        ["Praat mar Frysk!"],
        ["Bûter, brea en griene tsiis, wa’t dat net sizze kin, is gjin oprjochte Fries!"],
        ["In lyk man is in ryk man."],
        ["As pake it net meitsje kin, dan slagget it gjinien!"],
        ["As it net kin sa’t it moat, dan moat it mar sa’t it kin."],
        ["Elk sprekt fan myn sûpen, mar nimmen wit fan myn toarst!"],
        ["Pikerje net it komt dochs oars."],
    ],
)

iface_multi = gr.Interface(
    fn=run_multi,
    inputs=[
        gr.components.Textbox(lines=3, placeholder="Please input Frisian text to synthesize", label='Text to synthesize'),
        gr.components.Slider(minimum=1, maximum=30, step=1, label="Speaker ID (1 to 30)"),
    ],
    outputs=gr.components.Audio(type="numpy", label='Synthesized speech'),
    title=title,
    description=description,
    article=article,
    css=css,
    theme='huggingface',
    examples=[
        ["Praat mar Frysk!", 1],
        ["Bûter, brea en griene tsiis, wa’t dat net sizze kin, is gjin oprjochte Fries!", 5],
        ["In lyk man is in ryk man.", 11],
        ["As pake it net meitsje kin, dan slagget it gjinien!", 17],
        ["As it net kin sa’t it moat, dan moat it mar sa’t it kin.", 20],
        ["Elk sprekt fan myn sûpen, mar nimmen wit fan myn toarst!", 25],
        ["Pikerje net it komt dochs oars.", 29],
    ],
)

demo = gr.TabbedInterface([iface_single, iface_multi], ["Single-speaker (transfer learning)", "Multi-speaker (trained from scratch)"])

if __name__ == "__main__":
    demo.launch(share=False, inline=False, server_name="0.0.0.0", server_port=7860, allowed_paths=["/", "./assets"], ssr_mode=False)
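
# --- Optional offline usage (a minimal sketch, not part of the hosted demo) ---
# run_single() and run_multi() return (sample_rate, waveform) tuples, so the models
# can also be used without the Gradio UI. This assumes a CUDA-capable GPU, since
# infer() moves the models to 'cuda'; the output file names are just examples.
#
# from scipy.io.wavfile import write
#
# sr, audio = run_single("Praat mar Frysk!")
# write("example_single_speaker.wav", sr, audio)
#
# sr, audio = run_multi("Praat mar Frysk!", speaker_ID=5)
# write("example_multi_speaker.wav", sr, audio)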