Spaces: Running on Zero
| import spaces | |
| import gradio as gr | |
| import numpy as np | |
| import os, subprocess, time, torch, yaml, re | |
| import synthesize | |
| from pandas import describe_option | |
| from scipy.io.wavfile import read | |
| from utils.model import get_model_infer, get_vocoder | |
gr.close_all()  # shut down any Gradio servers left over from a previous run in this process

# Page title, rendered as HTML above both tabs.
title = "Open-source and open-access Frisian TTS<br>(from Phat Do's PhD research)"
# Intro blurb shown under the title; the image is served from ./assets via
# Gradio's file route (see allowed_paths in demo.launch at the bottom).
description = """
<center>
<img src='/gradio_api/file=assets/Friesland.png' width=300px><br>
These are the prototype Frisian synthetic speech models intended to showcase the findings of my PhD research, which are described in Section 7.6 of the dissertation.<br>
The single-speaker model was trained on only <strong>20 minutes</strong> of data (recordings) using transfer learning from Dutch data. The multi-speaker model was trained from scratch on <strong>32 hours</strong> of Frisian data (from <a href='https://commonvoice.mozilla.org/fy-NL' target='_blank'>Mozilla Common Voice</a>).<br><br>
Please select the model's tab above, enter any Frisian text (or choose from the examples for convenience), select a speaker ID below (for the multi-speaker model), and click on Submit to synthesize the speech.<br>
</center>
"""
# Footer with contact info plus G2P / training-data citations (rendered below the interface).
article = "<center>This is a basic demo version intended to accompany my PhD dissertation, showcasing the effects of several findings included in the research (please see the dissertation for details).<br> Due to hosting constraints, the model may be rather slow in synthesizing. Thank you for your patience!<br> Please reach out to <a href='https://www.rug.nl/staff/t.p.do/' target='_blank'>Phat Do</a> or email me at <a href='mailto:t.p.do@rug.nl' target='_blank'>t.p.do@rug.nl</a> if you are interested in knowing more!</center><br><br>To phonetically transcribe the input Frisian text, this work gratefully uses the G2P Frysk model and script kindly provided by the Fryske Akademy:<br> Heeringa, Wilbert & Drenth, Eduard & Van de Velde, Hans (2024). G2P Frysk [computer program]. Retrieved 4 July 2024 from <a href='https://www.fa.knaw.nl/fa-apps/graph2phon/' target='_blank'>https://www.fa.knaw.nl/fa-apps/graph2phon/</a>.<br><br>The Frisian training data is extracted from Mozilla Common Voice and the Dutch data from CSS10:<br>Ardila, R., Branson, M., Davis, K., Kohler, M., Meyer, J., Henretty, M., Morais, R., Saunders, L., Tyers, F., & Weber, G. (2020). <a href='https://commonvoice.mozilla.org/fy-NL' target='_blank'>Common Voice</a>: A Massively-Multilingual Speech Corpus. Proceedings of the 12th LREC, 4218–4222. <br>Park, K., & Mulc, T. (2019). CSS10: A collection of single speaker speech datasets for 10 languages. Proc. Interspeech 2019, 1566–1570. <a href='https://doi.org/10.21437/Interspeech.2019-1500' target='_blank'>https://doi.org/10.21437/Interspeech.2019-1500</a>."
# Minimal CSS: center the page title.
css = """
h1 {
text-align: center;
display:block;
}
"""
def load_models(config_path, model_name, device="cpu"):
    """Load the three YAML configs plus the acoustic model and vocoder.

    Args:
        config_path: directory (relative to the CWD) containing
            preprocess.yaml / model.yaml / train.yaml.
        model_name: checkpoint file name under assets/model/.
        device: torch device string for the loaded weights (default "cpu").

    Returns:
        (configs, model, vocoder) where configs is the tuple
        (preprocess_config, model_config, train_config).
    """
    base = os.getcwd()

    def _load_yaml(filename):
        # Fix: the original opened the file without closing it (handle leak);
        # a context manager guarantees the close even on parse errors.
        with open(os.path.join(base, config_path + filename), "r") as f:
            return yaml.load(f, Loader=yaml.FullLoader)

    preprocess_config = _load_yaml("preprocess.yaml")
    model_config = _load_yaml("model.yaml")
    train_config = _load_yaml("train.yaml")
    configs = (preprocess_config, model_config, train_config)

    # Load model checkpoint and matching vocoder onto the requested device.
    model = get_model_infer(os.path.join(base, "assets/model/" + model_name), configs, device)
    vocoder = get_vocoder(model_config, device)
    return configs, model, vocoder
# Load both checkpoints once at import time so every request reuses them.
# Single-speaker model (per the description: transfer learning from Dutch data).
configs_single, model_single, vocoder_single = load_models(config_path = "config/Frysk_CV_speaker_29/",
                                                           model_name = "Frysk_CV_speaker_29_350000.pth.tar")
# Multi-speaker model (per the description: trained from scratch on Common Voice Frisian).
configs_multi, model_multi, vocoder_multi = load_models(config_path = "config/Frysk_CV/",
                                                        model_name = "Frysk_CV_300000.pth.tar")
def infer(text, speaker_ID, configs, model, vocoder):
    """Synthesize `text` for `speaker_ID` and return (sample_rate, audio ndarray).

    The synthesis pipeline writes its output to assets/audio/result.wav,
    which this function polls for and then reads back.

    Raises:
        TimeoutError: if the output file does not appear within 120 seconds.
    """
    # NOTE(review): the file imports `spaces` but never uses it; on HF ZeroGPU
    # hardware this function likely needs the @spaces.GPU decorator for CUDA
    # to be available — confirm against the deployment target.
    model.to('cuda')
    vocoder.to('cuda')

    result_path = os.path.join(os.getcwd(), 'assets/audio/result.wav')
    # Bug fix: remove any output left by a previous request. Otherwise the
    # polling loop below sees the stale file immediately and returns the
    # PREVIOUS request's audio instead of this one's.
    if os.path.exists(result_path):
        os.remove(result_path)

    speakers = np.array([int(speaker_ID)])
    texts = np.array([synthesize.preprocess_frysk(text, configs[0])])
    text_lens = np.array([len(texts[0])])
    ids = raw_texts = [text[:100]]  # truncated raw text doubles as the batch id
    batchs = [(ids, raw_texts, speakers, texts, text_lens, max(text_lens))]
    synthesize.synthesize(model, "step", configs, vocoder, batchs, (1.0, 1.0, 1.0))

    # Bug fix: bound the wait — the original spun forever if synthesis failed,
    # hanging the request. 120 s is generous for a short utterance on CPU.
    deadline = time.time() + 120
    while not os.path.exists(result_path):
        if time.time() > deadline:
            raise TimeoutError("Synthesis did not produce assets/audio/result.wav in time")
        time.sleep(0.01)
    sr, audio = read(result_path)
    return sr, audio
| # main inference function | |
# main inference function
def run_single(text):
    """Synthesize `text` with the single-speaker model and return (sr, audio)."""
    # Fix: dropped the unused local `speaker_ID = 0`; the single-speaker
    # checkpoint only has speaker 0, so the ID is passed as a literal.
    return infer(text, 0, configs_single, model_single, vocoder_single)
def run_multi(text, speaker_ID):
    """Synthesize `text` with the multi-speaker model for UI speaker 1-30."""
    # Map the slider's 1-based speaker number onto the model's internal
    # speaker-embedding index (an arbitrary permutation fixed at training time).
    internal_ids = (0, 11, 22, 24, 25, 26, 27, 28, 29, 1, 2, 3, 4, 5, 6,
                    7, 8, 9, 10, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 23)
    speaker_map = dict(zip(range(1, 31), internal_ids))
    return infer(text, speaker_map[speaker_ID], configs_multi, model_multi, vocoder_multi)
# Single-speaker tab: one text input, one audio output.
iface_single = gr.Interface(fn=run_single,
                            inputs=[
                                gr.components.Textbox(lines=3, placeholder="Please input Frisian text to synthesize", label='Text to synthesize'),
                            ],
                            outputs=gr.components.Audio(type="numpy", label='Synthesized speech'),
                            title=title,
                            description=description,
                            article=article,
                            css=css,
                            theme='huggingface',  # NOTE(review): string themes are deprecated in Gradio 4+ — confirm installed version
                            # Bug fix: this interface has a single input component, but each
                            # example row previously carried a stray second value (a speaker
                            # ID), which Gradio rejects because every example must have one
                            # value per input component.
                            examples=[
                                ["Praat mar Frysk!"],
                                ["Bûter, brea en griene tsiis, wa’t dat net sizze kin, is gjin oprjochte Fries!"],
                                ["In lyk man is in ryk man."],
                                ["As pake it net meitsje kin, dan slagget it gjinien!"],
                                ["As it net kin sa’t it moat, dan moat it mar sa’t it kin."],
                                ["Elk sprekt fan myn sûpen, mar nimmen wit fan myn toarst!"],
                                ["Pikerje net it komt dochs oars."]
                            ],
                            )
# Multi-speaker tab: text input plus a 1-30 speaker slider (remapped to
# internal speaker indices inside run_multi).
iface_multi = gr.Interface(fn=run_multi,
                           inputs=[
                               gr.components.Textbox(lines=3, placeholder="Please input Frisian text to synthesize", label='Text to synthesize'),
                               gr.components.Slider(minimum=1, maximum=30, step=1, label="Speaker ID (1 to 30)")
                           ],
                           outputs=gr.components.Audio(type="numpy", label='Synthesized speech'),
                           title=title,
                           description=description,
                           article=article,
                           css=css,
                           theme='huggingface',  # NOTE(review): string themes are deprecated in Gradio 4+ — confirm installed version
                           # Each example supplies (text, UI speaker number) — one value per input.
                           examples=[
                               ["Praat mar Frysk!", 1],
                               ["Bûter, brea en griene tsiis, wa’t dat net sizze kin, is gjin oprjochte Fries!", 5],
                               ["In lyk man is in ryk man.", 11],
                               ["As pake it net meitsje kin, dan slagget it gjinien!", 17],
                               ["As it net kin sa’t it moat, dan moat it mar sa’t it kin.", 20],
                               ["Elk sprekt fan myn sûpen, mar nimmen wit fan myn toarst!", 25],
                               ["Pikerje net it komt dochs oars.", 29]
                           ],
                           )
# Combine the two interfaces into one tabbed app.
demo = gr.TabbedInterface([iface_single, iface_multi], ["Single-speaker (transfer learning)", "Multi-speaker (trained from scratch)"])
if __name__ == "__main__":
    # Security fix: allowed_paths previously included "/", which lets Gradio's
    # file route serve ANY file on the host filesystem. Only ./assets is needed
    # (the header image and the synthesized audio live there).
    demo.launch(share=False, inline=False, server_name="0.0.0.0", server_port=7860, allowed_paths=["./assets"], ssr_mode=False)