# Frisian TTS demo app (HuggingFace Spaces, running on ZeroGPU).
# Third-party and project-local imports.
import spaces
import gradio as gr
import numpy as np
# NOTE(review): subprocess and re appear unused in this file; torch may be
# imported for its CUDA initialization side effects — confirm before removing.
import os, subprocess, time, torch, yaml, re
import synthesize
# NOTE(review): describe_option is never used — looks like a leftover import.
from pandas import describe_option
from scipy.io.wavfile import read
from utils.model import get_model_infer, get_vocoder
# Close any Gradio apps left running from a previous (re)load of this module.
gr.close_all()
# Static page copy shown by both Gradio interfaces (title/description above
# the inputs, article below the outputs) plus a small CSS override.
title = "Open-source and open-access Frisian TTS<br>(from Phat Do's PhD research)"
# HTML blurb rendered under the title; the image is served from ./assets via
# the allowed_paths passed to demo.launch().
description = """
<center>
<img src='/gradio_api/file=assets/Friesland.png' width=300px><br>
These are the prototype Frisian synthetic speech models intended to showcase the findings of my PhD research, which are described in Section 7.6 of the dissertation.<br>
The single-speaker model was trained on only <strong>20 minutes</strong> of data (recordings) using transfer learning from Dutch data. The multi-speaker model was trained from scratch on <strong>32 hours</strong> of Frisian data (from <a href='https://commonvoice.mozilla.org/fy-NL' target='_blank'>Mozilla Common Voice</a>).<br><br>
Please select the model's tab above, enter any Frisian text (or choose from the examples for convenience), select a speaker ID below (for the multi-speaker model), and click on Submit to synthesize the speech.<br>
</center>
"""
# Footer with contact info and data/tooling acknowledgements.
article = "<center>This is a basic demo version intended to accompany my PhD dissertation, showcasing the effects of several findings included in the research (please see the dissertation for details).<br> Due to hosting constraints, the model may be rather slow in synthesizing. Thank you for your patience!<br> Please reach out to <a href='https://www.rug.nl/staff/t.p.do/' target='_blank'>Phat Do</a> or email me at <a href='mailto:t.p.do@rug.nl' target='_blank'>t.p.do@rug.nl</a> if you are interested in knowing more!</center><br><br>To phonetically transcribe the input Frisian text, this work gratefully uses the G2P Frysk model and script kindly provided by the Fryske Akademy:<br> Heeringa, Wilbert & Drenth, Eduard & Van de Velde, Hans (2024). G2P Frysk [computer program]. Retrieved 4 July 2024 from <a href='https://www.fa.knaw.nl/fa-apps/graph2phon/' target='_blank'>https://www.fa.knaw.nl/fa-apps/graph2phon/</a>.<br><br>The Frisian training data is extracted from Mozilla Common Voice and the Dutch data from CSS10:<br>Ardila, R., Branson, M., Davis, K., Kohler, M., Meyer, J., Henretty, M., Morais, R., Saunders, L., Tyers, F., & Weber, G. (2020). <a href='https://commonvoice.mozilla.org/fy-NL' target='_blank'>Common Voice</a>: A Massively-Multilingual Speech Corpus. Proceedings of the 12th LREC, 4218–4222. <br>Park, K., & Mulc, T. (2019). CSS10: A collection of single speaker speech datasets for 10 languages. Proc. Interspeech 2019, 1566–1570. <a href='https://doi.org/10.21437/Interspeech.2019-1500' target='_blank'>https://doi.org/10.21437/Interspeech.2019-1500</a>."
# Center the page title.
css = """
h1 {
text-align: center;
display:block;
}
"""
def load_models(config_path, model_name, device="cpu"):
    """Load a FastSpeech-style model, its vocoder, and its three YAML configs.

    Args:
        config_path: Directory (relative to the CWD, including the trailing
            slash) that holds preprocess.yaml, model.yaml and train.yaml.
        model_name: Checkpoint file name under assets/model/.
        device: Torch device string the model and vocoder are loaded onto.

    Returns:
        Tuple of ((preprocess_config, model_config, train_config), model,
        vocoder).
    """
    def _load_yaml(name):
        # Use a context manager so the file handle is closed deterministically
        # (the original left the open handles to the garbage collector).
        with open(os.path.join(os.getcwd(), config_path + name), "r") as f:
            return yaml.load(f, Loader=yaml.FullLoader)

    preprocess_config = _load_yaml("preprocess.yaml")
    model_config = _load_yaml("model.yaml")
    train_config = _load_yaml("train.yaml")
    configs = (preprocess_config, model_config, train_config)

    # Load the acoustic model from its checkpoint and the matching vocoder.
    model = get_model_infer(os.path.join(os.getcwd(), "assets/model/" + model_name), configs, device)
    vocoder = get_vocoder(model_config, device)
    return configs, model, vocoder
# Load both models once at import time so each request only pays inference
# cost. Single-speaker: transfer learning from Dutch, 20 min of Frisian data.
configs_single, model_single, vocoder_single = load_models(config_path = "config/Frysk_CV_speaker_29/",
                                                           model_name = "Frysk_CV_speaker_29_350000.pth.tar")
# Multi-speaker: trained from scratch on 32 h of Common Voice Frisian.
configs_multi, model_multi, vocoder_multi = load_models(config_path = "config/Frysk_CV/",
                                                        model_name = "Frysk_CV_300000.pth.tar")
@spaces.GPU(duration=20)
def infer(text, speaker_ID, configs, model, vocoder):
    """Synthesize `text` for one speaker and return (sample_rate, audio).

    The synthesis pipeline writes its output to assets/audio/result.wav as a
    side effect; this function polls for that file and reads it back.

    Args:
        text: Raw Frisian input text.
        speaker_ID: Internal speaker index of the model (int-convertible).
        configs: (preprocess_config, model_config, train_config) tuple.
        model: Acoustic model (moved to CUDA inside the GPU slice).
        vocoder: Vocoder model (moved to CUDA inside the GPU slice).

    Returns:
        (sr, audio) as read by scipy.io.wavfile.read — suitable for a Gradio
        Audio output of type "numpy".

    Raises:
        RuntimeError: If the output file never appears within the timeout.
    """
    result_path = os.path.join(os.getcwd(), 'assets/audio/result.wav')
    # Remove any stale output from a previous request: otherwise the polling
    # loop below would immediately pick up the OLD audio before the new
    # synthesis has finished writing.
    if os.path.exists(result_path):
        os.remove(result_path)

    # Models live on CPU between requests; move them onto the GPU slice.
    model.to('cuda')
    vocoder.to('cuda')

    speakers = np.array([int(speaker_ID)])
    texts = np.array([synthesize.preprocess_frysk(text, configs[0])])
    text_lens = np.array([len(texts[0])])
    # The (truncated) raw text doubles as the utterance id.
    ids = raw_texts = [text[:100]]
    batchs = [(ids, raw_texts, speakers, texts, text_lens, max(text_lens))]
    # (1.0, 1.0, 1.0) = neutral pitch/energy/duration control values.
    synthesize.synthesize(model, "step", configs, vocoder, batchs, (1.0, 1.0, 1.0))

    # Poll for the result file, but fail after 60 s instead of spinning
    # forever if synthesis silently errored out.
    deadline = time.time() + 60.0
    while not os.path.exists(result_path):
        if time.time() > deadline:
            raise RuntimeError("Synthesis timed out: assets/audio/result.wav was never written")
        time.sleep(0.01)
    sr, audio = read(result_path)
    return sr, audio
# Tab entry point for the single-speaker model.
@spaces.GPU(duration=20)
def run_single(text):
    """Synthesize `text` with the single-speaker model (speaker id fixed at 0)."""
    # The original assigned an unused local `speaker_ID = 0`; the constant is
    # passed directly instead.
    return infer(text, 0, configs_single, model_single, vocoder_single)
# Maps the user-facing speaker ID (1-30, as shown on the slider) to the
# model's internal speaker index. Hoisted to module level so the 30-entry
# dict is built once instead of on every request.
SPEAKER_ID_MAP = {1: 0, 2: 11, 3: 22, 4: 24, 5: 25, 6: 26, 7: 27, 8: 28, 9: 29, 10: 1, 11: 2, 12: 3, 13: 4, 14: 5, 15: 6, 16: 7, 17: 8, 18: 9, 19: 10, 20: 12, 21: 13, 22: 14, 23: 15, 24: 16, 25: 17, 26: 18, 27: 19, 28: 20, 29: 21, 30: 23}

# Tab entry point for the multi-speaker model.
@spaces.GPU(duration=20)
def run_multi(text, speaker_ID):
    """Synthesize `text` with the multi-speaker model for slider value `speaker_ID` (1-30)."""
    # int() guards against Gradio delivering the slider value as a float.
    return infer(text, SPEAKER_ID_MAP[int(speaker_ID)], configs_multi, model_multi, vocoder_multi)
# Single-speaker tab. NOTE: this interface has exactly ONE input component,
# so each example row must contain exactly one value — the original rows were
# two-element `[text, 0]` pairs, which mismatches the input count.
iface_single = gr.Interface(fn=run_single,
    inputs=[
        gr.components.Textbox(lines=3, placeholder="Please input Frisian text to synthesize", label='Text to synthesize'),
    ],
    outputs=gr.components.Audio(type="numpy", label='Synthesized speech'),
    title=title,
    description=description,
    article=article,
    css=css,
    theme='huggingface',
    examples=[
        ["Praat mar Frysk!"],
        ["Bûter, brea en griene tsiis, wa’t dat net sizze kin, is gjin oprjochte Fries!"],
        ["In lyk man is in ryk man."],
        ["As pake it net meitsje kin, dan slagget it gjinien!"],
        ["As it net kin sa’t it moat, dan moat it mar sa’t it kin."],
        ["Elk sprekt fan myn sûpen, mar nimmen wit fan myn toarst!"],
        ["Pikerje net it komt dochs oars."]
    ],
)
# Multi-speaker tab: same layout as the single-speaker one, plus a slider
# that selects one of the 30 Common Voice speakers. Each example row pairs a
# Frisian sentence with a speaker ID.
_multi_inputs = [
    gr.components.Textbox(lines=3, placeholder="Please input Frisian text to synthesize", label='Text to synthesize'),
    gr.components.Slider(minimum=1, maximum=30, step=1, label="Speaker ID (1 to 30)"),
]
_multi_examples = [
    ["Praat mar Frysk!", 1],
    ["Bûter, brea en griene tsiis, wa’t dat net sizze kin, is gjin oprjochte Fries!", 5],
    ["In lyk man is in ryk man.", 11],
    ["As pake it net meitsje kin, dan slagget it gjinien!", 17],
    ["As it net kin sa’t it moat, dan moat it mar sa’t it kin.", 20],
    ["Elk sprekt fan myn sûpen, mar nimmen wit fan myn toarst!", 25],
    ["Pikerje net it komt dochs oars.", 29],
]
iface_multi = gr.Interface(
    fn=run_multi,
    inputs=_multi_inputs,
    outputs=gr.components.Audio(type="numpy", label='Synthesized speech'),
    title=title,
    description=description,
    article=article,
    css=css,
    theme='huggingface',
    examples=_multi_examples,
)
# Combine both interfaces into a single tabbed demo.
demo = gr.TabbedInterface([iface_single, iface_multi], ["Single-speaker (transfer learning)", "Multi-speaker (trained from scratch)"])
if __name__ == "__main__":
    # allowed_paths lets Gradio serve the static image/audio assets referenced
    # in the description HTML; port 7860 is the HF Spaces convention.
    demo.launch(share=False, inline=False, server_name="0.0.0.0", server_port=7860, allowed_paths=["/", "./assets"], ssr_mode=False)