File size: 8,082 Bytes
5036fe3
 
324a90a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32d5d94
324a90a
0fba8ff
324a90a
 
 
0fba8ff
324a90a
2173f6e
 
 
 
 
 
 
5ba2040
324a90a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288d3b7
324a90a
5ba2040
 
324a90a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288d3b7
324a90a
 
 
 
 
288d3b7
324a90a
 
 
 
 
 
 
 
 
 
 
 
 
2173f6e
324a90a
 
 
0f06266
324a90a
 
0f06266
324a90a
 
 
 
 
 
 
 
 
 
 
 
 
 
2173f6e
324a90a
 
 
4601f8c
324a90a
 
4601f8c
324a90a
 
 
 
 
 
 
 
c5ac27d
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import spaces

import gradio as gr
import numpy as np

import os, subprocess, time, torch, yaml, re
import synthesize

from pandas import describe_option
from scipy.io.wavfile import read
from utils.model import get_model_infer, get_vocoder

# Shut down any Gradio servers left over from a previous run in this process.
gr.close_all()

# UI copy (HTML is allowed). `title` and `description` render above the
# interface, `article` below it — see the gr.Interface(...) calls further down.
title = "Open-source and open-access Frisian TTS<br>(from Phat Do's PhD research)"
description = """
<center>
<img src='/gradio_api/file=assets/Friesland.png' width=300px><br>
These are the prototype Frisian synthetic speech models intended to showcase the findings of my PhD research, which are described in Section 7.6 of the dissertation.<br>
The single-speaker model was trained on only <strong>20 minutes</strong> of data (recordings) using transfer learning from Dutch data. The multi-speaker model was trained from scratch on <strong>32 hours</strong> of Frisian data (from <a href='https://commonvoice.mozilla.org/fy-NL' target='_blank'>Mozilla Common Voice</a>).<br><br>
Please select the model's tab above, enter any Frisian text (or choose from the examples for convenience), select a speaker ID below (for the multi-speaker model), and click on Submit to synthesize the speech.<br>
</center>
"""
# Credits and citations (G2P Frysk, Common Voice, CSS10) shown under the demo.
article = "<center>This is a basic demo version intended to accompany my PhD dissertation, showcasing the effects of several findings included in the research (please see the dissertation for details).<br> Due to hosting constraints, the model may be rather slow in synthesizing. Thank you for your patience!<br> Please reach out to <a href='https://www.rug.nl/staff/t.p.do/' target='_blank'>Phat Do</a> or email me at <a href='mailto:t.p.do@rug.nl' target='_blank'>t.p.do@rug.nl</a> if you are interested in knowing more!</center><br><br>To phonetically transcribe the input Frisian text, this work gratefully uses the G2P Frysk model and script kindly provided by the Fryske Akademy:<br> Heeringa, Wilbert & Drenth, Eduard & Van de Velde, Hans (2024). G2P Frysk [computer program]. Retrieved 4 July 2024 from <a href='https://www.fa.knaw.nl/fa-apps/graph2phon/' target='_blank'>https://www.fa.knaw.nl/fa-apps/graph2phon/</a>.<br><br>The Frisian training data is extracted from Mozilla Common Voice and the Dutch data from CSS10:<br>Ardila, R., Branson, M., Davis, K., Kohler, M., Meyer, J., Henretty, M., Morais, R., Saunders, L., Tyers, F., & Weber, G. (2020). <a href='https://commonvoice.mozilla.org/fy-NL' target='_blank'>Common Voice</a>: A Massively-Multilingual Speech Corpus. Proceedings of the 12th LREC, 4218–4222. <br>Park, K., & Mulc, T. (2019). CSS10: A collection of single speaker speech datasets for 10 languages. Proc. Interspeech 2019, 1566–1570. <a href='https://doi.org/10.21437/Interspeech.2019-1500' target='_blank'>https://doi.org/10.21437/Interspeech.2019-1500</a>."

# Extra CSS: center the auto-generated <h1> page title.
css = """
h1 {
    text-align: center;
    display:block;
}
"""

def load_models(config_path, model_name, device = "cpu"):
    """Load a model's YAML configs, its checkpoint, and the matching vocoder.

    Args:
        config_path: directory (relative to CWD) containing ``preprocess.yaml``,
            ``model.yaml`` and ``train.yaml``. Note the original code string-
            concatenates, so the path must end with a separator (e.g. "config/Frysk_CV/").
        model_name: checkpoint filename under ``assets/model/``.
        device: torch device string the model/vocoder are loaded onto.

    Returns:
        (configs, model, vocoder) where configs is the tuple
        (preprocess_config, model_config, train_config).
    """
    def _read_yaml(filename):
        # Use a context manager so the file handle is closed; the previous
        # version called open(...) inline and leaked the handles.
        with open(os.path.join(os.getcwd(), config_path + filename), "r") as f:
            return yaml.load(f, Loader=yaml.FullLoader)

    preprocess_config = _read_yaml("preprocess.yaml")
    model_config = _read_yaml("model.yaml")
    train_config = _read_yaml("train.yaml")
    configs = (preprocess_config, model_config, train_config)

    # Load the acoustic model checkpoint and its vocoder (project helpers).
    model = get_model_infer(os.path.join(os.getcwd(), "assets/model/" + model_name), configs, device)
    vocoder = get_vocoder(model_config, device)
    return configs, model, vocoder

# Load both models once at import time so every request reuses them.
# Single-speaker model: transfer learning from Dutch, 20 minutes of Frisian data.
configs_single, model_single, vocoder_single = load_models(config_path = "config/Frysk_CV_speaker_29/",
                                                           model_name = "Frysk_CV_speaker_29_350000.pth.tar")
# Multi-speaker model: trained from scratch on Common Voice Frisian data.
configs_multi, model_multi, vocoder_multi = load_models(config_path = "config/Frysk_CV/",
                                                        model_name = "Frysk_CV_300000.pth.tar")

@spaces.GPU(duration=20)
def infer(text, speaker_ID, configs, model, vocoder):
    """Synthesize `text` for one speaker and return (sample_rate, audio ndarray).

    Args:
        text: raw Frisian input text.
        speaker_ID: internal (model) speaker index, coercible to int.
        configs: (preprocess_config, model_config, train_config) tuple.
        model, vocoder: objects loaded by load_models().

    Returns:
        (sr, audio) as produced by scipy.io.wavfile.read, suitable for a
        gr.Audio(type="numpy") output.

    Raises:
        RuntimeError: if the synthesis output file never appears.
    """
    model.to('cuda')
    vocoder.to('cuda')

    result_path = os.path.join(os.getcwd(), 'assets/audio/result.wav')
    # Remove any result left over from a previous request so the polling loop
    # below cannot pick up stale (or partially overwritten) audio.
    if os.path.exists(result_path):
        os.remove(result_path)

    # Build a single-item batch in the shape synthesize.synthesize expects.
    speakers = np.array([int(speaker_ID)])
    texts = np.array([synthesize.preprocess_frysk(text, configs[0])])
    text_lens = np.array([len(texts[0])])

    ids = raw_texts = [text[:100]]
    batchs = [(ids, raw_texts, speakers, texts, text_lens, max(text_lens))]

    # NOTE(review): synthesize.synthesize presumably writes result_path;
    # the poll below guards against it being asynchronous — TODO confirm.
    synthesize.synthesize(model, "step", configs, vocoder, batchs, (1.0, 1.0, 1.0))

    # Bounded wait: the original looped forever if the file never appeared
    # (e.g. synthesis failed silently).
    deadline = time.time() + 60.0
    while not os.path.exists(result_path):
        if time.time() > deadline:
            raise RuntimeError("Synthesis timed out: assets/audio/result.wav was not produced.")
        time.sleep(0.01)

    sr, audio = read(result_path)
    return sr, audio

# main inference function (single-speaker tab)
@spaces.GPU(duration=20)
def run_single(text):
    """Synthesize `text` with the single-speaker model (speaker index fixed at 0)."""
    # NOTE(review): `infer` is itself decorated with @spaces.GPU, so the
    # decoration here is presumably redundant — kept for compatibility.
    # The original also assigned a dead local `speaker_ID = 0` that was
    # never used; the literal 0 below is what was actually passed.
    return infer(text, 0, configs_single, model_single, vocoder_single)

@spaces.GPU(duration=20)
def run_multi(text, speaker_ID):
    """Synthesize `text` with the multi-speaker model.

    `speaker_ID` is the 1-based value from the UI slider (1 to 30); it is
    translated to the model's internal speaker index before inference.
    """
    # Internal speaker indices in slider order (position 1 first).
    internal_ids = (0, 11, 22, 24, 25, 26, 27, 28, 29, 1,
                    2, 3, 4, 5, 6, 7, 8, 9, 10, 12,
                    13, 14, 15, 16, 17, 18, 19, 20, 21, 23)
    slider_to_internal = dict(zip(range(1, 31), internal_ids))
    return infer(text, slider_to_internal[speaker_ID], configs_multi, model_multi, vocoder_multi)

# Single-speaker tab. Each example row must have exactly as many values as
# there are input components (one textbox here); the original rows carried a
# stray second column (a speaker ID copied from the multi-speaker tab), which
# Gradio rejects when validating examples against the inputs.
iface_single = gr.Interface(fn=run_single,
                     inputs=[
                            gr.components.Textbox(lines=3, placeholder="Please input Frisian text to synthesize", label='Text to synthesize'),
                            ],
                     outputs=gr.components.Audio(type="numpy", label='Synthesized speech'),
                     title=title,
                     description=description,
                     article=article,
                     css=css,
                     theme='huggingface',
                     examples=[
                            ["Praat mar Frysk!"],
                            ["Bûter, brea en griene tsiis, wa’t dat net sizze kin, is gjin oprjochte Fries!"],
                            ["In lyk man is in ryk man."],
                            ["As pake it net meitsje kin, dan slagget it gjinien!"],
                            ["As it net kin sa’t it moat, dan moat it mar sa’t it kin."],
                            ["Elk sprekt fan myn sûpen, mar nimmen wit fan myn toarst!"],
                            ["Pikerje net it komt dochs oars."]
                              ],
                     )

# Multi-speaker tab: two inputs (text + 1-based speaker ID slider), so each
# example row below correctly carries two values.
iface_multi = gr.Interface(fn=run_multi,
                     inputs=[
                            gr.components.Textbox(lines=3, placeholder="Please input Frisian text to synthesize", label='Text to synthesize'),
                            gr.components.Slider(minimum=1, maximum=30, step=1, label="Speaker ID (1 to 30)") 
                            ],
                     outputs=gr.components.Audio(type="numpy", label='Synthesized speech'),
                     title=title,
                     description=description,
                     article=article,
                     css=css,
                     theme='huggingface',
                     examples=[
                            ["Praat mar Frysk!", 1],
                            ["Bûter, brea en griene tsiis, wa’t dat net sizze kin, is gjin oprjochte Fries!", 5],
                            ["In lyk man is in ryk man.", 11],
                            ["As pake it net meitsje kin, dan slagget it gjinien!", 17],
                            ["As it net kin sa’t it moat, dan moat it mar sa’t it kin.", 20],
                            ["Elk sprekt fan myn sûpen, mar nimmen wit fan myn toarst!", 25],
                            ["Pikerje net it komt dochs oars.", 29]
                              ],
                     )

# Combine both interfaces into one tabbed app.
demo = gr.TabbedInterface([iface_single, iface_multi], ["Single-speaker (transfer learning)", "Multi-speaker (trained from scratch)"])

if __name__ == "__main__":
    # allowed_paths exposes ./assets (e.g. the header image) to the frontend;
    # port 7860 / 0.0.0.0 is the standard Hugging Face Spaces configuration.
    demo.launch(share=False, inline=False, server_name="0.0.0.0", server_port=7860, allowed_paths=["/", "./assets"], ssr_mode=False)