# Frisian TTS demo app (HuggingFace Spaces, running on ZeroGPU).
# Third-party and project-local imports.
import spaces
import gradio as gr
import numpy as np
# NOTE(review): subprocess and re appear unused in this file; torch may be
# imported for its CUDA initialization side effects — confirm before removing.
import os, subprocess, time, torch, yaml, re
import synthesize
# NOTE(review): describe_option is never used — looks like a leftover import.
from pandas import describe_option
from scipy.io.wavfile import read
from utils.model import get_model_infer, get_vocoder
# Close any Gradio apps left running from a previous (re)load of this module.
gr.close_all()
# Static page copy shown by both Gradio interfaces (title/description above
# the inputs, article below the outputs) plus a small CSS override.
title = "Open-source and open-access Frisian TTS<br>(from Phat Do's PhD research)"
# HTML blurb rendered under the title; the image is served from ./assets via
# the allowed_paths passed to demo.launch().
description = """
<center>
<img src='/gradio_api/file=assets/Friesland.png' width=300px><br>
These are the prototype Frisian synthetic speech models intended to showcase the findings of my PhD research, which are described in Section 7.6 of the dissertation.<br>
The single-speaker model was trained on only <strong>20 minutes</strong> of data (recordings) using transfer learning from Dutch data. The multi-speaker model was trained from scratch on <strong>32 hours</strong> of Frisian data (from <a href='https://commonvoice.mozilla.org/fy-NL' target='_blank'>Mozilla Common Voice</a>).<br><br>
Please select the model's tab above, enter any Frisian text (or choose from the examples for convenience), select a speaker ID below (for the multi-speaker model), and click on Submit to synthesize the speech.<br>
</center>
"""
# Footer with contact info and data/tooling acknowledgements.
article = "<center>This is a basic demo version intended to accompany my PhD dissertation, showcasing the effects of several findings included in the research (please see the dissertation for details).<br> Due to hosting constraints, the model may be rather slow in synthesizing. Thank you for your patience!<br> Please reach out to <a href='https://www.rug.nl/staff/t.p.do/' target='_blank'>Phat Do</a> or email me at <a href='mailto:t.p.do@rug.nl' target='_blank'>t.p.do@rug.nl</a> if you are interested in knowing more!</center><br><br>To phonetically transcribe the input Frisian text, this work gratefully uses the G2P Frysk model and script kindly provided by the Fryske Akademy:<br> Heeringa, Wilbert & Drenth, Eduard & Van de Velde, Hans (2024). G2P Frysk [computer program]. Retrieved 4 July 2024 from <a href='https://www.fa.knaw.nl/fa-apps/graph2phon/' target='_blank'>https://www.fa.knaw.nl/fa-apps/graph2phon/</a>.<br><br>The Frisian training data is extracted from Mozilla Common Voice and the Dutch data from CSS10:<br>Ardila, R., Branson, M., Davis, K., Kohler, M., Meyer, J., Henretty, M., Morais, R., Saunders, L., Tyers, F., & Weber, G. (2020). <a href='https://commonvoice.mozilla.org/fy-NL' target='_blank'>Common Voice</a>: A Massively-Multilingual Speech Corpus. Proceedings of the 12th LREC, 4218–4222. <br>Park, K., & Mulc, T. (2019). CSS10: A collection of single speaker speech datasets for 10 languages. Proc. Interspeech 2019, 1566–1570. <a href='https://doi.org/10.21437/Interspeech.2019-1500' target='_blank'>https://doi.org/10.21437/Interspeech.2019-1500</a>."
# Center the page title.
css = """
h1 {
text-align: center;
display:block;
}
"""
def load_models(config_path, model_name, device="cpu"):
    """Load a FastSpeech-style model, its vocoder, and its three YAML configs.

    Args:
        config_path: Directory (relative to the CWD, including the trailing
            slash) that holds preprocess.yaml, model.yaml and train.yaml.
        model_name: Checkpoint file name under assets/model/.
        device: Torch device string the model and vocoder are loaded onto.

    Returns:
        Tuple of ((preprocess_config, model_config, train_config), model,
        vocoder).
    """
    def _load_yaml(name):
        # Use a context manager so the file handle is closed deterministically
        # (the original left the open handles to the garbage collector).
        with open(os.path.join(os.getcwd(), config_path + name), "r") as f:
            return yaml.load(f, Loader=yaml.FullLoader)

    preprocess_config = _load_yaml("preprocess.yaml")
    model_config = _load_yaml("model.yaml")
    train_config = _load_yaml("train.yaml")
    configs = (preprocess_config, model_config, train_config)

    # Load the acoustic model from its checkpoint and the matching vocoder.
    model = get_model_infer(os.path.join(os.getcwd(), "assets/model/" + model_name), configs, device)
    vocoder = get_vocoder(model_config, device)
    return configs, model, vocoder
# Load both models once at import time so each request only pays inference
# cost. Single-speaker: transfer learning from Dutch, 20 min of Frisian data.
configs_single, model_single, vocoder_single = load_models(config_path = "config/Frysk_CV_speaker_29/",
                                                           model_name = "Frysk_CV_speaker_29_350000.pth.tar")
# Multi-speaker: trained from scratch on 32 h of Common Voice Frisian.
configs_multi, model_multi, vocoder_multi = load_models(config_path = "config/Frysk_CV/",
                                                        model_name = "Frysk_CV_300000.pth.tar")
@spaces.GPU(duration=20)
def infer(text, speaker_ID, configs, model, vocoder):
    """Synthesize `text` for one speaker and return (sample_rate, audio).

    The synthesis pipeline writes its output to assets/audio/result.wav as a
    side effect; this function polls for that file and reads it back.

    Args:
        text: Raw Frisian input text.
        speaker_ID: Internal speaker index of the model (int-convertible).
        configs: (preprocess_config, model_config, train_config) tuple.
        model: Acoustic model (moved to CUDA inside the GPU slice).
        vocoder: Vocoder model (moved to CUDA inside the GPU slice).

    Returns:
        (sr, audio) as read by scipy.io.wavfile.read — suitable for a Gradio
        Audio output of type "numpy".

    Raises:
        RuntimeError: If the output file never appears within the timeout.
    """
    result_path = os.path.join(os.getcwd(), 'assets/audio/result.wav')
    # Remove any stale output from a previous request: otherwise the polling
    # loop below would immediately pick up the OLD audio before the new
    # synthesis has finished writing.
    if os.path.exists(result_path):
        os.remove(result_path)

    # Models live on CPU between requests; move them onto the GPU slice.
    model.to('cuda')
    vocoder.to('cuda')

    speakers = np.array([int(speaker_ID)])
    texts = np.array([synthesize.preprocess_frysk(text, configs[0])])
    text_lens = np.array([len(texts[0])])
    # The (truncated) raw text doubles as the utterance id.
    ids = raw_texts = [text[:100]]
    batchs = [(ids, raw_texts, speakers, texts, text_lens, max(text_lens))]
    # (1.0, 1.0, 1.0) = neutral pitch/energy/duration control values.
    synthesize.synthesize(model, "step", configs, vocoder, batchs, (1.0, 1.0, 1.0))

    # Poll for the result file, but fail after 60 s instead of spinning
    # forever if synthesis silently errored out.
    deadline = time.time() + 60.0
    while not os.path.exists(result_path):
        if time.time() > deadline:
            raise RuntimeError("Synthesis timed out: assets/audio/result.wav was never written")
        time.sleep(0.01)
    sr, audio = read(result_path)
    return sr, audio
# Tab entry point for the single-speaker model.
@spaces.GPU(duration=20)
def run_single(text):
    """Synthesize `text` with the single-speaker model (speaker id fixed at 0)."""
    # The original assigned an unused local `speaker_ID = 0`; the constant is
    # passed directly instead.
    return infer(text, 0, configs_single, model_single, vocoder_single)
# Maps the user-facing speaker ID (1-30, as shown on the slider) to the
# model's internal speaker index. Hoisted to module level so the 30-entry
# dict is built once instead of on every request.
SPEAKER_ID_MAP = {1: 0, 2: 11, 3: 22, 4: 24, 5: 25, 6: 26, 7: 27, 8: 28, 9: 29, 10: 1, 11: 2, 12: 3, 13: 4, 14: 5, 15: 6, 16: 7, 17: 8, 18: 9, 19: 10, 20: 12, 21: 13, 22: 14, 23: 15, 24: 16, 25: 17, 26: 18, 27: 19, 28: 20, 29: 21, 30: 23}

# Tab entry point for the multi-speaker model.
@spaces.GPU(duration=20)
def run_multi(text, speaker_ID):
    """Synthesize `text` with the multi-speaker model for slider value `speaker_ID` (1-30)."""
    # int() guards against Gradio delivering the slider value as a float.
    return infer(text, SPEAKER_ID_MAP[int(speaker_ID)], configs_multi, model_multi, vocoder_multi)
# Single-speaker tab. NOTE: this interface has exactly ONE input component,
# so each example row must contain exactly one value — the original rows were
# two-element `[text, 0]` pairs, which mismatches the input count.
iface_single = gr.Interface(fn=run_single,
    inputs=[
        gr.components.Textbox(lines=3, placeholder="Please input Frisian text to synthesize", label='Text to synthesize'),
    ],
    outputs=gr.components.Audio(type="numpy", label='Synthesized speech'),
    title=title,
    description=description,
    article=article,
    css=css,
    theme='huggingface',
    examples=[
        ["Praat mar Frysk!"],
        ["Bûter, brea en griene tsiis, wa’t dat net sizze kin, is gjin oprjochte Fries!"],
        ["In lyk man is in ryk man."],
        ["As pake it net meitsje kin, dan slagget it gjinien!"],
        ["As it net kin sa’t it moat, dan moat it mar sa’t it kin."],
        ["Elk sprekt fan myn sûpen, mar nimmen wit fan myn toarst!"],
        ["Pikerje net it komt dochs oars."]
    ],
)
# Multi-speaker tab: same layout as the single-speaker one, plus a slider
# that selects one of the 30 Common Voice speakers. Each example row pairs a
# Frisian sentence with a speaker ID.
_multi_inputs = [
    gr.components.Textbox(lines=3, placeholder="Please input Frisian text to synthesize", label='Text to synthesize'),
    gr.components.Slider(minimum=1, maximum=30, step=1, label="Speaker ID (1 to 30)"),
]
_multi_examples = [
    ["Praat mar Frysk!", 1],
    ["Bûter, brea en griene tsiis, wa’t dat net sizze kin, is gjin oprjochte Fries!", 5],
    ["In lyk man is in ryk man.", 11],
    ["As pake it net meitsje kin, dan slagget it gjinien!", 17],
    ["As it net kin sa’t it moat, dan moat it mar sa’t it kin.", 20],
    ["Elk sprekt fan myn sûpen, mar nimmen wit fan myn toarst!", 25],
    ["Pikerje net it komt dochs oars.", 29],
]
iface_multi = gr.Interface(
    fn=run_multi,
    inputs=_multi_inputs,
    outputs=gr.components.Audio(type="numpy", label='Synthesized speech'),
    title=title,
    description=description,
    article=article,
    css=css,
    theme='huggingface',
    examples=_multi_examples,
)
# Combine both interfaces into a single tabbed demo.
demo = gr.TabbedInterface([iface_single, iface_multi], ["Single-speaker (transfer learning)", "Multi-speaker (trained from scratch)"])
if __name__ == "__main__":
    # allowed_paths lets Gradio serve the static image/audio assets referenced
    # in the description HTML; port 7860 is the HF Spaces convention.
    demo.launch(share=False, inline=False, server_name="0.0.0.0", server_port=7860, allowed_paths=["/", "./assets"], ssr_mode=False)