import torch
import os

import commons
import utils
from models import SynthesizerTrn
from text.symbols import symbols
from text import text_to_sequence

from scipy.io.wavfile import write
import gradio as gr

print("Running GRadio", gr.__version__)

# Checkpoint + inference config for the Catalan multispeaker model.
model_path = "vits2_pytorch/G_390000.pth"
config_path = "vits2_pytorch/vits2_vctk_cat_inference.json"

hps = utils.get_hparams_from_file(config_path)

# VITS2 configs carry `use_mel_posterior_encoder`; when absent or false we
# fall back to the VITS1 linear-spectrogram posterior.
if (
    "use_mel_posterior_encoder" in hps.model.keys()
    and hps.model.use_mel_posterior_encoder
):
    print("Using mel posterior encoder for VITS2")
    posterior_channels = 80  # mel-spectrogram channels
    hps.data.use_mel_posterior_encoder = True
else:
    print("Using lin posterior encoder for VITS1")
    posterior_channels = hps.data.filter_length // 2 + 1  # linear-spec bins
    hps.data.use_mel_posterior_encoder = False

# Build the synthesizer once at import time and switch it to eval mode;
# this demo only runs inference.
net_g = SynthesizerTrn(
    len(symbols),
    posterior_channels,
    hps.train.segment_size // hps.data.hop_length,
    n_speakers=hps.data.n_speakers,
    **hps.model,
)
_ = net_g.eval()

_ = utils.load_checkpoint(model_path, net_g, None)
|
def get_text(text, hps):
    """Normalize *text* into a 1-D ``torch.LongTensor`` of symbol ids.

    Applies the cleaners configured in ``hps.data.text_cleaners`` and,
    when ``hps.data.add_blank`` is set, intersperses blank tokens (id 0)
    between symbols as the model was trained with.
    """
    sequence = text_to_sequence(text, hps.data.text_cleaners)
    if hps.data.add_blank:
        sequence = commons.intersperse(sequence, 0)
    return torch.LongTensor(sequence)
| |
|
def tts(text: str, speaker_id: int, speed: float, noise_scale: float = 0.667, noise_scale_w: float = 0.8):
    """Synthesize *text* with the loaded VITS2 model.

    Args:
        text: Input sentence to synthesize.
        speaker_id: Integer speaker index the model was trained with.
        speed: Playback speed multiplier (>1 is faster); inverted into
            the model's ``length_scale``.
        noise_scale: Sampling noise for the prior.
        noise_scale_w: Sampling noise for the stochastic duration predictor.

    Returns:
        A ``(sample_rate, waveform)`` tuple, which ``gr.Audio`` accepts
        directly. (``gr.make_waveform`` is deprecated and removed in
        Gradio 5, and it produced a video, not audio.)
    """
    stn_tst = get_text(text, hps)
    with torch.no_grad():
        x_tst = stn_tst.unsqueeze(0)
        x_tst_lengths = torch.LongTensor([stn_tst.size(0)])
        sid = torch.LongTensor([speaker_id])
        waveform = (
            net_g.infer(
                x_tst,
                x_tst_lengths,
                sid=sid,
                noise_scale=noise_scale,
                noise_scale_w=noise_scale_w,
                length_scale=1 / speed,  # higher speed -> shorter durations
            )[0][0, 0]
            .data.cpu()
            .float()
            .numpy()
        )
    # Prefer the training sampling rate from the config over a magic number.
    sampling_rate = getattr(hps.data, "sampling_rate", 22050)
    return (sampling_rate, waveform)
| |
|
| | |
| |
|
# --- Gradio UI -------------------------------------------------------------

title = """
<div style="text-align: center; max-width: 700px; margin: 0 auto;">
  <div
    style="display: inline-flex; align-items: center; gap: 0.8rem; font-size: 1.75rem;"
  > <h1 style="font-weight: 900; margin-bottom: 7px; line-height: normal;">
      VITS2 TTS Catalan Demo
    </h1> </div>
</div>
"""

description = """
VITS2 is an end-to-end speech synthesis model that predicts a speech waveform conditional on an input text sequence. VITS2 improved the
training and inference efficiency and naturalness by introducing adversarial learning into the duration predictor. The transformer
block was added to the normalizing flows to capture the long-term dependency when transforming the distribution.
The synthesis quality was improved by incorporating Gaussian noise into the alignment search.

This model is being trained in openslr69 and festcat datasets
"""

article = "Model by Jungil Kong, et al. from SK telecom. Demo by BSC."

vits2_inference = gr.Interface(
    fn=tts,
    inputs=[
        gr.Textbox(
            value="m'ha costat desenvolupar molt una veu, i ara que la tinc no estaré en silenci.",
            max_lines=1,
            label="Input text",
        ),
        gr.Slider(
            1,
            47,
            value=10,
            step=1,
            label="Speaker id",
            # Plain string: the original used an f-string with no placeholders.
            info="This model is trained on 47 speakers. You can prompt the model using one of these speaker ids.",
        ),
        gr.Slider(
            0.5,
            1.5,
            value=1,
            step=0.1,
            label="Speed",
        ),
        gr.Slider(
            0.2,
            2.0,
            value=0.667,
            step=0.01,
            label="Noise scale",
        ),
        gr.Slider(
            0.2,
            2.0,
            value=0.8,
            step=0.01,
            label="Noise scale w",
        ),
    ],
    outputs=gr.Audio(),
)

demo = gr.Blocks()

with demo:
    gr.Markdown(title)
    gr.Markdown(description)
    gr.TabbedInterface([vits2_inference], ["Multispeaker"])
    gr.Markdown(article)

# Bound the request queue and serve on all interfaces for container use.
demo.queue(max_size=10)
demo.launch(show_api=False, server_name="0.0.0.0", server_port=7860, ssr_mode=False)