Spaces:
Runtime error
Runtime error
| # Copyright (C) 2021. Huawei Technologies Co., Ltd. All rights reserved. | |
| # This program is free software; you can redistribute it and/or modify | |
| # it under the terms of the MIT License. | |
| # This program is distributed in the hope that it will be useful, | |
| # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
| # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
| # MIT License for more details. | |
| import argparse | |
| import json | |
| import datetime as dt | |
| import numpy as np | |
| from scipy.io.wavfile import write | |
| import torch | |
| import params | |
| from model import GradTTS | |
| from text import text_to_sequence, cmudict | |
| from text.symbols import symbols | |
| from utils import intersperse | |
| import sys | |
| sys.path.append('./hifi-gan/') | |
| from env import AttrDict | |
| from models import Generator as HiFiGAN | |
| HIFIGAN_CONFIG = './checkpts/hifigan-config.json' | |
| HIFIGAN_CHECKPT = './checkpts/hifigan.pt' | |
def float_to_int16_pcm(waveform):
    """Convert a float waveform to 16-bit PCM samples.

    Samples are scaled by 32768 and clipped to the int16 range *before* the
    cast. Clipping after scaling matters: a full-scale positive sample (1.0)
    scales to 32768, which does not fit in int16 and would otherwise wrap
    around on the cast.

    Args:
        waveform: Array-like of floats, nominally in [-1, 1]. Values outside
            that range saturate at the int16 limits.

    Returns:
        A ``numpy.ndarray`` of dtype ``int16`` with the same shape as the input.
    """
    scaled = np.asarray(waveform, dtype=np.float32) * 32768.0
    return np.clip(scaled, -32768.0, 32767.0).astype(np.int16)


if __name__ == '__main__':
    # Local import: only the output-directory setup below needs it.
    import os

    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--file', type=str, required=True, help='path to a file with texts to synthesize')
    parser.add_argument('-c', '--checkpoint', type=str, required=True, help='path to a checkpoint of Grad-TTS')
    parser.add_argument('-t', '--timesteps', type=int, required=False, default=10, help='number of timesteps of reverse diffusion')
    parser.add_argument('-s', '--speaker_id', type=int, required=False, default=None, help='speaker id for multispeaker model')
    args = parser.parse_args()

    if args.speaker_id is not None:
        # Explicit check instead of `assert`, which is stripped under `python -O`.
        if params.n_spks <= 1:
            parser.error("Ensure you set right number of speakers in `params.py`.")
        spk = torch.LongTensor([args.speaker_id]).cuda()
    else:
        spk = None

    print('Initializing Grad-TTS...')
    generator = GradTTS(len(symbols)+1, params.n_spks, params.spk_emb_dim,
                        params.n_enc_channels, params.filter_channels,
                        params.filter_channels_dp, params.n_heads, params.n_enc_layers,
                        params.enc_kernel, params.enc_dropout, params.window_size,
                        params.n_feats, params.dec_dim, params.beta_min, params.beta_max, params.pe_scale)
    # Deserialize onto the CPU first; the model is moved to the GPU right after.
    generator.load_state_dict(torch.load(args.checkpoint, map_location='cpu'))
    _ = generator.cuda().eval()
    print(f'Number of parameters: {generator.nparams}')

    print('Initializing HiFi-GAN...')
    with open(HIFIGAN_CONFIG) as f:
        h = AttrDict(json.load(f))
    vocoder = HiFiGAN(h)
    vocoder.load_state_dict(torch.load(HIFIGAN_CHECKPT, map_location='cpu')['generator'])
    _ = vocoder.cuda().eval()
    vocoder.remove_weight_norm()

    # One text to synthesize per line of the input file.
    with open(args.file, 'r', encoding='utf-8') as f:
        texts = [line.strip() for line in f]
    cmu = cmudict.CMUDict('./resources/cmu_dictionary')

    # Make sure the output folder exists so the first `write` cannot fail on it.
    os.makedirs('./out', exist_ok=True)

    with torch.no_grad():
        for i, text in enumerate(texts):
            print(f'Synthesizing {i} text...', end=' ')
            # `len(symbols)` is the extra id interspersed between the text symbols
            # (the model above is built with len(symbols)+1 ids); [None] adds a
            # leading batch dimension of size 1.
            x = torch.LongTensor(intersperse(text_to_sequence(text, dictionary=cmu), len(symbols))).cuda()[None]
            x_lengths = torch.LongTensor([x.shape[-1]]).cuda()

            t = dt.datetime.now()
            y_enc, y_dec, attn = generator.forward(x, x_lengths, n_timesteps=args.timesteps, temperature=1.5,
                                                   stoc=False, spk=spk, length_scale=0.91)
            t = (dt.datetime.now() - t).total_seconds()
            # Real-time factor: synthesis time over generated audio duration.
            # 22050 is the sample rate used for the output file below; 256 is
            # presumably the number of audio samples per frame of y_dec
            # (TODO confirm against the feature-extraction settings).
            print(f'Grad-TTS RTF: {t * 22050 / (y_dec.shape[-1] * 256)}')

            # The helper saturates out-of-range samples, so no separate clamp is needed.
            audio = float_to_int16_pcm(vocoder.forward(y_dec).cpu().squeeze().numpy())

            write(f'./out/sample_{i}.wav', 22050, audio)

    print('Done. Check out `out` folder for samples.')