RNNoise / models /WaveRNNModel /quick_start.py

RNNoise (models)

2e62044 verified 2 months ago

4.83 kB

	import torch
	from models.fatchord_version import WaveRNN
	from utils import hparams as hp
	from utils.text.symbols import symbols
	from models.tacotron import Tacotron
	import argparse
	from utils.text import text_to_sequence
	from utils.display import save_attention, simple_table
	import zipfile, os


	os.makedirs('quick_start/tts_weights/', exist_ok=True)
	os.makedirs('quick_start/voc_weights/', exist_ok=True)

	zip_ref = zipfile.ZipFile('pretrained/ljspeech.wavernn.mol.800k.zip', 'r')
	zip_ref.extractall('quick_start/voc_weights/')
	zip_ref.close()

	zip_ref = zipfile.ZipFile('pretrained/ljspeech.tacotron.r2.180k.zip', 'r')
	zip_ref.extractall('quick_start/tts_weights/')
	zip_ref.close()


	if __name__ == "__main__":

	# Parse Arguments
	parser = argparse.ArgumentParser(description='TTS Generator')
	parser.add_argument('--input_text', '-i', type=str, help='[string] Type in something here and TTS will generate it!')
	parser.add_argument('--batched', '-b', dest='batched', action='store_true', help='Fast Batched Generation (lower quality)')
	parser.add_argument('--unbatched', '-u', dest='batched', action='store_false', help='Slower Unbatched Generation (better quality)')
	parser.add_argument('--force_cpu', '-c', action='store_true', help='Forces CPU-only training, even when in CUDA capable environment')
	parser.add_argument('--hp_file', metavar='FILE', default='hparams.py',
	help='The file to use for the hyperparameters')
	args = parser.parse_args()

	hp.configure(args.hp_file) # Load hparams from file

	parser.set_defaults(batched=True)
	parser.set_defaults(input_text=None)

	batched = args.batched
	input_text = args.input_text

	if not args.force_cpu and torch.cuda.is_available():
	device = torch.device('cuda')
	else:
	device = torch.device('cpu')
	print('Using device:', device)

	print('\nInitialising WaveRNN Model...\n')

	# Instantiate WaveRNN Model
	voc_model = WaveRNN(rnn_dims=hp.voc_rnn_dims,
	fc_dims=hp.voc_fc_dims,
	bits=hp.bits,
	pad=hp.voc_pad,
	upsample_factors=hp.voc_upsample_factors,
	feat_dims=hp.num_mels,
	compute_dims=hp.voc_compute_dims,
	res_out_dims=hp.voc_res_out_dims,
	res_blocks=hp.voc_res_blocks,
	hop_length=hp.hop_length,
	sample_rate=hp.sample_rate,
	mode='MOL').to(device)

	voc_model.load('quick_start/voc_weights/latest_weights.pyt')

	print('\nInitialising Tacotron Model...\n')

	# Instantiate Tacotron Model
	tts_model = Tacotron(embed_dims=hp.tts_embed_dims,
	num_chars=len(symbols),
	encoder_dims=hp.tts_encoder_dims,
	decoder_dims=hp.tts_decoder_dims,
	n_mels=hp.num_mels,
	fft_bins=hp.num_mels,
	postnet_dims=hp.tts_postnet_dims,
	encoder_K=hp.tts_encoder_K,
	lstm_dims=hp.tts_lstm_dims,
	postnet_K=hp.tts_postnet_K,
	num_highways=hp.tts_num_highways,
	dropout=hp.tts_dropout,
	stop_threshold=hp.tts_stop_threshold).to(device)


	tts_model.load('quick_start/tts_weights/latest_weights.pyt')

	if input_text:
	inputs = [text_to_sequence(input_text.strip(), hp.tts_cleaner_names)]
	else:
	with open('sentences.txt') as f:
	inputs = [text_to_sequence(l.strip(), hp.tts_cleaner_names) for l in f]

	voc_k = voc_model.get_step() // 1000
	tts_k = tts_model.get_step() // 1000

	r = tts_model.r

	simple_table([('WaveRNN', str(voc_k) + 'k'),
	(f'Tacotron(r={r})', str(tts_k) + 'k'),
	('Generation Mode', 'Batched' if batched else 'Unbatched'),
	('Target Samples', 11_000 if batched else 'N/A'),
	('Overlap Samples', 550 if batched else 'N/A')])

	for i, x in enumerate(inputs, 1):

	print(f'\n\| Generating {i}/{len(inputs)}')
	_, m, attention = tts_model.generate(x)

	if input_text:
	save_path = f'quick_start/__input_{input_text[:10]}_{tts_k}k.wav'
	else:
	save_path = f'quick_start/{i}_batched{str(batched)}_{tts_k}k.wav'

	# save_attention(attention, save_path)

	m = torch.tensor(m).unsqueeze(0)
	m = (m + 4) / 8

	voc_model.generate(m, save_path, batched, 11_000, 550, hp.mu_law)

	print('\n\nDone.\n')