import torch
import librosa
import numpy as np
from pathlib import Path
from typing import Union, List
from pypinyin import lazy_pinyin, Style
from .hparams import hparams as hp
from .utils.symbols import symbols
from .models.tacotron import Tacotron
from .utils.text import text_to_sequence
from .utils.logmmse import denoise, profile_noise
from ..log import logger


class Synthesizer:
    def __init__(self, model_path: Path):
        # Check for GPU
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        logger.info(f"Synthesizer using device: {self.device}")

        # Instantiate the Tacotron model and move it to the selected device
        self._model = Tacotron(
            embed_dims=hp.tts_embed_dims,
            num_chars=len(symbols),
            encoder_dims=hp.tts_encoder_dims,
            decoder_dims=hp.tts_decoder_dims,
            n_mels=hp.num_mels,
            fft_bins=hp.num_mels,
            postnet_dims=hp.tts_postnet_dims,
            encoder_K=hp.tts_encoder_K,
            lstm_dims=hp.tts_lstm_dims,
            postnet_K=hp.tts_postnet_K,
            num_highways=hp.tts_num_highways,
            dropout=hp.tts_dropout,
            stop_threshold=hp.tts_stop_threshold,
            speaker_embedding_size=hp.speaker_embedding_size,
        ).to(self.device)

        # Load the checkpoint weights and switch to inference mode
        self._model.load(model_path, self.device)
        self._model.eval()
        logger.info(
            'Loaded synthesizer "%s" trained to step %d'
            % (model_path.name, self._model.state_dict()["step"])
        )

    def synthesize_spectrograms(
        self,
        texts: List[str],
        embeddings: Union[np.ndarray, List[np.ndarray]],
        return_alignments=False,
        style_idx=0,
        min_stop_token=5,
        steps=2000,
    ):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, a matrix representing the alignments between the
            characters and each decoder output step will be returned for each spectrogram
        :return: a list of N mel spectrograms as numpy arrays of shape (80, Mi), where Mi is
            the sequence length of spectrogram i, and possibly the alignments.
        """
        logger.debug("Read " + str(texts))
        # Convert Chinese text to pinyin with numeric tones (neutral tone written as "5")
        texts = [
            " ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True))
            for v in texts
        ]
        logger.debug("Synthesizing " + str(texts))

        # Preprocess text inputs
        inputs = [text_to_sequence(text, hp.tts_cleaner_names) for text in texts]
        if not isinstance(embeddings, list):
            embeddings = [embeddings]

        # Batch inputs
        batched_inputs = [
            inputs[i : i + hp.synthesis_batch_size]
            for i in range(0, len(inputs), hp.synthesis_batch_size)
        ]
        batched_embeds = [
            embeddings[i : i + hp.synthesis_batch_size]
            for i in range(0, len(embeddings), hp.synthesis_batch_size)
        ]

        specs = []
        alignments = []
        for i, batch in enumerate(batched_inputs, 1):
            logger.debug(f"\n| Generating {i}/{len(batched_inputs)}")

            # Pad texts so they are all the same length
            text_lens = [len(text) for text in batch]
            max_text_len = max(text_lens)
            chars = [pad1d(text, max_text_len) for text in batch]
            chars = np.stack(chars)

            # Stack speaker embeddings into a 2D array for batch processing
            speaker_embeds = np.stack(batched_embeds[i - 1])

            # Convert to tensors
            chars = torch.tensor(chars).long().to(self.device)
            speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)

            # Inference; collect each batch's alignments rather than overwriting
            # the accumulator on every iteration
            _, mels, batch_alignments = self._model.generate(
                chars,
                speaker_embeddings,
                style_idx=style_idx,
                min_stop_token=min_stop_token,
                steps=steps,
            )
            alignments.append(batch_alignments)

            mels = mels.detach().cpu().numpy()
            for m in mels:
                # Trim silence from the end of each spectrogram
                while np.max(m[:, -1]) < hp.tts_stop_threshold:
                    m = m[:, :-1]
                specs.append(m)

        logger.debug("\n\nDone.\n")
        return (specs, alignments) if return_alignments else specs
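
# Usage sketch (illustrative, not part of the original module): the checkpoint
# path and the embedding below are hypothetical placeholders; a real speaker
# embedding would come from the project's companion speaker encoder.
#
#   synthesizer = Synthesizer(Path("saved_models/synthesizer.pt"))
#   embed = np.random.rand(256).astype(np.float32)
#   specs = synthesizer.synthesize_spectrograms(["你好，世界"], [embed])
#   mel = specs[0]  # numpy array of shape (80, M0)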


def load_preprocess_wav(fpath):
    """
    Loads and preprocesses an audio file under the same conditions as the audio files
    that were used to train the synthesizer.
    """
    wav = librosa.load(path=str(fpath), sr=hp.sample_rate)[0]
    if hp.rescale:
        wav = wav / np.abs(wav).max() * hp.rescaling_max

    # Denoise, but only if the clip is longer than 0.4 s, so that 0.15 s from each
    # end can be treated as a noise-only profile
    if len(wav) > hp.sample_rate * (0.3 + 0.1):
        noise_wav = np.concatenate(
            [
                wav[: int(hp.sample_rate * 0.15)],
                wav[-int(hp.sample_rate * 0.15) :],
            ]
        )
        profile = profile_noise(noise_wav, hp.sample_rate)
        wav = denoise(wav, profile)
    return wav


def pad1d(x, max_len, pad_value=0):
    # Right-pad a 1D sequence with pad_value up to max_len
    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)
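

if __name__ == "__main__":
    # Minimal end-to-end sketch under stated assumptions: the checkpoint path and
    # the reference wav are hypothetical, and a random vector stands in for a real
    # speaker embedding. Because this module uses relative imports, run it as a
    # package module (python -m <package>.inference), not as a standalone script.
    synthesizer = Synthesizer(Path("saved_models/synthesizer.pt"))  # hypothetical path
    _ = load_preprocess_wav("reference.wav")  # hypothetical file; shown for the preprocessing step
    fake_embed = np.random.rand(hp.speaker_embedding_size).astype(np.float32)
    mel_specs = synthesizer.synthesize_spectrograms(["你好，世界"], [fake_embed])
    print(f"Generated {len(mel_specs)} spectrogram(s); first shape: {mel_specs[0].shape}")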