import torch
import librosa
import numpy as np
from pathlib import Path
from typing import Union, List
from pypinyin import lazy_pinyin, Style
from .hparams import hparams as hp
from .utils.symbols import symbols
from .models.tacotron import Tacotron
from .utils.text import text_to_sequence
from .utils.logmmse import denoise, profile_noise
from ..log import logger


class Synthesizer:
    def __init__(self, model_path: Path):
        # Check for GPU
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        logger.info(f"Synthesizer using device: {self.device}")

        # Instantiate the Tacotron model and move it to the selected device
        self._model = Tacotron(
            embed_dims=hp.tts_embed_dims,
            num_chars=len(symbols),
            encoder_dims=hp.tts_encoder_dims,
            decoder_dims=hp.tts_decoder_dims,
            n_mels=hp.num_mels,
            fft_bins=hp.num_mels,
            postnet_dims=hp.tts_postnet_dims,
            encoder_K=hp.tts_encoder_K,
            lstm_dims=hp.tts_lstm_dims,
            postnet_K=hp.tts_postnet_K,
            num_highways=hp.tts_num_highways,
            dropout=hp.tts_dropout,
            stop_threshold=hp.tts_stop_threshold,
            speaker_embedding_size=hp.speaker_embedding_size,
        ).to(self.device)

        # Load the checkpoint weights and switch to inference mode
        self._model.load(model_path, self.device)
        self._model.eval()
        logger.info(
            'Loaded synthesizer "%s" trained to step %d'
            % (model_path.name, self._model.state_dict()["step"])
        )

    def synthesize_spectrograms(
        self,
        texts: List[str],
        embeddings: Union[np.ndarray, List[np.ndarray]],
        return_alignments=False,
        style_idx=0,
        min_stop_token=5,
        steps=2000,
    ):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, a matrix representing the alignments between the
            characters and each decoder output step will be returned for each spectrogram
        :return: a list of N mel spectrograms as numpy arrays of shape (80, Mi), where Mi is
            the sequence length of spectrogram i, and possibly the alignments.
        """
        logger.debug("Read " + str(texts))
        # Convert Chinese text to pinyin with numeric tones (neutral tone written as "5")
        texts = [
            " ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True))
            for v in texts
        ]
        logger.debug("Synthesizing " + str(texts))

        # Preprocess text inputs
        inputs = [text_to_sequence(text, hp.tts_cleaner_names) for text in texts]
        if not isinstance(embeddings, list):
            embeddings = [embeddings]

        # Batch inputs
        batched_inputs = [
            inputs[i : i + hp.synthesis_batch_size]
            for i in range(0, len(inputs), hp.synthesis_batch_size)
        ]
        batched_embeds = [
            embeddings[i : i + hp.synthesis_batch_size]
            for i in range(0, len(embeddings), hp.synthesis_batch_size)
        ]

        specs = []
        alignments = []
        for i, batch in enumerate(batched_inputs, 1):
            logger.debug(f"\n| Generating {i}/{len(batched_inputs)}")

            # Pad texts so they are all the same length
            text_lens = [len(text) for text in batch]
            max_text_len = max(text_lens)
            chars = [pad1d(text, max_text_len) for text in batch]
            chars = np.stack(chars)

            # Stack speaker embeddings into a 2D array for batch processing
            speaker_embeds = np.stack(batched_embeds[i - 1])

            # Convert to tensors
            chars = torch.tensor(chars).long().to(self.device)
            speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)

            # Inference; collect each batch's alignments rather than overwriting
            # the accumulator on every iteration
            _, mels, batch_alignments = self._model.generate(
                chars,
                speaker_embeddings,
                style_idx=style_idx,
                min_stop_token=min_stop_token,
                steps=steps,
            )
            alignments.append(batch_alignments)

            mels = mels.detach().cpu().numpy()
            for m in mels:
                # Trim silence from the end of each spectrogram
                while np.max(m[:, -1]) < hp.tts_stop_threshold:
                    m = m[:, :-1]
                specs.append(m)

        logger.debug("\n\nDone.\n")
        return (specs, alignments) if return_alignments else specs
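
# Usage sketch (illustrative, not part of the original module): the checkpoint
# path and the embedding below are hypothetical placeholders; a real speaker
# embedding would come from the project's companion speaker encoder.
#
#   synthesizer = Synthesizer(Path("saved_models/synthesizer.pt"))
#   embed = np.random.rand(256).astype(np.float32)
#   specs = synthesizer.synthesize_spectrograms(["你好，世界"], [embed])
#   mel = specs[0]  # numpy array of shape (80, M0)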


def load_preprocess_wav(fpath):
    """
    Loads and preprocesses an audio file under the same conditions as the audio files
    that were used to train the synthesizer.
    """
    wav = librosa.load(path=str(fpath), sr=hp.sample_rate)[0]
    if hp.rescale:
        wav = wav / np.abs(wav).max() * hp.rescaling_max

    # Denoise, but only if the clip is longer than 0.4 s, so that 0.15 s from each
    # end can be treated as a noise-only profile
    if len(wav) > hp.sample_rate * (0.3 + 0.1):
        noise_wav = np.concatenate(
            [
                wav[: int(hp.sample_rate * 0.15)],
                wav[-int(hp.sample_rate * 0.15) :],
            ]
        )
        profile = profile_noise(noise_wav, hp.sample_rate)
        wav = denoise(wav, profile)
    return wav


def pad1d(x, max_len, pad_value=0):
    # Right-pad a 1D sequence with pad_value up to max_len
    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)
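

if __name__ == "__main__":
    # Minimal end-to-end sketch under stated assumptions: the checkpoint path and
    # the reference wav are hypothetical, and a random vector stands in for a real
    # speaker embedding. Because this module uses relative imports, run it as a
    # package module (python -m <package>.inference), not as a standalone script.
    synthesizer = Synthesizer(Path("saved_models/synthesizer.pt"))  # hypothetical path
    _ = load_preprocess_wav("reference.wav")  # hypothetical file; shown for the preprocessing step
    fake_embed = np.random.rand(hp.speaker_embedding_size).astype(np.float32)
    mel_specs = synthesizer.synthesize_spectrograms(["你好，世界"], [fake_embed])
    print(f"Generated {len(mel_specs)} spectrogram(s); first shape: {mel_specs[0].shape}")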