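"""Synthesizer wrapper around the MockingBird Tacotron model.

Loads a trained checkpoint and turns text prompts plus speaker embeddings
into mel spectrograms. (This summary was added for readability; see the
class below for the actual behavior.)
"""
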
import torch
import librosa
import numpy as np
from pathlib import Path
from typing import Union, List
from pypinyin import lazy_pinyin, Style
from .hparams import hparams as hp
from .utils.symbols import symbols
from .models.tacotron import Tacotron
from .utils.text import text_to_sequence
from .utils.logmmse import denoise, profile_noise
from ..log import logger


class Synthesizer:
    def __init__(self, model_path: Path):
        # Use the GPU if one is available, otherwise fall back to the CPU
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        logger.info(f"Synthesizer using device: {self.device}")

        # Build the Tacotron model from the hyperparameters.
        # fft_bins is set to num_mels, matching the upstream configuration.
        self._model = Tacotron(
            embed_dims=hp.tts_embed_dims,
            num_chars=len(symbols),
            encoder_dims=hp.tts_encoder_dims,
            decoder_dims=hp.tts_decoder_dims,
            n_mels=hp.num_mels,
            fft_bins=hp.num_mels,
            postnet_dims=hp.tts_postnet_dims,
            encoder_K=hp.tts_encoder_K,
            lstm_dims=hp.tts_lstm_dims,
            postnet_K=hp.tts_postnet_K,
            num_highways=hp.tts_num_highways,
            dropout=hp.tts_dropout,
            stop_threshold=hp.tts_stop_threshold,
            speaker_embedding_size=hp.speaker_embedding_size,
        ).to(self.device)

        self._model.load(model_path, self.device)
        self._model.eval()
        logger.info(
            f'Loaded synthesizer "{model_path.name}" '
            f'trained to step {self._model.state_dict()["step"]}'
        )

    def synthesize_spectrograms(
        self,
        texts: List[str],
        embeddings: Union[np.ndarray, List[np.ndarray]],
        return_alignments=False,
        style_idx=0,
        min_stop_token=5,
        steps=2000,
    ):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, a matrix representing the alignments between the
            characters and each decoder output step will be returned for each spectrogram
        :param style_idx: style-token index forwarded to the model's generate()
        :param min_stop_token: stop-token criterion forwarded to the model's generate()
        :param steps: maximum number of decoder steps before generation is cut off
        :return: a list of N mel spectrograms as numpy arrays of shape (80, Mi), where Mi is
            the sequence length of spectrogram i, and possibly the alignments.

        A usage sketch can be found in the ``__main__`` block at the end of this module.
        """
logger.debug("Read " + str(texts))
texts = [
" ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True))
for v in texts
]
logger.debug("Synthesizing " + str(texts))
# Preprocess text inputs
inputs = [text_to_sequence(text, hp.tts_cleaner_names) for text in texts]
if not isinstance(embeddings, list):
embeddings = [embeddings]
# Batch inputs
batched_inputs = [
inputs[i : i + hp.synthesis_batch_size]
for i in range(0, len(inputs), hp.synthesis_batch_size)
]
batched_embeds = [
embeddings[i : i + hp.synthesis_batch_size]
for i in range(0, len(embeddings), hp.synthesis_batch_size)
]
        specs = []
        alignments = []
        for i, batch in enumerate(batched_inputs, 1):
            logger.debug(f"\n| Generating {i}/{len(batched_inputs)}")

            # Pad texts so they are all the same length
            text_lens = [len(text) for text in batch]
            max_text_len = max(text_lens)
            chars = [pad1d(text, max_text_len) for text in batch]
            chars = np.stack(chars)

            # Stack speaker embeddings into 2D array for batch processing
            speaker_embeds = np.stack(batched_embeds[i - 1])

            # Convert to tensors
            chars = torch.tensor(chars).long().to(self.device)
            speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)

            # Inference
            _, mels, attn_scores = self._model.generate(
                chars,
                speaker_embeddings,
                style_idx=style_idx,
                min_stop_token=min_stop_token,
                steps=steps,
            )
            mels = mels.detach().cpu().numpy()
            if return_alignments:
                # Collect one alignment matrix per spectrogram instead of
                # overwriting the list with the last batch's output
                alignments.extend(attn_scores.detach().cpu().numpy())

            for m in mels:
                # Trim silence from the end of each spectrogram; the size
                # guard prevents an empty slice on an all-silent output
                while m.shape[-1] > 1 and np.max(m[:, -1]) < hp.tts_stop_threshold:
                    m = m[:, :-1]
                specs.append(m)

        logger.debug("\n\nDone.\n")
        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def load_preprocess_wav(fpath):
        """
        Loads and preprocesses an audio file under the same conditions as the audio files
        used to train the synthesizer.
        """
        wav = librosa.load(path=str(fpath), sr=hp.sample_rate)[0]
        if hp.rescale:
            wav = wav / np.abs(wav).max() * hp.rescaling_max

        # Denoise, but only if the clip is long enough: 0.15 s is taken from
        # each end to build the noise profile, plus a 0.1 s safety margin
        if len(wav) > hp.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate(
                [
                    wav[: int(hp.sample_rate * 0.15)],
                    wav[-int(hp.sample_rate * 0.15) :],
                ]
            )
            profile = profile_noise(noise_wav, hp.sample_rate)
            wav = denoise(wav, profile)
        return wav


def pad1d(x, max_len, pad_value=0):
    """Right-pads a 1-D sequence with pad_value up to max_len."""
    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)
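

# ---------------------------------------------------------------------------
# Usage sketch (not part of the original module): a minimal example of how
# this class is meant to be driven. The checkpoint path below is a
# hypothetical placeholder, and the speaker embedding is faked with random
# values; in MockingBird the embedding would come from the speaker encoder
# applied to the output of load_preprocess_wav().
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    synth = Synthesizer(Path("synthesizer/saved_models/pretrained.pt"))  # hypothetical path

    # One 256-dim embedding per prompt. A real embedding comes from the
    # speaker encoder; this random unit vector only exercises the shapes.
    embed = np.random.rand(hp.speaker_embedding_size).astype(np.float32)
    embed /= np.linalg.norm(embed)

    specs = synth.synthesize_spectrograms(["你好，世界"], [embed])
    print(specs[0].shape)  # -> (80, Mi): n_mels x decoder frames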