import torch
import librosa
import numpy as np
from pathlib import Path
from typing import Union, List
from pypinyin import lazy_pinyin, Style
from .hparams import hparams as hp
from .utils.symbols import symbols
from .models.tacotron import Tacotron
from .utils.text import text_to_sequence
from .utils.logmmse import denoise, profile_noise
from ..log import logger


class Synthesizer:
    def __init__(self, model_path: Path):
        # Check for GPU
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        logger.info(f"Synthesizer using device: {self.device}")

        self._model = Tacotron(
            embed_dims=hp.tts_embed_dims,
            num_chars=len(symbols),
            encoder_dims=hp.tts_encoder_dims,
            decoder_dims=hp.tts_decoder_dims,
            n_mels=hp.num_mels,
            fft_bins=hp.num_mels,
            postnet_dims=hp.tts_postnet_dims,
            encoder_K=hp.tts_encoder_K,
            lstm_dims=hp.tts_lstm_dims,
            postnet_K=hp.tts_postnet_K,
            num_highways=hp.tts_num_highways,
            dropout=hp.tts_dropout,
            stop_threshold=hp.tts_stop_threshold,
            speaker_embedding_size=hp.speaker_embedding_size,
        ).to(self.device)

        self._model.load(model_path, self.device)
        self._model.eval()
        logger.info(
            'Loaded synthesizer "%s" trained to step %d'
            % (model_path.name, self._model.state_dict()["step"])
        )

    def synthesize_spectrograms(
        self,
        texts: List[str],
        embeddings: Union[np.ndarray, List[np.ndarray]],
        return_alignments=False,
        style_idx=0,
        min_stop_token=5,
        steps=2000,
    ):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, a matrix representing the alignments between the
            characters and each decoder output step will be returned for each spectrogram
        :return: a list of N mel spectrograms as numpy arrays of shape (80, Mi), where Mi is the
            sequence length of spectrogram i, and possibly the alignments.
        """
        logger.debug("Read " + str(texts))

        # Convert the input text to pinyin with tone numbers (neutral tone written as 5)
        texts = [
            " ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True))
            for v in texts
        ]
        logger.debug("Synthesizing " + str(texts))

        # Preprocess text inputs
        inputs = [text_to_sequence(text, hp.tts_cleaner_names) for text in texts]
        if not isinstance(embeddings, list):
            embeddings = [embeddings]

        # Batch inputs
        batched_inputs = [
            inputs[i : i + hp.synthesis_batch_size]
            for i in range(0, len(inputs), hp.synthesis_batch_size)
        ]
        batched_embeds = [
            embeddings[i : i + hp.synthesis_batch_size]
            for i in range(0, len(embeddings), hp.synthesis_batch_size)
        ]

        specs = []
        alignments = []
        for i, batch in enumerate(batched_inputs, 1):
            logger.debug(f"\n| Generating {i}/{len(batched_inputs)}")

            # Pad texts so they are all the same length
            text_lens = [len(text) for text in batch]
            max_text_len = max(text_lens)
            chars = [pad1d(text, max_text_len) for text in batch]
            chars = np.stack(chars)

            # Stack speaker embeddings into 2D array for batch processing
            speaker_embeds = np.stack(batched_embeds[i - 1])

            # Convert to tensor
            chars = torch.tensor(chars).long().to(self.device)
            speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)

            # Inference
            _, mels, alignments = self._model.generate(
                chars,
                speaker_embeddings,
                style_idx=style_idx,
                min_stop_token=min_stop_token,
                steps=steps,
            )
            mels = mels.detach().cpu().numpy()
            for m in mels:
                # Trim silence from end of each spectrogram
                while np.max(m[:, -1]) < hp.tts_stop_threshold:
                    m = m[:, :-1]
                specs.append(m)

        logger.debug("\n\nDone.\n")
        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def load_preprocess_wav(fpath):
        """
        Loads and preprocesses an audio file under the same conditions as the audio
        that was used to train the synthesizer.
        """
        wav = librosa.load(path=str(fpath), sr=hp.sample_rate)[0]
        if hp.rescale:
            wav = wav / np.abs(wav).max() * hp.rescaling_max

        # Denoise: if the clip is long enough, estimate a noise profile from its
        # first and last 150 ms and subtract it from the whole waveform
        if len(wav) > hp.sample_rate * (0.3 + 0.1):
            noise_wav = np.concatenate(
                [
                    wav[: int(hp.sample_rate * 0.15)],
                    wav[-int(hp.sample_rate * 0.15) :],
                ]
            )
            profile = profile_noise(noise_wav, hp.sample_rate)
            wav = denoise(wav, profile)

        return wav


def pad1d(x, max_len, pad_value=0):
    """Right-pads a 1D sequence with pad_value up to max_len."""
    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)
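

# --- Usage sketch (not part of the original module) --------------------------
# A minimal, hedged example of how this class might be driven end to end.
# The checkpoint path and the speaker embedding below are assumptions for
# illustration only: in practice the 256-dim embedding comes from the speaker
# encoder, and the returned mel spectrograms are passed on to a vocoder.
if __name__ == "__main__":
    # Hypothetical checkpoint location
    synthesizer = Synthesizer(Path("data/ckpt/synthesizer/pretrained.pt"))

    # Dummy unit-norm embedding of the expected shape (256,); a real one would
    # be produced by the speaker encoder from a reference utterance.
    embedding = np.random.rand(256).astype(np.float32)
    embedding /= np.linalg.norm(embedding)

    specs = synthesizer.synthesize_spectrograms(["你好，世界"], [embedding])
    for spec in specs:
        # Each spec is an (80, Mi) numpy array ready to be fed to a vocoder.
        print("mel spectrogram shape:", spec.shape)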