Update synthesizer/inference.py

synthesizer/inference.py (+166 -165)
CHANGED
@@ -1,165 +1,166 @@

[Removed: the previous revision of the file. It was identical to the new version through line 135 and ended in blank lines, leaving load_preprocess_wav without a body. The complete updated file follows.]
```python
import torch
from synthesizer import audio
from synthesizer.hparams import hparams
from synthesizer.models.tacotron import Tacotron
from synthesizer.utils.symbols import symbols
from synthesizer.utils.text import text_to_sequence
from vocoder.display import simple_table
from pathlib import Path
from typing import Union, List
import numpy as np
import librosa


class Synthesizer:
    sample_rate = hparams.sample_rate
    hparams = hparams

    def __init__(self, model_fpath: Path, verbose=True):
        """
        The model isn't instantiated and loaded in memory until needed or until load() is called.

        :param model_fpath: path to the trained model file
        :param verbose: if False, prints less information when using the model
        """
        self.model_fpath = model_fpath
        self.verbose = verbose

        # Check for GPU
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        if self.verbose:
            print("Synthesizer using device:", self.device)

        # Tacotron model will be instantiated later on first use.
        self._model = None

    def is_loaded(self):
        """
        Whether the model is loaded in memory.
        """
        return self._model is not None

    def load(self):
        """
        Instantiates and loads the model given the weights file that was passed in the constructor.
        """
        self._model = Tacotron(embed_dims=hparams.tts_embed_dims,
                               num_chars=len(symbols),
                               encoder_dims=hparams.tts_encoder_dims,
                               decoder_dims=hparams.tts_decoder_dims,
                               n_mels=hparams.num_mels,
                               fft_bins=hparams.num_mels,
                               postnet_dims=hparams.tts_postnet_dims,
                               encoder_K=hparams.tts_encoder_K,
                               lstm_dims=hparams.tts_lstm_dims,
                               postnet_K=hparams.tts_postnet_K,
                               num_highways=hparams.tts_num_highways,
                               dropout=hparams.tts_dropout,
                               stop_threshold=hparams.tts_stop_threshold,
                               speaker_embedding_size=hparams.speaker_embedding_size).to(self.device)

        self._model.load(self.model_fpath)
        self._model.eval()

        if self.verbose:
            print("Loaded synthesizer \"%s\" trained to step %d" %
                  (self.model_fpath.name, self._model.state_dict()["step"]))

    def synthesize_spectrograms(self, texts: List[str],
                                embeddings: Union[np.ndarray, List[np.ndarray]],
                                return_alignments=False):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, a matrix representing the alignments between the
        characters and each decoder output step will be returned for each spectrogram
        :return: a list of N mel spectrograms as numpy arrays of shape (80, Mi), where Mi is the
        sequence length of spectrogram i, and possibly the alignments.
        """
        # Load the model on the first request.
        if not self.is_loaded():
            self.load()

        # Preprocess text inputs
        inputs = [text_to_sequence(text.strip(), hparams.tts_cleaner_names) for text in texts]
        if not isinstance(embeddings, list):
            embeddings = [embeddings]

        # Batch inputs
        batched_inputs = [inputs[i:i + hparams.synthesis_batch_size]
                          for i in range(0, len(inputs), hparams.synthesis_batch_size)]
        batched_embeds = [embeddings[i:i + hparams.synthesis_batch_size]
                          for i in range(0, len(embeddings), hparams.synthesis_batch_size)]

        specs = []
        for i, batch in enumerate(batched_inputs, 1):
            if self.verbose:
                print(f"\n| Generating {i}/{len(batched_inputs)}")

            # Pad texts so they are all the same length
            text_lens = [len(text) for text in batch]
            max_text_len = max(text_lens)
            chars = [pad1d(text, max_text_len) for text in batch]
            chars = np.stack(chars)

            # Stack speaker embeddings into 2D array for batch processing
            speaker_embeds = np.stack(batched_embeds[i - 1])

            # Convert to tensor
            chars = torch.tensor(chars).long().to(self.device)
            speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)

            # Inference
            _, mels, alignments = self._model.generate(chars, speaker_embeddings)
            mels = mels.detach().cpu().numpy()
            for m in mels:
                # Trim silence from the end of each spectrogram
                while np.max(m[:, -1]) < hparams.tts_stop_threshold:
                    m = m[:, :-1]
                specs.append(m)

        if self.verbose:
            print("\n\nDone.\n")
        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def load_preprocess_wav(fpath):
        """
        Loads and preprocesses an audio file under the same conditions as the audio files that
        were used to train the synthesizer.
        """
        print("Loading", str(fpath), "at sample rate", hparams.sample_rate)
        wav = librosa.load(str(fpath), sr=hparams.sample_rate)[0]
        if hparams.rescale:
            wav = wav / np.abs(wav).max() * hparams.rescaling_max
        return wav

    @staticmethod
    def make_spectrogram(fpath_or_wav: Union[str, Path, np.ndarray]):
        """
        Creates a mel spectrogram from an audio file in the same manner as the mel spectrograms
        that were fed to the synthesizer when training.
        """
        if isinstance(fpath_or_wav, (str, Path)):
            wav = Synthesizer.load_preprocess_wav(fpath_or_wav)
        else:
            wav = fpath_or_wav

        mel_spectrogram = audio.melspectrogram(wav, hparams).astype(np.float32)
        return mel_spectrogram

    @staticmethod
    def griffin_lim(mel):
        """
        Inverts a mel spectrogram using Griffin-Lim. The mel spectrogram is expected to have
        been built with the same parameters present in hparams.py.
        """
        return audio.inv_mel_spectrogram(mel, hparams)


def pad1d(x, max_len, pad_value=0):
    return np.pad(x, (0, max_len - len(x)), mode="constant", constant_values=pad_value)
```
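For quick verification, here is a minimal usage sketch of the completed class. It is not part of the committed file: the checkpoint path is hypothetical, and the speaker embedding is a random placeholder standing in for one produced by the speaker encoder module.

```python
import numpy as np
from pathlib import Path
from synthesizer.inference import Synthesizer

# Hypothetical checkpoint path -- substitute your own trained synthesizer weights.
synthesizer = Synthesizer(Path("saved_models/default/synthesizer.pt"))

# Placeholder speaker embedding; in practice it comes from the speaker encoder.
embed = np.random.rand(256).astype(np.float32)
embed /= np.linalg.norm(embed)

# The model is loaded lazily on this first call.
specs = synthesizer.synthesize_spectrograms(["Hello, world."], [embed])

# Rough waveform reconstruction with Griffin-Lim (a neural vocoder sounds better).
wav = Synthesizer.griffin_lim(specs[0])
print(wav.shape, Synthesizer.sample_rate)
```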