File size: 5,428 Bytes
8966d94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import torch
import librosa
import numpy as np
from pathlib import Path
from typing import Union, List
from pypinyin import lazy_pinyin, Style

from .hparams import hparams as hp
from .utils.symbols import symbols
from .models.tacotron import Tacotron
from .utils.text import text_to_sequence
from .utils.logmmse import denoise, profile_noise
from ..log import logger


class Synthesizer:
    """Tacotron-based mel-spectrogram synthesizer.

    Wraps a pretrained Tacotron model: converts input text to pinyin,
    encodes it, and decodes mel spectrograms conditioned on speaker
    embeddings.
    """

    def __init__(self, model_path: Path):
        """Build the Tacotron graph and load checkpoint weights.

        :param model_path: path to a synthesizer checkpoint file; its
            saved state dict must contain a ``"step"`` entry.
        """
        # Prefer CUDA when present; otherwise run on CPU.
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        logger.info(f"Synthesizer using device: {self.device}")

        # Model hyperparameters all come from the shared hparams module so
        # they match the configuration the checkpoint was trained with.
        self._model = Tacotron(
            embed_dims=hp.tts_embed_dims,
            num_chars=len(symbols),
            encoder_dims=hp.tts_encoder_dims,
            decoder_dims=hp.tts_decoder_dims,
            n_mels=hp.num_mels,
            fft_bins=hp.num_mels,
            postnet_dims=hp.tts_postnet_dims,
            encoder_K=hp.tts_encoder_K,
            lstm_dims=hp.tts_lstm_dims,
            postnet_K=hp.tts_postnet_K,
            num_highways=hp.tts_num_highways,
            dropout=hp.tts_dropout,
            stop_threshold=hp.tts_stop_threshold,
            speaker_embedding_size=hp.speaker_embedding_size,
        ).to(self.device)

        self._model.load(model_path, self.device)
        self._model.eval()  # inference only: disables dropout etc.

        logger.info(
            'Loaded synthesizer "%s" trained to step %d'
            % (model_path.name, self._model.state_dict()["step"])
        )

    def synthesize_spectrograms(
        self,
        texts: List[str],
        embeddings: Union[np.ndarray, List[np.ndarray]],
        return_alignments=False,
        style_idx=0,
        min_stop_token=5,
        steps=2000,
    ):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, the character/decoder-step alignment
            matrices (one entry per batch) are returned alongside the spectrograms
        :param style_idx: style token index forwarded to the model
        :param min_stop_token: stop-token threshold forwarded to the model
        :param steps: maximum number of decoder steps per utterance
        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
        sequence length of spectrogram i, and possibly the alignments.
        """

        logger.debug("Read " + str(texts))
        # Convert each prompt to tone-3 pinyin (neutral tone spelled as "5"),
        # which is the text representation the model was trained on.
        texts = [
            " ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True))
            for v in texts
        ]
        logger.debug("Synthesizing " + str(texts))
        # Preprocess text inputs into integer symbol sequences
        inputs = [text_to_sequence(text, hp.tts_cleaner_names) for text in texts]
        if not isinstance(embeddings, list):
            embeddings = [embeddings]

        # Batch inputs
        batched_inputs = [
            inputs[i : i + hp.synthesis_batch_size]
            for i in range(0, len(inputs), hp.synthesis_batch_size)
        ]
        batched_embeds = [
            embeddings[i : i + hp.synthesis_batch_size]
            for i in range(0, len(embeddings), hp.synthesis_batch_size)
        ]

        specs = []
        alignments = []
        for i, (batch, embeds) in enumerate(zip(batched_inputs, batched_embeds), 1):
            logger.debug(f"\n| Generating {i}/{len(batched_inputs)}")

            # Pad texts so they are all the same length
            text_lens = [len(text) for text in batch]
            max_text_len = max(text_lens)
            chars = [pad1d(text, max_text_len) for text in batch]
            chars = np.stack(chars)

            # Stack speaker embeddings into 2D array for batch processing
            speaker_embeds = np.stack(embeds)

            # Convert to tensor
            chars = torch.tensor(chars).long().to(self.device)
            speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)

            # Inference — no gradients needed; saves memory and time.
            with torch.no_grad():
                _, mels, batch_alignments = self._model.generate(
                    chars,
                    speaker_embeddings,
                    style_idx=style_idx,
                    min_stop_token=min_stop_token,
                    steps=steps,
                )
            # BUGFIX: previously the per-batch alignments overwrote the
            # accumulator, so only the last batch's alignments were returned.
            alignments.append(batch_alignments)
            mels = mels.detach().cpu().numpy()
            for m in mels:
                # Trim silence from end of each spectrogram; keep at least one
                # frame so an all-silent output cannot empty the array (np.max
                # on an empty slice raises ValueError).
                while m.shape[1] > 1 and np.max(m[:, -1]) < hp.tts_stop_threshold:
                    m = m[:, :-1]
                specs.append(m)

        logger.debug("\n\nDone.\n")
        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def load_preprocess_wav(fpath):
        """
        Loads and preprocesses an audio file under the same conditions the audio files were used to
        train the synthesizer.

        :param fpath: path (str or Path) to an audio file readable by librosa
        :return: 1-D float waveform resampled to ``hp.sample_rate``
        """
        wav = librosa.load(path=str(fpath), sr=hp.sample_rate)[0]
        if hp.rescale:
            # Normalize peak amplitude to the configured maximum.
            wav = wav / np.abs(wav).max() * hp.rescaling_max
        # Denoise only if the clip is long enough (> 0.4 s) to spare 0.15 s
        # from each end for noise profiling.
        if len(wav) > hp.sample_rate * (0.3 + 0.1):
            # Assume the first and last 150 ms are (mostly) noise and build a
            # noise profile from them for spectral subtraction.
            noise_wav = np.concatenate(
                [
                    wav[: int(hp.sample_rate * 0.15)],
                    wav[-int(hp.sample_rate * 0.15) :],
                ]
            )
            profile = profile_noise(noise_wav, hp.sample_rate)
            wav = denoise(wav, profile)
        return wav


def pad1d(x, max_len, pad_value=0):
    """Right-pad the 1-D sequence *x* with *pad_value* to length *max_len*."""
    trailing = max_len - len(x)
    return np.pad(x, (0, trailing), mode="constant", constant_values=pad_value)