File size: 5,428 Bytes
8966d94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
import torch
import librosa
import numpy as np
from pathlib import Path
from typing import Union, List
from pypinyin import lazy_pinyin, Style

from .hparams import hparams as hp
from .utils.symbols import symbols
from .models.tacotron import Tacotron
from .utils.text import text_to_sequence
from .utils.logmmse import denoise, profile_noise
from ..log import logger


class Synthesizer:
    """Tacotron-based mel-spectrogram synthesizer.

    Wraps a pretrained Tacotron model: converts input text to pinyin,
    encodes it, and decodes mel spectrograms conditioned on speaker
    embeddings.
    """

    def __init__(self, model_path: Path):
        """Build the Tacotron graph and load checkpoint weights.

        :param model_path: path to a synthesizer checkpoint file; its
            saved state dict must contain a ``"step"`` entry.
        """
        # Prefer CUDA when present; otherwise run on CPU.
        if torch.cuda.is_available():
            self.device = torch.device("cuda")
        else:
            self.device = torch.device("cpu")
        logger.info(f"Synthesizer using device: {self.device}")

        # Model hyperparameters all come from the shared hparams module so
        # they match the configuration the checkpoint was trained with.
        self._model = Tacotron(
            embed_dims=hp.tts_embed_dims,
            num_chars=len(symbols),
            encoder_dims=hp.tts_encoder_dims,
            decoder_dims=hp.tts_decoder_dims,
            n_mels=hp.num_mels,
            fft_bins=hp.num_mels,
            postnet_dims=hp.tts_postnet_dims,
            encoder_K=hp.tts_encoder_K,
            lstm_dims=hp.tts_lstm_dims,
            postnet_K=hp.tts_postnet_K,
            num_highways=hp.tts_num_highways,
            dropout=hp.tts_dropout,
            stop_threshold=hp.tts_stop_threshold,
            speaker_embedding_size=hp.speaker_embedding_size,
        ).to(self.device)

        self._model.load(model_path, self.device)
        self._model.eval()  # inference only: disables dropout etc.

        logger.info(
            'Loaded synthesizer "%s" trained to step %d'
            % (model_path.name, self._model.state_dict()["step"])
        )

    def synthesize_spectrograms(
        self,
        texts: List[str],
        embeddings: Union[np.ndarray, List[np.ndarray]],
        return_alignments=False,
        style_idx=0,
        min_stop_token=5,
        steps=2000,
    ):
        """
        Synthesizes mel spectrograms from texts and speaker embeddings.

        :param texts: a list of N text prompts to be synthesized
        :param embeddings: a numpy array or list of speaker embeddings of shape (N, 256)
        :param return_alignments: if True, the character/decoder-step alignment
            matrices (one entry per batch) are returned alongside the spectrograms
        :param style_idx: style token index forwarded to the model
        :param min_stop_token: stop-token threshold forwarded to the model
        :param steps: maximum number of decoder steps per utterance
        :return: a list of N melspectrograms as numpy arrays of shape (80, Mi), where Mi is the
        sequence length of spectrogram i, and possibly the alignments.
        """

        logger.debug("Read " + str(texts))
        # Convert each prompt to tone-3 pinyin (neutral tone spelled as "5"),
        # which is the text representation the model was trained on.
        texts = [
            " ".join(lazy_pinyin(v, style=Style.TONE3, neutral_tone_with_five=True))
            for v in texts
        ]
        logger.debug("Synthesizing " + str(texts))
        # Preprocess text inputs into integer symbol sequences
        inputs = [text_to_sequence(text, hp.tts_cleaner_names) for text in texts]
        if not isinstance(embeddings, list):
            embeddings = [embeddings]

        # Batch inputs
        batched_inputs = [
            inputs[i : i + hp.synthesis_batch_size]
            for i in range(0, len(inputs), hp.synthesis_batch_size)
        ]
        batched_embeds = [
            embeddings[i : i + hp.synthesis_batch_size]
            for i in range(0, len(embeddings), hp.synthesis_batch_size)
        ]

        specs = []
        alignments = []
        for i, (batch, embeds) in enumerate(zip(batched_inputs, batched_embeds), 1):
            logger.debug(f"\n| Generating {i}/{len(batched_inputs)}")

            # Pad texts so they are all the same length
            text_lens = [len(text) for text in batch]
            max_text_len = max(text_lens)
            chars = [pad1d(text, max_text_len) for text in batch]
            chars = np.stack(chars)

            # Stack speaker embeddings into 2D array for batch processing
            speaker_embeds = np.stack(embeds)

            # Convert to tensor
            chars = torch.tensor(chars).long().to(self.device)
            speaker_embeddings = torch.tensor(speaker_embeds).float().to(self.device)

            # Inference — no gradients needed; saves memory and time.
            with torch.no_grad():
                _, mels, batch_alignments = self._model.generate(
                    chars,
                    speaker_embeddings,
                    style_idx=style_idx,
                    min_stop_token=min_stop_token,
                    steps=steps,
                )
            # BUGFIX: previously the per-batch alignments overwrote the
            # accumulator, so only the last batch's alignments were returned.
            alignments.append(batch_alignments)
            mels = mels.detach().cpu().numpy()
            for m in mels:
                # Trim silence from end of each spectrogram; keep at least one
                # frame so an all-silent output cannot empty the array (np.max
                # on an empty slice raises ValueError).
                while m.shape[1] > 1 and np.max(m[:, -1]) < hp.tts_stop_threshold:
                    m = m[:, :-1]
                specs.append(m)

        logger.debug("\n\nDone.\n")
        return (specs, alignments) if return_alignments else specs

    @staticmethod
    def load_preprocess_wav(fpath):
        """
        Loads and preprocesses an audio file under the same conditions the audio files were used to
        train the synthesizer.

        :param fpath: path (str or Path) to an audio file readable by librosa
        :return: 1-D float waveform resampled to ``hp.sample_rate``
        """
        wav = librosa.load(path=str(fpath), sr=hp.sample_rate)[0]
        if hp.rescale:
            # Normalize peak amplitude to the configured maximum.
            wav = wav / np.abs(wav).max() * hp.rescaling_max
        # Denoise only if the clip is long enough (> 0.4 s) to spare 0.15 s
        # from each end for noise profiling.
        if len(wav) > hp.sample_rate * (0.3 + 0.1):
            # Assume the first and last 150 ms are (mostly) noise and build a
            # noise profile from them for spectral subtraction.
            noise_wav = np.concatenate(
                [
                    wav[: int(hp.sample_rate * 0.15)],
                    wav[-int(hp.sample_rate * 0.15) :],
                ]
            )
            profile = profile_noise(noise_wav, hp.sample_rate)
            wav = denoise(wav, profile)
        return wav


def pad1d(x, max_len, pad_value=0):
    """Right-pad the 1-D sequence *x* with *pad_value* to length *max_len*."""
    trailing = max_len - len(x)
    return np.pad(x, (0, trailing), mode="constant", constant_values=pad_value)