File size: 4,186 Bytes
8966d94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
import re
import librosa
import numpy as np
from io import BytesIO
from pathlib import Path
from scipy.io import wavfile
from typing import List, Literal, Optional

from .encoder.inference import Encoder, preprocess_wav
from .synthesizer.inference import Synthesizer
from .vocoder.hifigan.inference import HifiGanVocoder
from .vocoder.wavernn.inference import WaveRNNVocoder
from .log import logger


def process_text(text: str) -> List[str]:
    """Split text into clean, non-empty segments at punctuation and newlines.

    Each run of the listed punctuation marks is treated as a line break.
    Every resulting segment is stripped of surrounding whitespace, and
    segments that are empty after stripping are dropped.

    Args:
        text: Raw input text.

    Returns:
        The stripped, non-empty segments in their original order.
    """
    punctuation = "!,。、,?!,"  # characters treated as segment separators
    # Collapse each run of punctuation into a single newline, then split.
    text = re.sub(r"[{}]+".format(punctuation), "\n", text)
    # Strip BEFORE filtering so whitespace-only segments are discarded
    # instead of being returned as empty strings.
    stripped_segments = (segment.strip() for segment in text.split("\n"))
    return [segment for segment in stripped_segments if segment]


class MockingBird:
    """Text-to-speech pipeline combining an encoder, a synthesizer and a vocoder.

    Models are loaded explicitly through `load_model` (encoder and vocoders)
    and `set_synthesizer`; `synthesize` then produces WAV data for a text
    using a reference recording.
    """

    def __init__(self):
        # Every model starts unloaded; synthesize() refuses to run until the
        # models it needs have been set.
        self.encoder: Optional[Encoder] = None
        self.gan_vocoder: Optional[HifiGanVocoder] = None
        self.rnn_vocoder: Optional[WaveRNNVocoder] = None
        self.synthesizer: Optional[Synthesizer] = None

    def load_model(
        self,
        encoder_path: Path,
        gan_vocoder_path: Optional[Path] = None,
        rnn_vocoder_path: Optional[Path] = None,
    ):
        """
        Load the Encoder model and, optionally, the Vocoder models.

        Args:
            encoder_path (Path): Encoder model path.
            gan_vocoder_path (Path): HifiGan Vocoder model path. Optional,
                but required if the HifiGan vocoder type is going to be used.
            rnn_vocoder_path (Path): WaveRNN Vocoder model path. Optional,
                but required if the WaveRNN vocoder type is going to be used.
        """
        self.encoder = Encoder(encoder_path)
        # A vocoder is only (re)loaded when its path is given; otherwise the
        # current value of the attribute is left untouched.
        if gan_vocoder_path:
            self.gan_vocoder = HifiGanVocoder(gan_vocoder_path)
        if rnn_vocoder_path:
            self.rnn_vocoder = WaveRNNVocoder(rnn_vocoder_path)

    def set_synthesizer(self, synthesizer_path: Path):
        """
        Load the Synthesizer model.

        Args:
            synthesizer_path (Path): Synthesizer model path.
        """
        self.synthesizer = Synthesizer(synthesizer_path)
        logger.info(f"using synthesizer model: {synthesizer_path}")

    def _select_vocoder(self, vocoder_type):
        """Return the loaded vocoder for `vocoder_type`, raising if it is not loaded."""
        if vocoder_type == "WaveRNN":
            if not self.rnn_vocoder:
                raise Exception("Please set wavernn vocoder path first")
            return self.rnn_vocoder
        # Any value other than "WaveRNN" selects HifiGan, which is the default.
        if not self.gan_vocoder:
            raise Exception("Please set hifigan vocoder path first")
        return self.gan_vocoder

    def synthesize(
        self,
        text: str,
        input_wav: Path,
        vocoder_type: Literal["HifiGan", "WaveRNN"] = "HifiGan",
        style_idx: int = 0,
        min_stop_token: int = 5,
        steps: int = 1000,
    ) -> BytesIO:
        """
        Generate speech for `text` using `input_wav` as the reference recording.

        Args:
            text (str): Target text.
            input_wav (Path): Path of the reference recording.
            vocoder_type (HifiGan / WaveRNN): Vocoder model, HifiGan by default.
            style_idx (int, optional): Style, range -1~9, default 0.
            min_stop_token (int, optional): Accuracy, range 3~9, default 5.
            steps (int, optional): MaxLength (maximum sentence length),
                range 200~2000, default 1000.

        Returns:
            BytesIO: Buffer holding the WAV data written by
            `scipy.io.wavfile.write` with float32 samples.

        Raises:
            Exception: If the encoder, the synthesizer, or the requested
                vocoder has not been loaded.
            ValueError: If `text` yields no segments to synthesize.
        """
        if not self.encoder:
            raise Exception("Please set encoder path first")

        if not self.synthesizer:
            raise Exception("Please set synthesizer path first")

        # Validate the vocoder up front so a missing model fails immediately
        # instead of after the encoder and synthesizer have already run.
        vocoder = self._select_vocoder(vocoder_type)

        # Split the input text into segments.
        texts = process_text(text)
        if not texts:
            # np.concatenate below rejects an empty sequence; report the real
            # cause before doing any model work.
            raise ValueError("No synthesizable text found in input")

        # Load the reference recording and compute its embedding.
        reference_wav, reference_rate = librosa.load(input_wav)
        encoder_wav = preprocess_wav(reference_wav, reference_rate)
        embed, _, _ = self.encoder.embed_utterance(encoder_wav, return_partials=True)

        # Synthesize one spectrogram per segment, all with the same embedding.
        embeds = [embed] * len(texts)
        specs = self.synthesizer.synthesize_spectrograms(
            texts,
            embeds,
            style_idx=style_idx,
            min_stop_token=min_stop_token,
            steps=steps,
        )
        # Join the per-segment spectrograms along axis 1 and vocode them in one pass.
        spec = np.concatenate(specs, axis=1)
        output_wav, output_rate = vocoder.infer_waveform(spec)

        # Return the generated audio as in-memory WAV data.
        out = BytesIO()
        wavfile.write(out, output_rate, output_wav.astype(np.float32))
        return out