Spaces:
Runtime error
Runtime error
File size: 4,186 Bytes
8966d94 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 |
import re
import librosa
import numpy as np
from io import BytesIO
from pathlib import Path
from scipy.io import wavfile
from typing import List, Literal, Optional
from .encoder.inference import Encoder, preprocess_wav
from .synthesizer.inference import Synthesizer
from .vocoder.hifigan.inference import HifiGanVocoder
from .vocoder.wavernn.inference import WaveRNNVocoder
from .log import logger
def process_text(text: str) -> List[str]:
    """Split text into clean, non-empty segments at punctuation and newlines.

    Any run of delimiter characters (ASCII ``! , ?``, CJK ``。 、``, the
    full-width forms ``! , ?``, or a newline) counts as a single break.
    Every segment is stripped of surrounding whitespace, and segments that
    are empty after stripping are discarded, so callers never receive an
    empty string.

    Args:
        text: Raw input text.

    Returns:
        The cleaned segments in their original order. The list is empty when
        the text holds nothing but delimiters and whitespace.
    """
    # punctuate and split/clean text
    # None of the delimiter characters is special inside a regex character
    # class, so they can be listed literally.
    pieces = re.split(r"[!,?。、!,?\n]+", text)
    # Strip BEFORE filtering: a piece such as " " is truthy but carries no
    # content once stripped.
    stripped = (piece.strip() for piece in pieces)
    return [piece for piece in stripped if piece]
class MockingBird:
    """Wrapper that ties together an encoder, a synthesizer and a vocoder.

    Models are loaded lazily through :meth:`load_model` and
    :meth:`set_synthesizer`; :meth:`synthesize` refuses to run until the
    models it needs have been loaded.
    """

    def __init__(self):
        # All models start unloaded; see load_model() / set_synthesizer().
        self.encoder: Optional[Encoder] = None
        self.gan_vocoder: Optional[HifiGanVocoder] = None
        self.rnn_vocoder: Optional[WaveRNNVocoder] = None
        self.synthesizer: Optional[Synthesizer] = None

    def load_model(
        self,
        encoder_path: Path,
        gan_vocoder_path: Optional[Path] = None,
        rnn_vocoder_path: Optional[Path] = None,
    ):
        """Load the encoder model and, optionally, the vocoder models.

        Args:
            encoder_path (Path): Path of the Encoder model.
            gan_vocoder_path (Path): Path of the HifiGan vocoder model.
                Optional, but required before synthesizing with the
                ``"HifiGan"`` vocoder type.
            rnn_vocoder_path (Path): Path of the WaveRNN vocoder model.
                Optional, but required before synthesizing with the
                ``"WaveRNN"`` vocoder type.
        """
        self.encoder = Encoder(encoder_path)
        # A vocoder that is not given a path keeps its previous value.
        if gan_vocoder_path:
            self.gan_vocoder = HifiGanVocoder(gan_vocoder_path)
        if rnn_vocoder_path:
            self.rnn_vocoder = WaveRNNVocoder(rnn_vocoder_path)

    def set_synthesizer(self, synthesizer_path: Path):
        """Load the synthesizer model, replacing any previously loaded one.

        Args:
            synthesizer_path (Path): Path of the Synthesizer model.
        """
        self.synthesizer = Synthesizer(synthesizer_path)
        logger.info(f"using synthesizer model: {synthesizer_path}")

    def synthesize(
        self,
        text: str,
        input_wav: Path,
        vocoder_type: Literal["HifiGan", "WaveRNN"] = "HifiGan",
        style_idx: int = 0,
        min_stop_token: int = 5,
        steps: int = 1000,
    ) -> BytesIO:
        """Generate speech for ``text`` in the voice of ``input_wav``.

        Args:
            text (str): Text to speak.
            input_wav (Path): Path of the reference recording.
            vocoder_type (HifiGan / WaveRNN): Vocoder to use; defaults to
                HifiGan. Any value other than ``"WaveRNN"`` selects HifiGan.
            style_idx (int, optional): Style, range -1~9, default 0.
            min_stop_token (int, optional): Accuracy, range 3~9, default 5.
            steps (int, optional): MaxLength (maximum sentence length),
                range 200~2000, default 1000.

        Returns:
            BytesIO: In-memory buffer that ``scipy.io.wavfile.write`` filled
            with the generated audio as float32 samples.

        Raises:
            Exception: If the encoder, the synthesizer, or the vocoder
                selected by ``vocoder_type`` has not been loaded.
            ValueError: If ``text`` contains nothing to synthesize once
                punctuation and whitespace are removed.
        """
        if not self.encoder:
            raise Exception("Please set encoder path first")
        if not self.synthesizer:
            raise Exception("Please set synthesizer path first")

        # Resolve the vocoder up front so a missing model is reported before
        # the audio load, speaker embedding and spectrogram synthesis run.
        if vocoder_type == "WaveRNN":
            if not self.rnn_vocoder:
                raise Exception("Please set wavernn vocoder path first")
            vocoder = self.rnn_vocoder
        else:
            if not self.gan_vocoder:
                raise Exception("Please set hifigan vocoder path first")
            vocoder = self.gan_vocoder

        # Load input text
        texts = process_text(text)
        if not texts:
            # Without this guard np.concatenate([]) below would fail with an
            # unrelated-looking message.
            raise ValueError("text contains nothing to synthesize")

        # Load input wav
        wav, sample_rate = librosa.load(input_wav)
        encoder_wav = preprocess_wav(wav, sample_rate)
        # Only the first of the three returned values (the embedding) is used.
        embed, _, _ = self.encoder.embed_utterance(encoder_wav, return_partials=True)

        # synthesize and vocode: every text segment shares the same embedding
        embeds = [embed] * len(texts)
        specs = self.synthesizer.synthesize_spectrograms(
            texts,
            embeds,
            style_idx=style_idx,
            min_stop_token=min_stop_token,
            steps=steps,
        )
        # Join the per-segment spectrograms along axis 1
        # (presumably the time axis -- TODO confirm against the synthesizer).
        spec = np.concatenate(specs, axis=1)
        wav, sample_rate = vocoder.infer_waveform(spec)

        # Return cooked wav
        out = BytesIO()
        wavfile.write(out, sample_rate, wav.astype(np.float32))
        return out
|