Commit
·
e059497
1
Parent(s):
8bf7a01
separate style calculation process
Browse files- inference.py +17 -18
- run.ipynb +0 -0
inference.py
CHANGED
|
@@ -33,7 +33,7 @@ def espeak_phn(text, lang):
|
|
| 33 |
print(e)
|
| 34 |
|
| 35 |
# IPA Phonemizer: https://github.com/bootphon/phonemizer
|
| 36 |
-
# Total including extend chars
|
| 37 |
|
| 38 |
_pad = "$"
|
| 39 |
_punctuation = ';:,.!?¡¿—…"«»“” '
|
|
@@ -135,9 +135,6 @@ class StyleTTS2(torch.nn.Module):
|
|
| 135 |
self.style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)# acoustic style encoder
|
| 136 |
|
| 137 |
self.__load_models(models_path)
|
| 138 |
-
|
| 139 |
-
self.ref_s_speakers = None
|
| 140 |
-
self.speakers = None
|
| 141 |
|
| 142 |
def __recursive_munch(self, d):
|
| 143 |
if isinstance(d, dict):
|
|
@@ -274,21 +271,23 @@ class StyleTTS2(torch.nn.Module):
|
|
| 274 |
|
| 275 |
return out.squeeze().cpu().numpy(), duration.mean()
|
| 276 |
|
| 277 |
-
def
|
| 278 |
-
self.ref_s_speakers = {}
|
| 279 |
-
self.speakers = speakers
|
| 280 |
-
for id in speakers:
|
| 281 |
-
ref_s = self.__compute_style(speakers[id]['path'], denoise=denoise, split_dur=split_dur)
|
| 282 |
-
self.ref_s_speakers[id] = ref_s
|
| 283 |
-
|
| 284 |
-
def generate(self, text, speakers, avg_style=False, stabilize=False, denoise=0.3, n_merge=14, default_speaker= "[id_1]"):
|
| 285 |
if avg_style: split_dur = 3
|
| 286 |
else: split_dur = 0
|
| 287 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 288 |
if stabilize: smooth_dur=0.2
|
| 289 |
else: smooth_dur=0
|
| 290 |
-
|
| 291 |
-
self.__get_styles(speakers, denoise, split_dur)
|
| 292 |
|
| 293 |
list_wav = []
|
| 294 |
prev_d_mean = 0
|
|
@@ -324,8 +323,8 @@ class StyleTTS2(torch.nn.Module):
|
|
| 324 |
if bool(re.match(r'(\[id_\d+\])', i)):
|
| 325 |
#Set up env for matched speaker
|
| 326 |
speaker_id = i.strip('[]')
|
| 327 |
-
current_ref_s =
|
| 328 |
-
speed =
|
| 329 |
continue
|
| 330 |
text_norm = self.preprocess.text_preprocess(i, n_merge=n_merge)
|
| 331 |
for sentence in text_norm:
|
|
@@ -340,7 +339,7 @@ class StyleTTS2(torch.nn.Module):
|
|
| 340 |
print(e)
|
| 341 |
|
| 342 |
replacement_func = self.__init_replacement_func(cus_phonem)
|
| 343 |
-
phonem = espeak_phn(sentence,
|
| 344 |
phonem = re.sub(lang_pattern, replacement_func, phonem)
|
| 345 |
|
| 346 |
wav, prev_d_mean = self.__inference(phonem, current_ref_s, speed=speed, prev_d_mean=prev_d_mean, t=smooth_dur)
|
|
|
|
| 33 |
print(e)
|
| 34 |
|
| 35 |
# IPA Phonemizer: https://github.com/bootphon/phonemizer
|
| 36 |
+
# Total including extend chars 189
|
| 37 |
|
| 38 |
_pad = "$"
|
| 39 |
_punctuation = ';:,.!?¡¿—…"«»“” '
|
|
|
|
| 135 |
self.style_encoder = StyleEncoder(dim_in=args.dim_in, style_dim=args.style_dim, max_conv_dim=args.hidden_dim)# acoustic style encoder
|
| 136 |
|
| 137 |
self.__load_models(models_path)
|
|
|
|
|
|
|
|
|
|
| 138 |
|
| 139 |
def __recursive_munch(self, d):
|
| 140 |
if isinstance(d, dict):
|
|
|
|
| 271 |
|
| 272 |
return out.squeeze().cpu().numpy(), duration.mean()
|
| 273 |
|
| 274 |
+
def get_styles(self, speakers, denoise=0.3, avg_style=True):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 275 |
if avg_style: split_dur = 3
|
| 276 |
else: split_dur = 0
|
| 277 |
+
styles = {}
|
| 278 |
+
for id in speakers:
|
| 279 |
+
ref_s = self.__compute_style(speakers[id]['path'], denoise=denoise, split_dur=split_dur)
|
| 280 |
+
styles[id] = {
|
| 281 |
+
'style': ref_s,
|
| 282 |
+
'path': speakers[id]['path'],
|
| 283 |
+
'lang': speakers[id]['lang'],
|
| 284 |
+
'speed': speakers[id]['speed'],
|
| 285 |
+
}
|
| 286 |
+
return styles
|
| 287 |
+
|
| 288 |
+
def generate(self, text, styles, stabilize=False, n_merge=14, default_speaker= "[id_1]"):
|
| 289 |
if stabilize: smooth_dur=0.2
|
| 290 |
else: smooth_dur=0
|
|
|
|
|
|
|
| 291 |
|
| 292 |
list_wav = []
|
| 293 |
prev_d_mean = 0
|
|
|
|
| 323 |
if bool(re.match(r'(\[id_\d+\])', i)):
|
| 324 |
#Set up env for matched speaker
|
| 325 |
speaker_id = i.strip('[]')
|
| 326 |
+
current_ref_s = styles[speaker_id]['style']
|
| 327 |
+
speed = styles[speaker_id]['speed']
|
| 328 |
continue
|
| 329 |
text_norm = self.preprocess.text_preprocess(i, n_merge=n_merge)
|
| 330 |
for sentence in text_norm:
|
|
|
|
| 339 |
print(e)
|
| 340 |
|
| 341 |
replacement_func = self.__init_replacement_func(cus_phonem)
|
| 342 |
+
phonem = espeak_phn(sentence, styles[speaker_id]['lang'])
|
| 343 |
phonem = re.sub(lang_pattern, replacement_func, phonem)
|
| 344 |
|
| 345 |
wav, prev_d_mean = self.__inference(phonem, current_ref_s, speed=speed, prev_d_mean=prev_d_mean, t=smooth_dur)
|
run.ipynb
CHANGED
|
The diff for this file is too large to render.
See raw diff
|
|
|