inoryQwQ committed on
Commit
6726998
·
verified ·
1 Parent(s): 1984c02

Fix g.bin path

Browse files
Files changed (1) hide show
  1. python/melotts.py +235 -235
python/melotts.py CHANGED
@@ -1,235 +1,235 @@
1
- import os
2
- os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
3
-
4
- import numpy as np
5
- import soundfile
6
- import onnxruntime as ort
7
- import axengine as axe
8
- import argparse
9
- import time
10
- from split_utils import split_sentence
11
- from text import cleaned_text_to_sequence
12
- from text.cleaner import clean_text
13
- from symbols import LANG_TO_SYMBOL_MAP
14
- import re
15
-
16
- def intersperse(lst, item):
17
- result = [item] * (len(lst) * 2 + 1)
18
- result[1::2] = lst
19
- return result
20
-
21
- def get_text_for_tts_infer(text, language_str, symbol_to_id=None):
22
- norm_text, phone, tone, word2ph = clean_text(text, language_str)
23
- phone, tone, language = cleaned_text_to_sequence(phone, tone, language_str, symbol_to_id)
24
-
25
- phone = intersperse(phone, 0)
26
- tone = intersperse(tone, 0)
27
- language = intersperse(language, 0)
28
-
29
- phone = np.array(phone, dtype=np.int32)
30
- tone = np.array(tone, dtype=np.int32)
31
- language = np.array(language, dtype=np.int32)
32
- word2ph = np.array(word2ph, dtype=np.int32) * 2
33
- word2ph[0] += 1
34
-
35
- return phone, tone, language, norm_text, word2ph
36
-
37
- def split_sentences_into_pieces(text, language, quiet=False):
38
- texts = split_sentence(text, language_str=language)
39
- if not quiet:
40
- print(" > Text split to sentences.")
41
- print('\n'.join(texts))
42
- print(" > ===========================")
43
- return texts
44
-
45
- def get_args():
46
- parser = argparse.ArgumentParser(
47
- prog="melotts",
48
- description="Run TTS on input sentence"
49
- )
50
- parser.add_argument("--sentence", "-s", type=str, required=False, default="爱芯元智半导体股份有限公司,致力于打造世界领先的人工智能感知与边缘计算芯片。服务智慧城市、智能驾驶、机器人的海量普惠的应用")
51
- parser.add_argument("--wav", "-w", type=str, required=False, default="output.wav")
52
- parser.add_argument("--encoder", "-e", type=str, required=False, default=None)
53
- parser.add_argument("--decoder", "-d", type=str, required=False, default=None)
54
- parser.add_argument("--dec_len", type=int, default=128)
55
- parser.add_argument("--sample_rate", "-sr", type=int, required=False, default=44100)
56
- parser.add_argument("--speed", type=float, required=False, default=0.8)
57
- parser.add_argument("--language", "-l", type=str,
58
- choices=["ZH", "ZH_MIX_EN", "JP", "EN", 'KR', "ES", "SP","FR"], required=False, default="ZH_MIX_EN")
59
- return parser.parse_args()
60
-
61
-
62
- def audio_numpy_concat(segment_data_list, sr, speed=1.):
63
- audio_segments = []
64
- for segment_data in segment_data_list:
65
- audio_segments += segment_data.reshape(-1).tolist()
66
- audio_segments += [0] * int((sr * 0.05) / speed)
67
- audio_segments = np.array(audio_segments).astype(np.float32)
68
- return audio_segments
69
-
70
-
71
- def merge_sub_audio(sub_audio_list, pad_size, audio_len):
72
- # Average pad part
73
- if pad_size > 0:
74
- for i in range(len(sub_audio_list) - 1):
75
- sub_audio_list[i][-pad_size:] += sub_audio_list[i+1][:pad_size]
76
- sub_audio_list[i][-pad_size:] /= 2
77
- if i > 0:
78
- sub_audio_list[i] = sub_audio_list[i][pad_size:]
79
-
80
- sub_audio = np.concatenate(sub_audio_list, axis=-1)
81
- return sub_audio[:audio_len]
82
-
83
- # 计算每个词的发音时长
84
- def calc_word2pronoun(word2ph, pronoun_lens):
85
- indice = [0]
86
- for ph in word2ph[:-1]:
87
- indice.append(indice[-1] + ph)
88
- word2pronoun = []
89
- for i, ph in zip(indice, word2ph):
90
- word2pronoun.append(np.sum(pronoun_lens[i : i + ph]))
91
- return word2pronoun
92
-
93
- # 生成有overlap的slice,slice索引是对于zp的
94
- def generate_slices(word2pronoun, dec_len):
95
- pn_start, pn_end = 0, 0
96
- zp_start, zp_end = 0, 0
97
- zp_len = 0
98
- pn_slices = []
99
- zp_slices = []
100
- while pn_end < len(word2pronoun):
101
- # 前一个slice长度大于2 且 加上现在这个字没有超过dec_len,则往前overlap两个字
102
- if pn_end - pn_start > 2 and np.sum(word2pronoun[pn_end - 2 : pn_end + 1]) <= dec_len:
103
- zp_len = np.sum(word2pronoun[pn_end - 2 : pn_end])
104
- zp_start = zp_end - zp_len
105
- pn_start = pn_end - 2
106
- else:
107
- zp_len = 0
108
- zp_start = zp_end
109
- pn_start = pn_end
110
-
111
- while pn_end < len(word2pronoun) and zp_len + word2pronoun[pn_end] <= dec_len:
112
- zp_len += word2pronoun[pn_end]
113
- pn_end += 1
114
- zp_end = zp_start + zp_len
115
- pn_slices.append(slice(pn_start, pn_end))
116
- zp_slices.append(slice(zp_start, zp_end))
117
- return pn_slices, zp_slices
118
-
119
- def main():
120
- args = get_args()
121
- sentence = args.sentence
122
- sample_rate = args.sample_rate
123
- enc_model = args.encoder # default="../models/encoder.onnx"
124
- dec_model = args.decoder # default="../models/decoder.axmodel"
125
- language = args.language # default: ZH_MIX_EN
126
- dec_len = args.dec_len # default: 128
127
-
128
- if language == "ZH":
129
- language = "ZH_MIX_EN"
130
-
131
- if enc_model is None:
132
- if "ZH" in language:
133
- enc_model = "../models/encoder-zh.onnx"
134
- else:
135
- enc_model = f"../models/encoder-{language.lower()}.onnx"
136
- assert os.path.exists(enc_model), f"Encoder model ({enc_model}) not exist!"
137
- if dec_model is None:
138
- if "ZH" in language:
139
- dec_model = "../models/decoder-zh.axmodel"
140
- else:
141
- dec_model = f"../models/decoder-{language.lower()}.axmodel"
142
- assert os.path.exists(dec_model), f"Decoder model ({dec_model}) not exist!"
143
-
144
- print(f"sentence: {sentence}")
145
- print(f"sample_rate: {sample_rate}")
146
- print(f"encoder: {enc_model}")
147
- print(f"decoder: {dec_model}")
148
- print(f"language: {language}")
149
-
150
- _symbol_to_id = {s: i for i, s in enumerate(LANG_TO_SYMBOL_MAP[language])}
151
-
152
- # Split sentence
153
- start = time.time()
154
- sens = split_sentences_into_pieces(sentence, language, quiet=False)
155
- print(f"split_sentences_into_pieces take {1000 * (time.time() - start)}ms")
156
-
157
- # Load models
158
- start = time.time()
159
- sess_enc = ort.InferenceSession(enc_model, providers=["CPUExecutionProvider"], sess_options=ort.SessionOptions())
160
- sess_dec = axe.InferenceSession(dec_model)
161
- print(f"load models take {1000 * (time.time() - start)}ms")
162
-
163
- # Load static input
164
- g = np.fromfile(f"../models/g-{language.lower()}.bin", dtype=np.float32).reshape(1, 256, 1)
165
-
166
- # Final wav
167
- audio_list = []
168
-
169
- # Iterate over splitted sentences
170
- for n, se in enumerate(sens):
171
- if language in ['EN', 'ZH_MIX_EN']:
172
- se = re.sub(r'([a-z])([A-Z])', r'\1 \2', se)
173
- print(f"\nSentence[{n}]: {se}")
174
- # Convert sentence to phones and tones
175
- phones, tones, lang_ids, norm_text, word2ph = get_text_for_tts_infer(se, language, symbol_to_id=_symbol_to_id)
176
-
177
- start = time.time()
178
- # Run encoder
179
- z_p, pronoun_lens, audio_len = sess_enc.run(None, input_feed={
180
- 'phone': phones, 'g': g,
181
- 'tone': tones, 'language': lang_ids,
182
- 'noise_scale': np.array([0], dtype=np.float32),
183
- 'length_scale': np.array([1.0 / args.speed], dtype=np.float32),
184
- 'noise_scale_w': np.array([0], dtype=np.float32),
185
- 'sdp_ratio': np.array([0], dtype=np.float32)})
186
- print(f"encoder run take {1000 * (time.time() - start):.2f}ms")
187
-
188
- # 计算每个词的发音长度
189
- word2pronoun = calc_word2pronoun(word2ph, pronoun_lens)
190
- # 生成word2pronoun和zp的切片
191
- pn_slices, zp_slices = generate_slices(word2pronoun, dec_len)
192
-
193
- audio_len = audio_len[0]
194
- sub_audio_list = []
195
- for i, (ps, zs) in enumerate(zip(pn_slices, zp_slices)):
196
- zp_slice = z_p[..., zs]
197
-
198
- # Padding前zp的长度
199
- sub_dec_len = zp_slice.shape[-1]
200
- # Padding前输出音频的长度
201
- sub_audio_len = 512 * sub_dec_len
202
-
203
- # Padding到dec_len
204
- if zp_slice.shape[-1] < dec_len:
205
- zp_slice = np.concatenate((zp_slice, np.zeros((*zp_slice.shape[:-1], dec_len - zp_slice.shape[-1]), dtype=np.float32)), axis=-1)
206
-
207
- start = time.time()
208
- audio = sess_dec.run(None, input_feed={"z_p": zp_slice,
209
- "g": g
210
- })[0].flatten()
211
-
212
- # 处理overlap
213
- audio_start = 0
214
- if len(sub_audio_list) > 0:
215
- if pn_slices[i - 1].stop > ps.start:
216
- # 去掉第一个字
217
- audio_start = 512 * word2pronoun[ps.start]
218
-
219
- audio_end = sub_audio_len
220
- if i < len(pn_slices) - 1:
221
- if ps.stop > pn_slices[i + 1].start:
222
- # 去掉最后一个字
223
- audio_end = sub_audio_len - 512 * word2pronoun[ps.stop - 1]
224
-
225
- audio = audio[audio_start:audio_end]
226
- print(f"Decode slice[{i}]: decoder run take {1000 * (time.time() - start):.2f}ms")
227
- sub_audio_list.append(audio)
228
- sub_audio = merge_sub_audio(sub_audio_list, 0, audio_len)
229
- audio_list.append(sub_audio)
230
- audio = audio_numpy_concat(audio_list, sr=sample_rate, speed=args.speed)
231
- soundfile.write(args.wav, audio, sample_rate)
232
- print(f"Save to {args.wav}")
233
-
234
- if __name__ == "__main__":
235
- main()
 
1
+ import os
2
+ os.environ["HF_ENDPOINT"] = "https://hf-mirror.com"
3
+
4
+ import numpy as np
5
+ import soundfile
6
+ import onnxruntime as ort
7
+ import axengine as axe
8
+ import argparse
9
+ import time
10
+ from split_utils import split_sentence
11
+ from text import cleaned_text_to_sequence
12
+ from text.cleaner import clean_text
13
+ from symbols import LANG_TO_SYMBOL_MAP
14
+ import re
15
+
16
def intersperse(lst, item):
    """Return a new list with ``item`` before, between and after the elements of ``lst``.

    For ``n`` input elements the result holds ``2 * n + 1`` entries, so an
    empty input yields ``[item]``.
    """
    padded = [item]
    for element in lst:
        padded.append(element)
        padded.append(item)
    return padded
20
+
21
def get_text_for_tts_infer(text, language_str, symbol_to_id=None):
    """Convert raw text into the int32 id arrays used for TTS inference.

    The text is cleaned with ``clean_text`` and mapped to ids with
    ``cleaned_text_to_sequence``; each id sequence is then interleaved with
    0 via ``intersperse``.

    Returns:
        Tuple ``(phone, tone, language, norm_text, word2ph)`` where the first
        three and ``word2ph`` are ``np.int32`` arrays.
    """
    norm_text, raw_phones, raw_tones, raw_word2ph = clean_text(text, language_str)
    phone_ids, tone_ids, language_ids = cleaned_text_to_sequence(
        raw_phones, raw_tones, language_str, symbol_to_id
    )

    def _blank_padded(ids):
        # Insert a 0 before, between and after the ids, then freeze as int32.
        return np.array(intersperse(ids, 0), dtype=np.int32)

    phone = _blank_padded(phone_ids)
    tone = _blank_padded(tone_ids)
    language = _blank_padded(language_ids)

    # Interleaving doubles every per-word symbol count; the one extra leading
    # 0 is credited to the first word so the counts still sum to len(phone).
    word2ph = np.array(raw_word2ph, dtype=np.int32) * 2
    word2ph[0] += 1

    return phone, tone, language, norm_text, word2ph
36
+
37
def split_sentences_into_pieces(text, language, quiet=False):
    """Split ``text`` with ``split_sentence`` and return the resulting pieces.

    Unless ``quiet`` is true, the pieces are also printed one per line
    between a header and a footer line.
    """
    texts = split_sentence(text, language_str=language)
    if quiet:
        return texts

    for line in (
        " > Text split to sentences.",
        '\n'.join(texts),
        " > ===========================",
    ):
        print(line)
    return texts
44
+
45
def get_args(argv=None):
    """Parse the command-line options of the TTS script.

    Args:
        argv: Optional list of argument strings. When ``None`` (the default,
            matching the previous behaviour) ``sys.argv[1:]`` is parsed.
            Passing an explicit list lets other code and tests drive the
            parser without touching the process arguments.

    Returns:
        ``argparse.Namespace`` with ``sentence``, ``wav``, ``encoder``,
        ``decoder``, ``dec_len``, ``sample_rate``, ``speed`` and ``language``.
    """
    parser = argparse.ArgumentParser(
        prog="melotts",
        description="Run TTS on input sentence"
    )
    parser.add_argument("--sentence", "-s", type=str, required=False,
                        default="爱芯元智半导体股份有限公司,致力于打造世界领先的人工智能感知与边缘计算芯片。服务智慧城市、智能驾驶、机器人的海量普惠的应用",
                        help="text to synthesize")
    parser.add_argument("--wav", "-w", type=str, required=False, default="output.wav",
                        help="path of the wav file to write")
    parser.add_argument("--encoder", "-e", type=str, required=False, default=None,
                        help="encoder model path; derived from --language when omitted")
    parser.add_argument("--decoder", "-d", type=str, required=False, default=None,
                        help="decoder model path; derived from --language when omitted")
    parser.add_argument("--dec_len", type=int, default=128,
                        help="number of z_p frames fed to the decoder per run")
    parser.add_argument("--sample_rate", "-sr", type=int, required=False, default=44100,
                        help="sample rate used when writing the wav file")
    parser.add_argument("--speed", type=float, required=False, default=0.8,
                        help="speaking speed; the encoder length scale is 1/speed")
    parser.add_argument("--language", "-l", type=str,
                        choices=["ZH", "ZH_MIX_EN", "JP", "EN", 'KR', "ES", "SP", "FR"],
                        required=False, default="ZH_MIX_EN",
                        help="language of the input sentence")
    return parser.parse_args(argv)
60
+
61
+
62
def audio_numpy_concat(segment_data_list, sr, speed=1.):
    """Join audio segments into one float32 array with a short silence after each.

    Each segment is flattened and followed by ``int((sr * 0.05) / speed)``
    zero samples. The arrays are concatenated directly instead of being
    round-tripped through a Python list, which avoids boxing every sample
    into a Python float.

    Args:
        segment_data_list: Iterable of array-likes holding audio samples.
        sr: Sample rate used to size the silence.
        speed: Speed factor; the silence shrinks as the speed grows.

    Returns:
        1-D ``np.float32`` array; empty when no segments are given.
    """
    segments = [
        np.asarray(segment_data).reshape(-1).astype(np.float32)
        for segment_data in segment_data_list
    ]
    if not segments:
        return np.zeros(0, dtype=np.float32)

    # A non-positive count means no silence, as ``[0] * negative`` did before.
    gap = np.zeros(max(0, int((sr * 0.05) / speed)), dtype=np.float32)

    pieces = []
    for segment in segments:
        pieces.append(segment)
        pieces.append(gap)
    return np.concatenate(pieces)
69
+
70
+
71
def merge_sub_audio(sub_audio_list, pad_size, audio_len):
    """Concatenate decoded chunks, averaging their overlap, and cut to ``audio_len``.

    When ``pad_size > 0`` the last ``pad_size`` samples of each chunk are
    averaged with the first ``pad_size`` samples of the next chunk, and that
    head is then dropped from the next chunk so the overlap appears once.
    The head of the final chunk used to be kept, duplicating the last
    overlap; it is now dropped like the others.

    The inputs are not modified, since averaging is done on copies.

    Args:
        sub_audio_list: Sequence of 1-D sample arrays, in playback order.
        pad_size: Number of overlapping samples between neighbours; 0 means
            plain concatenation.
        audio_len: Maximum number of samples to return.

    Returns:
        1-D array with at most ``audio_len`` samples; an empty float32 array
        when ``sub_audio_list`` is empty.
    """
    if len(sub_audio_list) == 0:
        return np.zeros(0, dtype=np.float32)

    if pad_size > 0:
        # Work on copies so the caller's arrays stay untouched.
        pieces = [np.array(piece) for piece in sub_audio_list]
        for i in range(len(pieces) - 1):
            blended = (pieces[i][-pad_size:] + pieces[i + 1][:pad_size]) / 2
            pieces[i][-pad_size:] = blended
        # Every chunk after the first has its head already folded into the
        # previous chunk's tail, so drop that head everywhere.
        pieces = pieces[:1] + [piece[pad_size:] for piece in pieces[1:]]
    else:
        pieces = list(sub_audio_list)

    sub_audio = np.concatenate(pieces, axis=-1)
    return sub_audio[:audio_len]
82
+
83
def calc_word2pronoun(word2ph, pronoun_lens):
    """Compute the pronunciation length of every word.

    ``word2ph[k]`` is the number of consecutive entries of ``pronoun_lens``
    that belong to word ``k``; the result holds, per word, the ``np.sum`` of
    its entries.
    """
    word2pronoun = []
    offset = 0
    for span in word2ph:
        word2pronoun.append(np.sum(pronoun_lens[offset : offset + span]))
        offset = offset + span
    return word2pronoun
92
+
93
def generate_slices(word2pronoun, dec_len):
    """Cut the word sequence into overlapping pieces of at most ``dec_len`` frames.

    Words are packed greedily into pieces whose summed length stays within
    ``dec_len``. When the previous piece holds more than two words and its
    last two words plus the next word still fit, the new piece starts two
    words earlier so neighbouring pieces overlap.

    Args:
        word2pronoun: Per-word lengths (see ``calc_word2pronoun``).
        dec_len: Maximum summed length of one piece.

    Returns:
        ``(pn_slices, zp_slices)``: ``pn_slices`` index into ``word2pronoun``
        (words) and ``zp_slices`` are the matching ranges along the z_p axis.

    Raises:
        ValueError: If a single word is longer than ``dec_len``. Previously
            this case never advanced and looped forever appending empty
            slices.
    """
    pn_start, pn_end = 0, 0
    zp_start, zp_end = 0, 0
    pn_slices = []
    zp_slices = []
    total_words = len(word2pronoun)

    while pn_end < total_words:
        # Overlap two words backwards if the previous piece has more than two
        # words and adding the current word still fits into dec_len.
        if pn_end - pn_start > 2 and np.sum(word2pronoun[pn_end - 2 : pn_end + 1]) <= dec_len:
            zp_len = np.sum(word2pronoun[pn_end - 2 : pn_end])
            zp_start = zp_end - zp_len
            pn_start = pn_end - 2
        else:
            zp_len = 0
            zp_start = zp_end
            pn_start = pn_end

        first_new_word = pn_end
        while pn_end < total_words and zp_len + word2pronoun[pn_end] <= dec_len:
            zp_len += word2pronoun[pn_end]
            pn_end += 1

        if pn_end == first_new_word:
            # Nothing new fitted: this word alone exceeds dec_len.
            raise ValueError(
                f"word {pn_end} needs {word2pronoun[pn_end]} frames, "
                f"which exceeds dec_len={dec_len}"
            )

        zp_end = zp_start + zp_len
        pn_slices.append(slice(pn_start, pn_end))
        zp_slices.append(slice(zp_start, zp_end))
    return pn_slices, zp_slices
118
+
119
def main():
    """Synthesize the requested sentence and write it to a wav file.

    Steps: parse the options, resolve and check the model paths, split the
    sentence, then for every piece run the ONNX encoder once and the
    axengine decoder on fixed-length slices of ``z_p``, trim the overlapping
    words, and finally join all pieces and save them.

    Raises:
        ValueError: If ``--speed`` is not positive.
        FileNotFoundError: If the encoder or decoder model file is missing.
            These checks used ``assert`` before, which is stripped under
            ``python -O``.
    """
    args = get_args()
    sentence = args.sentence
    sample_rate = args.sample_rate
    enc_model = args.encoder  # None -> derived from the language below
    dec_model = args.decoder  # None -> derived from the language below
    language = args.language  # default: ZH_MIX_EN
    dec_len = args.dec_len  # default: 128

    # The encoder length scale is 1 / speed, so speed must be > 0.
    if args.speed <= 0:
        raise ValueError(f"--speed must be positive, got {args.speed}")

    if language == "ZH":
        language = "ZH_MIX_EN"

    # All ZH variants share the "zh" model files.
    model_tag = "zh" if "ZH" in language else language.lower()

    if enc_model is None:
        enc_model = f"../models/encoder-{model_tag}.onnx"
    if not os.path.exists(enc_model):
        raise FileNotFoundError(f"Encoder model ({enc_model}) not exist!")
    if dec_model is None:
        dec_model = f"../models/decoder-{model_tag}.axmodel"
    if not os.path.exists(dec_model):
        raise FileNotFoundError(f"Decoder model ({dec_model}) not exist!")

    print(f"sentence: {sentence}")
    print(f"sample_rate: {sample_rate}")
    print(f"encoder: {enc_model}")
    print(f"decoder: {dec_model}")
    print(f"language: {language}")

    _symbol_to_id = {s: i for i, s in enumerate(LANG_TO_SYMBOL_MAP[language])}

    # Split sentence
    start = time.time()
    sens = split_sentences_into_pieces(sentence, language, quiet=False)
    print(f"split_sentences_into_pieces take {1000 * (time.time() - start)}ms")

    # Load models
    start = time.time()
    sess_enc = ort.InferenceSession(enc_model, providers=["CPUExecutionProvider"], sess_options=ort.SessionOptions())
    sess_dec = axe.InferenceSession(dec_model)
    print(f"load models take {1000 * (time.time() - start)}ms")

    # Load static input
    g = np.fromfile(f"../g-{language.lower()}.bin", dtype=np.float32).reshape(1, 256, 1)

    # Audio samples produced per z_p frame.
    # NOTE(review): presumably the decoder's upsampling factor - confirm.
    samples_per_frame = 512

    # Final wav
    audio_list = []

    # Iterate over the split sentences
    for n, se in enumerate(sens):
        if language in ['EN', 'ZH_MIX_EN']:
            # Insert a space at lower-case/upper-case boundaries (camelCase).
            se = re.sub(r'([a-z])([A-Z])', r'\1 \2', se)
        print(f"\nSentence[{n}]: {se}")
        # Convert sentence to phones and tones
        phones, tones, lang_ids, norm_text, word2ph = get_text_for_tts_infer(se, language, symbol_to_id=_symbol_to_id)

        start = time.time()
        # Run encoder
        z_p, pronoun_lens, audio_len = sess_enc.run(None, input_feed={
            'phone': phones, 'g': g,
            'tone': tones, 'language': lang_ids,
            'noise_scale': np.array([0], dtype=np.float32),
            'length_scale': np.array([1.0 / args.speed], dtype=np.float32),
            'noise_scale_w': np.array([0], dtype=np.float32),
            'sdp_ratio': np.array([0], dtype=np.float32)})
        print(f"encoder run take {1000 * (time.time() - start):.2f}ms")

        # Pronunciation length of every word
        word2pronoun = calc_word2pronoun(word2ph, pronoun_lens)
        # Slices over word2pronoun (words) and over z_p (frames)
        pn_slices, zp_slices = generate_slices(word2pronoun, dec_len)

        audio_len = audio_len[0]
        sub_audio_list = []
        for i, (ps, zs) in enumerate(zip(pn_slices, zp_slices)):
            zp_slice = z_p[..., zs]

            # Length of z_p before padding
            sub_dec_len = zp_slice.shape[-1]
            # Length of the output audio before padding
            sub_audio_len = samples_per_frame * sub_dec_len

            # Zero-pad the last axis up to dec_len
            if zp_slice.shape[-1] < dec_len:
                zp_slice = np.concatenate((zp_slice, np.zeros((*zp_slice.shape[:-1], dec_len - zp_slice.shape[-1]), dtype=np.float32)), axis=-1)

            start = time.time()
            audio = sess_dec.run(None, input_feed={"z_p": zp_slice,
                                                   "g": g
                                                   })[0].flatten()

            # Handle overlap with the neighbouring slices
            audio_start = 0
            if len(sub_audio_list) > 0:
                if pn_slices[i - 1].stop > ps.start:
                    # Drop the first word
                    audio_start = samples_per_frame * word2pronoun[ps.start]

            audio_end = sub_audio_len
            if i < len(pn_slices) - 1:
                if ps.stop > pn_slices[i + 1].start:
                    # Drop the last word
                    audio_end = sub_audio_len - samples_per_frame * word2pronoun[ps.stop - 1]

            audio = audio[audio_start:audio_end]
            print(f"Decode slice[{i}]: decoder run take {1000 * (time.time() - start):.2f}ms")
            sub_audio_list.append(audio)
        sub_audio = merge_sub_audio(sub_audio_list, 0, audio_len)
        audio_list.append(sub_audio)
    audio = audio_numpy_concat(audio_list, sr=sample_rate, speed=args.speed)
    soundfile.write(args.wav, audio, sample_rate)
    print(f"Save to {args.wav}")


if __name__ == "__main__":
    main()