niobures commited on
Commit
b915266
·
verified ·
1 Parent(s): 07d6458

HiFi-GAN (models_onnx: ailia-models)

Browse files
.gitattributes CHANGED
@@ -51,3 +51,5 @@ hifigan_for_sherpa/pretrained/VCTK_V3/generator_v3 filter=lfs diff=lfs merge=lfs
51
  tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.nemo filter=lfs diff=lfs merge=lfs -text
52
  tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-0.6090-epoch-39-last.nemo filter=lfs diff=lfs merge=lfs -text
53
  tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo filter=lfs diff=lfs merge=lfs -text
 
 
 
51
  tts_fa_fastpitch_hifigan-v2.0/models/FastPitch--val_loss-0.7236-epoch-50.nemo filter=lfs diff=lfs merge=lfs -text
52
  tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-0.6090-epoch-39-last.nemo filter=lfs diff=lfs merge=lfs -text
53
  tts_fa_fastpitch_hifigan-v2.0/models/HifiGan--val_loss-2.0733-epoch-12-last.nemo filter=lfs diff=lfs merge=lfs -text
54
+ ailia-models/code/tests/test1.wav filter=lfs diff=lfs merge=lfs -text
55
+ ailia-models/code/tests/test2.wav filter=lfs diff=lfs merge=lfs -text
ailia-models/code/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) 2020 Jungil Kong
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
ailia-models/code/README.md ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # HiFi-GAN: Generative Adversarial Networks for Efficient and High Fidelity Speech Synthesis.
2
+
3
+ ### Input
4
+ A wav file, or an npy file containing mel spectrograms.
5
+
6
+ ### Output
7
+ The generated audio is saved as a .wav file; the default path is defined as `SAVE_WAV_PATH` in `hifigan.py`.
8
+
9
+ ### Usage
10
+ Automatically downloads the onnx and prototxt files on the first run. It is necessary to be connected to the Internet while downloading.
11
+
12
+ For the sample file, use the following command. It uses the numpy file by default.
13
+ ```
14
+ pip3 install -r requirements.txt
15
+
16
+ python3 hifigan.py
17
+ ```
18
+
19
+ You can switch the sample input to the wav file with the following command:
20
+
21
+ ```
22
+ python3 hifigan.py --inputType wav
23
+ ```
24
+
25
+ If you want to specify the input file, put the wav or numpy file path after the --input option.
26
+ You can use the --savepath option to change the name of the output file. There is no need to specify the input type; it is inferred from the file extension.
27
+
28
+ ```
29
+ python3 hifigan.py --input test.wav --savepath SAVE_WAV_PATH
30
+ ```
31
+
32
+
33
+ ### Framework
34
+ PyTorch
35
+
36
+ ### Model Format
37
+ ONNX opset = 11
38
+
39
+ ### Netron
40
+
41
+ [HIFI GAN model](LICENSE_HIFI)
42
+
43
+ - [generator_dynamic.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/hifigan/generator_dynamic.onnx.prototxt)
ailia-models/code/hifigan.py ADDED
@@ -0,0 +1,253 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import sys
3
+ import argparse
4
+ import re
5
+
6
+ import numpy as np
7
+ import soundfile as sf
8
+ import librosa
9
+ import librosa.filters
10
+ from scipy.io.wavfile import write, read
11
+ from librosa.util import normalize
12
+ from librosa.filters import mel as librosa_mel_fn
13
+
14
+
15
+
16
+ import ailia # noqa: E402
17
+
18
+ # import original modules
19
+ sys.path.append('../../util')
20
+ from arg_utils import get_base_parser, update_parser, get_savepath # noqa: E402
21
+ from model_utils import check_and_download_models # noqa: E402
22
+
23
+ # logger
24
+ from logging import getLogger # noqa: E402
25
+ logger = getLogger(__name__)
26
+
27
+ # ======================
28
+ # PARAMETERS
29
+ # ======================
30
+
31
# Default path of the wav file produced by the script.
SAVE_WAV_PATH = "output.wav"
# Bundled sample inputs: a wav recording and a .npy array.
INPUT_WAV_PATH = "tests/test2.wav"
INPUT_NP_PATH = "tests/test.npy"
# Base URL handed to check_and_download_models() to fetch the model files.
REMOTE_PATH = "https://storage.googleapis.com/ailia-models/hifigan/"
35
+
36
+
37
+ # ======================
38
+ # Argument Parser Config
39
+ # ======================
40
+
41
parser = get_base_parser('HIFI GAN', INPUT_WAV_PATH, SAVE_WAV_PATH)
# overwrite: re-declare --input with a None default so the script can tell
# whether the user supplied a file or the bundled sample should be used.
parser.add_argument(
    '--input', '-i', metavar='TEXT', default=None,
    help='path to the input wav or npy file'
)
parser.add_argument(
    '--onnx', action='store_true',
    help='use onnx runtime'
)
# Only consulted when --input is not given; selects which bundled sample
# (numpy mel array or wav recording) is used.
parser.add_argument(
    '--inputType', default="numpy",
    help='[numpy, wav]'
)
parser.add_argument(
    '-m', '--model',
    default='hifi',
    help='[hifi]'
)
parser.add_argument(
    '--profile', action='store_true',
    help='use profile model'
)
args = update_parser(parser, check_input_type=False)

if args.model == "hifi":
    WEIGHT_PATH_hifi = 'generator_dynamic.onnx'
else:
    logger.error("unknown model")
    # Exit with a non-zero status so callers and shell scripts can detect
    # the failure; a bare sys.exit() reports success.
    sys.exit(1)

# The model description file sits next to the weights with a .prototxt suffix.
MODEL_PATH_hifi = WEIGHT_PATH_hifi + '.prototxt'
76
+
77
+
78
+
79
+
80
+
81
+
82
+
83
+ # ======================
84
+ # Parameters
85
+ # ======================
86
+
87
# Import the inference backend selected on the command line.
if args.onnx:
    import onnxruntime
else:
    import ailia

# Resolve the file to read: an explicit --input wins, otherwise fall back to
# the bundled sample that matches --inputType.
if args.input:
    text = args.input[0]
elif args.inputType == "numpy":
    text = INPUT_NP_PATH
else:
    text = INPUT_WAV_PATH
103
+
104
# Parameters required to create mel spectrograms
sampling_rate = 22050
segment_size = 8192  # not referenced elsewhere in this script
num_mels = 80
num_freq = 1025  # not referenced elsewhere in this script
n_fft = win_size = 1024
hop_size = 256

fmin, fmax = 0, 8000
# Scale factor used to convert between wav sample values and floats.
MAX_WAV_VALUE = 32768.0
116
+
117
+
118
+
119
+ # ======================
120
+ # Functions
121
+ # ======================
122
+
123
+
124
def load_wav(full_path):
    """Read a wav file and return its samples as a single-channel signal.

    Args:
        full_path: Path handed to ``scipy.io.wavfile.read``.

    Returns:
        A ``(data, sr)`` tuple: the sample array (averaged over channels when
        the file holds a 2-D array) in the file's original dtype, and the
        sampling rate reported by the file.
    """
    sr, data = read(full_path)

    # Conversion of stereo to mono. The average is computed in floating point
    # and only then cast back: accumulating directly in an integer dtype such
    # as int16 wraps around as soon as the channel sum leaves its range.
    if data.ndim == 2:
        data = np.mean(data, axis=1).astype(data.dtype)
    return data, sr
131
+
132
def dynamic_range_compression(x, C=1, clip_val=1e-5):
    """Return the natural log of ``x`` after flooring it and scaling by ``C``.

    Args:
        x: Input array.
        C: Multiplier applied after the floor.
        clip_val: Lower bound applied to ``x`` so the logarithm stays finite.
    """
    floored = np.clip(x, clip_val, None)
    scaled = floored * C
    return np.log(scaled)
134
+
135
+
136
def mel_spectrogram(y, n_fft, num_mels, sr, hop_size, win_size, fmin, fmax, center=False):
    """Compute a log-compressed mel spectrogram of a waveform.

    Args:
        y: Waveform array with a leading axis of size 1; the padding spec and
            the axis-0 squeeze below both rely on that two-axis layout.
        n_fft: FFT size.
        num_mels: Number of mel bands in the filter bank.
        sr: Sampling rate used to build the mel filter bank.
        hop_size: Hop length between STFT frames.
        win_size: Length of the Hann window.
        fmin: Lowest frequency of the filter bank.
        fmax: Highest frequency of the filter bank.
        center: Accepted for signature compatibility but not used; the STFT
            is always run with ``center=False`` after manual padding.

    Returns:
        The compressed mel spectrogram with an extra axis inserted in front.
    """
    lowest, highest = np.min(y), np.max(y)
    if lowest < -1.:
        print('min value is ', lowest)
    if highest > 1.:
        print('max value is ', highest)

    filter_bank = librosa_mel_fn(sr=sr, n_fft=n_fft, n_mels=num_mels, fmin=fmin, fmax=fmax)
    window = np.hanning(win_size)

    # Reflect-pad both ends by hand, then drop the leading axis for the STFT.
    margin = int((n_fft - hop_size) / 2)
    padded = np.pad(y, ((0, 0), (margin, margin)), mode='reflect')
    signal = np.squeeze(padded, axis=0)

    magnitudes = np.abs(
        librosa.stft(signal, n_fft=n_fft, hop_length=hop_size, win_length=win_size,
                     window=window, center=False, pad_mode='reflect')
    )
    mel = np.dot(filter_bank, magnitudes)
    return np.expand_dims(dynamic_range_compression(mel), 0)
159
+
160
+
161
def get_mel(PATH_TO_MELL):
    """Load and return the array stored in the .npy file at ``PATH_TO_MELL``."""
    mel = np.load(PATH_TO_MELL)
    return mel
163
+
164
def check_input_type():
    """Decide whether the input should be handled as "numpy" or "wav".

    When ``--input`` was given, the type is derived from the extension of the
    first input path; otherwise the value of ``--inputType`` is returned.

    Returns:
        "numpy" for ``.npy`` files, "wav" for ``.wav`` files, the
        ``--inputType`` value when no input path was supplied, or ``None``
        (after printing a message) for any other extension.
    """
    from os.path import splitext

    if not args.input:
        return args.inputType

    # Use the real file extension: splitting on the first "." breaks for
    # paths such as "./tests/a.wav" or "clip.v2.npy" and raises IndexError
    # when the name contains no dot at all.
    extension = splitext(args.input[0])[1].lower()
    if extension == ".npy":
        return "numpy"
    if extension == ".wav":
        return "wav"

    print("Unsupported input")
    return None
177
+
178
+
179
+
180
+
181
+
182
def generate_voice(hifi):
    """Run the vocoder on the selected input and write the result as a wav file.

    The input is either a wav file, which is converted to a mel spectrogram
    first, or a .npy file that is loaded and fed to the model as is. The
    output is written to ``args.savepath``.

    Args:
        hifi: The loaded model; an ``onnxruntime.InferenceSession`` when
            ``--onnx`` is set, otherwise an ``ailia.Net``.
    """
    output_rate = sampling_rate
    input_kind = check_input_type()

    if input_kind != "numpy":
        wav, sr = load_wav(text)
        # Scale the wav samples by MAX_WAV_VALUE before computing the mel.
        wav = wav / MAX_WAV_VALUE
        wav = np.expand_dims(wav, axis=0)
        mel_outputs = mel_spectrogram(wav, n_fft, num_mels, sampling_rate, hop_size, win_size, fmin, fmax, center=False)
        # Write the output at the rate reported by the input file.
        output_rate = sr
    else:
        mel_outputs = get_mel(text)

    if args.benchmark:
        start = int(round(time.time() * 1000))

    if args.onnx:
        hifi_inputs = {hifi.get_inputs()[0].name: mel_outputs.astype(np.float32)}
        audio = hifi.run(None, hifi_inputs)[0]
    else:
        # The last axis of the mel array varies per input, so the input
        # shape is set before every run.
        hifi.set_input_shape((1, 80, mel_outputs.shape[2]))
        hifi_inputs = [mel_outputs]
        audio = hifi.run(hifi_inputs)[0]

    if args.benchmark:
        end = int(round(time.time() * 1000))
        estimation_time = (end - start)
        logger.info(f'\t Hifi processing time {estimation_time} ms')

    savepath = args.savepath
    audio = audio.squeeze()
    audio = audio * MAX_WAV_VALUE
    # Saturate to the int16 range before casting: an unclipped cast wraps
    # around (e.g. +32768 becomes -32768) and produces loud clicks.
    int16_limits = np.iinfo(np.int16)
    audio = np.clip(audio, int16_limits.min, int16_limits.max).astype('int16')
    sf.write(savepath, audio, output_rate)
    # Report the path only once the file has actually been written.
    logger.info(f'saved at : {savepath}')
    logger.info('Script finished successfully.')
226
+
227
+
228
+
229
def main():
    """Fetch the model files if needed, load the network and run synthesis."""
    # model files check and download
    check_and_download_models(WEIGHT_PATH_hifi, MODEL_PATH_hifi, REMOTE_PATH)

    # set_profile_mode() / get_summary() are called on the ailia.Net object;
    # an onnxruntime session does not offer them, so profiling is limited to
    # the ailia backend instead of failing with AttributeError after the run.
    profile_enabled = args.profile and not args.onnx
    if args.profile and args.onnx:
        logger.warning('--profile is ignored when --onnx is used')

    if args.onnx:
        hifi = onnxruntime.InferenceSession(WEIGHT_PATH_hifi)
    else:
        memory_mode = ailia.get_memory_mode(
            reduce_constant=True,
            ignore_input_with_initializer=True,
            reduce_interstage=False,
            reuse_interstage=True,
        )
        hifi = ailia.Net(
            stream=MODEL_PATH_hifi,
            weight=WEIGHT_PATH_hifi,
            memory_mode=memory_mode,
            env_id=args.env_id,
        )
        if profile_enabled:
            hifi.set_profile_mode(True)

    generate_voice(hifi)

    if profile_enabled:
        print("HIFI GAN : ")
        print(hifi.get_summary())


if __name__ == '__main__':
    main()
ailia-models/code/requirments.txt ADDED
@@ -0,0 +1,2 @@
 
 
 
1
+ soundfile
2
+ librosa
ailia-models/code/tests/test.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2aaec0d65eb65c11f667cc150667dc6966fdd154242c32413cd956de63a90825
3
+ size 188928
ailia-models/code/tests/test1.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6906f9861ef1ac1c62a0d240ec626a1bb5e338abb094c8179c1d28c15db80fa3
3
+ size 2394322
ailia-models/code/tests/test2.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4d2bd83ff0f7fe33491b03193abdf623f1b4cc2d8103972b33638d3607d86dc5
3
+ size 425830
ailia-models/generator_dynamic.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:68f2506eee79192c8d5366e196a4c18397ad51973834b80983fdfbc1cefd6f77
3
+ size 56087745
ailia-models/generator_dynamic.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
ailia-models/source.txt ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ https://github.com/axinc-ai/ailia-models/tree/master/audio_processing/hifigan
2
+
3
+ https://storage.googleapis.com/ailia-models/hifigan/generator_dynamic.onnx
4
+ https://storage.googleapis.com/ailia-models/hifigan/generator_dynamic.onnx.prototxt