niobures committed on
Commit
815e31f
·
verified ·
1 Parent(s): 81ceaa5

U-Net (models_onnx)

Browse files
.gitattributes CHANGED
@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  Funnel[[:space:]]Deep[[:space:]]Complex[[:space:]]U-Net[[:space:]]for[[:space:]]Phase-Aware[[:space:]]Speech[[:space:]]Enhancement.pdf filter=lfs diff=lfs merge=lfs -text
37
  Phase-aware[[:space:]]Speech[[:space:]]Enhancement[[:space:]]with[[:space:]]Deep[[:space:]]Complex[[:space:]]U-Net.pdf filter=lfs diff=lfs merge=lfs -text
 
 
 
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
  Funnel[[:space:]]Deep[[:space:]]Complex[[:space:]]U-Net[[:space:]]for[[:space:]]Phase-Aware[[:space:]]Speech[[:space:]]Enhancement.pdf filter=lfs diff=lfs merge=lfs -text
37
  Phase-aware[[:space:]]Speech[[:space:]]Enhancement[[:space:]]with[[:space:]]Deep[[:space:]]Complex[[:space:]]U-Net.pdf filter=lfs diff=lfs merge=lfs -text
38
+ models/ailia-models/code/049[[:space:]]-[[:space:]]Young[[:space:]]Griffo[[:space:]]-[[:space:]]Facade.wav filter=lfs diff=lfs merge=lfs -text
39
+ models/ailia-models/code/doublenoble_k7rain_part.wav filter=lfs diff=lfs merge=lfs -text
models/ailia-models/RefineSpectrogramUnet.best.opt.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6ab1e0af3c22250f626379ee4a367687a28547fe3aa186b6a614e1b9dee3b3da
3
+ size 381668080
models/ailia-models/RefineSpectrogramUnet.best.opt.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/code/049 - Young Griffo - Facade.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d9701789b5d5d6dd89d82cbc146fc70edc127cbec5176be7816079cd06225c91
3
+ size 6867808
models/ailia-models/code/LICENSE ADDED
@@ -0,0 +1,25 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ BSD 2-Clause License
2
+
3
+ Copyright (c) 2019, ILJI CHOI
4
+ All rights reserved.
5
+
6
+ Redistribution and use in source and binary forms, with or without
7
+ modification, are permitted provided that the following conditions are met:
8
+
9
+ 1. Redistributions of source code must retain the above copyright notice, this
10
+ list of conditions and the following disclaimer.
11
+
12
+ 2. Redistributions in binary form must reproduce the above copyright notice,
13
+ this list of conditions and the following disclaimer in the documentation
14
+ and/or other materials provided with the distribution.
15
+
16
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
17
+ AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18
+ IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
19
+ DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
20
+ FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21
+ DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
22
+ SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
23
+ CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
24
+ OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
25
+ OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
models/ailia-models/code/README.md ADDED
@@ -0,0 +1,71 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # source_separation
2
+
3
+ ### input
4
+
5
+ - Noisy speech (audio file)
6
+
7
+ ```
8
+ Audio from creative commons youtube videos
9
+ https://drive.google.com/drive/folders/19Sn6pe5-BtWXYa6OiLbYGH7iCU-mzB8j
10
+ doublenoble_k7rain_part.wav
11
+ (Original video : https://www.youtube.com/watch?v=vsjB1xTwZ20&t=536s)
12
+ ```
13
+
14
+ - Music (audio file)
15
+ ```
16
+ DSD100 dataset
17
+ https://sigsep.github.io/datasets/dsd100.html
18
+ 049 - Young Griffo - Facade.wav
19
+ ```
20
+
21
+ ### output
22
+
23
+ Separated voice (audio file)
24
+ ```
25
+ separated_voice.wav
26
+ ```
27
+
28
+ ### Usage
29
+ Automatically downloads the onnx and prototxt files on the first run. It is necessary to be connected to the Internet while downloading.
30
+
31
+ For the sample audio file,
32
+ ```bash
33
+ $ python3 unet_source_separation.py
34
+
35
+ ```
36
+
37
+ If you want to specify the input audio file, put the input path after the --input option.
38
+ You can use --savepath option to change the name of the output file to save.
39
+ ```bash
40
+ $ python3 unet_source_separation.py --input WAV_PATH --savepath SAVE_WAV_PATH
41
+ ```
42
+
43
+ You can select a pretrained model by specifying --arch base (default) or --arch large.
44
+ `base` is a model for general voice separation task, and `large` is a model for singing voice separation task.
45
+ ```bash
46
+ $ python3 unet_source_separation.py --input WAV_PATH --savepath SAVE_WAV_PATH --arch base
47
+ ```
48
+
49
+
50
+ ### Reference
51
+
52
+ [source_separation](https://github.com/AppleHolic/source_separation)
53
+
54
+ [Singing Voice Separation Samples](https://www.youtube.com/playlist?list=PLQ4ukFz6Ieir5bZYOns08_2gMjt4hYP4I)
55
+
56
+ ### Framework
57
+
58
+ PyTorch 1.6.0
59
+
60
+ ### Model Format
61
+
62
+ ONNX opset = 11
63
+
64
+ ### Netron
65
+ - General voice separation
66
+
67
+ [second_voice_bank.best.opt.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/unet_source_separation/second_voice_bank.best.opt.onnx.prototxt)
68
+
69
+ - Singing voice separation
70
+
71
+ [RefineSpectrogramUnet.best.opt.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/unet_source_separation/RefineSpectrogramUnet.best.opt.onnx.prototxt)
models/ailia-models/code/doublenoble_k7rain_part.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac658a1284aa28ea7c77f5126691ef02696fa1bfa41a0a5b41cd9906260bf8dd
3
+ size 11518066
models/ailia-models/code/requirements.txt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ numpy==1.22.0
2
+ soundfile==0.10.3.post1
3
+ scipy==1.10.0
models/ailia-models/code/unet_source_separation.py ADDED
@@ -0,0 +1,200 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import time
import sys
import argparse

import numpy as np

import ailia # noqa: E402

import soundfile as sf

# import original modules
sys.path.append('../../util')
from arg_utils import get_base_parser, update_parser, get_savepath # noqa: E402
from model_utils import check_and_download_models # noqa: E402

# logger
from logging import getLogger # noqa: E402
logger = getLogger(__name__)


# ======================
# Parameters 1
# ======================
WAV_PATH = 'doublenoble_k7rain_part.wav' # noisy speech sample (default input)
# WAV_PATH = '049 - Young Griffo - Facade.wav' # music sample (use with --arch large)
SAVE_WAV_PATH = 'separated_voice.wav'
MODEL_LISTS = ['base', 'large']


# ======================
# Argument Parser Config
# ======================
# NOTE(review): 'RSource separation.' in the description below looks like a
# typo for 'Source separation.' (visible in --help output) — confirm.
parser = get_base_parser(
    'RSource separation.',
    WAV_PATH,
    SAVE_WAV_PATH,
)
parser.add_argument(
    '-n', '--onnx',
    action='store_true',
    default=False,
    help='Use onnxruntime'
)
parser.add_argument(
    '-st', '--stereo',
    action='store_true',
    default=False,
    help='Use stereo mode'
)
parser.add_argument(
    '-a', '--arch',
    default='base', choices=MODEL_LISTS,
    help='model lists: ' + ' | '.join(MODEL_LISTS)
)
parser.add_argument(
    '--ailia_audio', action='store_true',
    help='use ailia audio library'
)
args = update_parser(parser)

# Select the pre/post-processing backend; both util modules expose the same
# function set (preemphasis, inv_preemphasis, lowpass, tfconvert, zero_pad,
# calc_time), one implemented with ailia.audio, the other with scipy.
if args.ailia_audio:
    import ailia.audio as ailia_audio
    from unet_source_separation_utils_ailia import preemphasis, inv_preemphasis, lowpass, tfconvert, zero_pad, calc_time # noqa: E402
else:
    from scipy import signal
    from unet_source_separation_utils import preemphasis, inv_preemphasis, lowpass, tfconvert, zero_pad, calc_time # noqa: E402


# ======================
# Parameters 2
# ======================

if args.arch == 'base' : # for general voice separation
    # NOTE(review): this commit adds second_voice_bank.best.opt.onnx (no "2")
    # to the repo — confirm the "opt2" filename exists at REMOTE_PATH.
    WEIGHT_PATH = "second_voice_bank.best.opt2.onnx"
else : # for singing voice separation
    WEIGHT_PATH = "RefineSpectrogramUnet.best.opt.onnx"
MODEL_PATH = WEIGHT_PATH + ".prototxt"
REMOTE_PATH = "https://storage.googleapis.com/ailia-models/unet_source_separation/"

# fixed parameters for each model
if args.arch == 'base' :
    DESIRED_SR = 22050   # input is resampled to this rate [Hz]
    MULT = 2 ** 5        # STFT frame count is zero-padded to a multiple of this (see tfconvert)
    WINDOW_LEN = 512     # STFT window length [samples]
    HOP_LEN = 64         # STFT hop length [samples]
else :
    DESIRED_SR = 44100
    MULT = 2 ** 6
    WINDOW_LEN = 1024
    HOP_LEN = 128

# adjustable parameters
if args.arch == 'base' :
    LPF_CUTOFF = 10000   # post-filter cutoff [Hz]; values <= 0 skip the low-pass stage
else :
    LPF_CUTOFF = 20000
99
+ # ======================
100
+ # Main function
101
+ # ======================
102
def src_sep(data, session):
    """Run the separation model on the (magnitude, phase) feature pair.

    `session` is an onnxruntime.InferenceSession when --onnx was given,
    otherwise an ailia.Net. Returns the model's first output.
    """
    if args.onnx:
        model_inputs = session.get_inputs()
        feed = {
            model_inputs[0].name: data[0],
            model_inputs[1].name: data[1],
        }
        wanted = [session.get_outputs()[0].name]
        separated = session.run(wanted, feed)[0]
    else:
        separated = session.run(data)[0]

    return separated
116
+
117
+
118
def recognize_one_audio(input_path):
    """Separate the voice from one audio file and write the result to disk.

    Reads `input_path` with soundfile, normalizes it to a float32
    (batch, samples) array at DESIRED_SR, runs the U-Net model, then
    low-pass filters and de-emphasizes the output before saving it.
    """
    # load audio
    logger.info('Loading wavfile...')
    wav, sr = sf.read(input_path)

    if wav.dtype != np.float32:
        wav = wav.astype(np.float32)

    # Normalize shape to (batch, samples).
    if wav.ndim == 2 :
        if args.stereo:
            wav = np.transpose(wav,(1,0)) # stereo to batch
        else:
            wav = (wav[:,0][np.newaxis,:] + wav[:,1][np.newaxis,:])/2 # convert to mono
    else:
        wav = wav[np.newaxis,:]

    # Print the clip duration for the user.
    calc_time(wav.shape[1], sr)

    # convert sample rate to DESIRED_SR if needed
    logger.info('Converting sample rate...')
    if not sr == DESIRED_SR :
        if args.ailia_audio:
            wav = ailia.audio.resample(wav,sr,DESIRED_SR)
        else:
            wav = signal.resample_poly(wav, DESIRED_SR, sr, axis=1)

    # apply pre-emphasis filter
    logger.info('Generating input feature...')
    wav = preemphasis(wav)

    # (log-magnitude, phase) STFT pair, frame count padded to a multiple of MULT
    input_feature = tfconvert(wav, WINDOW_LEN, HOP_LEN, MULT)

    # create inference session (ailia by default, onnxruntime with --onnx)
    if not args.onnx :
        logger.info('Use ailia')
        env_id = args.env_id
        logger.info(f'env_id: {env_id}')
        memory_mode = ailia.get_memory_mode(reuse_interstage=True)
        session = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=env_id, memory_mode=memory_mode)
    else :
        logger.info('Use onnxruntime')
        import onnxruntime
        session = onnxruntime.InferenceSession(WEIGHT_PATH)

    # inference
    logger.info('Start inference...')
    if args.benchmark:
        # Run 5 times and log wall-clock milliseconds per run.
        logger.info('BENCHMARK mode')
        for c in range(5) :
            start = int(round(time.time() * 1000))
            sep = src_sep(input_feature, session)
            end = int(round(time.time() * 1000))
            logger.info("\tprocessing time {} ms".format(end-start))
    else:
        sep = src_sep(input_feature, session)

    # postprocessing: optional low-pass, then undo the pre-emphasis
    logger.info('Start postprocessing...')
    if LPF_CUTOFF > 0 :
        sep = lowpass(sep, LPF_CUTOFF, DESIRED_SR)

    out_wav = inv_preemphasis(sep).clip(-1.,1.)
    out_wav = out_wav.swapaxes(0,1)  # (batch, samples) -> (samples, channels) for sf.write

    # save separated signal
    savepath = get_savepath(args.savepath, input_path)
    logger.info(f'saved at : {savepath}')

    sf.write(savepath, out_wav, DESIRED_SR)

    logger.info('Saved separated signal. ')
    logger.info('Script finished successfully.')
190
+
191
+
192
def main():
    """Entry point: ensure the model files are present, then process every input."""
    # model files check and download
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)

    for audio_path in args.input:
        recognize_one_audio(audio_path)

if __name__ == "__main__":
    main()
models/ailia-models/code/unet_source_separation_utils.py ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import numpy as np
3
+ from scipy import signal
4
+
5
+
6
+ # ======================
7
+ # Pre/Post process
8
+ # ======================
9
def preemphasis(data, coeff=0.97):
    """First-order pre-emphasis FIR filter: y[n] = x[n] - coeff * x[n-1]."""
    numer = [1, -coeff]
    denom = [1]
    emphasized = signal.lfilter(numer, denom, data)
    return emphasized.astype(np.float32)
12
+
13
+
14
def inv_preemphasis(data, coeff=0.97):
    """Inverse of preemphasis (IIR): y[n] = x[n] + coeff * y[n-1]."""
    restored = signal.lfilter([1], [1, -coeff], data)
    return restored.astype(np.float32)
17
+
18
+
19
def lowpass(data, stop_freq, sample_freq, N=4):
    """Zero-phase Butterworth low-pass filter of order `N`.

    `stop_freq` is the cutoff in Hz; `sample_freq` is the sample rate in Hz.
    Uses filtfilt, so the result has no phase delay.
    """
    normalized_cutoff = 2.0 * stop_freq / sample_freq
    numer, denom = signal.butter(N, normalized_cutoff, btype="low")
    return signal.filtfilt(numer, denom, data)
25
+
26
+
27
def tfconvert(x, window_len, hop_len, mult, window='hann'):
    """STFT the signal and return a (log-magnitude, phase) pair.

    Both outputs are float32 and zero-padded along the frame axis (axis 2)
    to a multiple of `mult` via zero_pad.
    """
    scale = window_len // 2 + 1
    _, _, spec = signal.stft(
        x, window=window, nperseg=window_len, noverlap=window_len - hop_len)

    real_part = np.real(spec) * scale
    imag_part = np.imag(spec) * scale

    magnitude = np.log(np.sqrt(real_part ** 2 + imag_part ** 2) + 1.0).astype(np.float32)
    phase = np.arctan2(imag_part, real_part).astype(np.float32)

    return zero_pad(magnitude, mult), zero_pad(phase, mult)
41
+
42
+
43
def zero_pad(x, mult):
    """Zero-pad axis 2 of `x` so its length becomes a multiple of `mult`."""
    remainder = x.shape[2] % mult
    if remainder == 0:
        return x
    return np.pad(x, ((0, 0), (0, 0), (0, mult - remainder)), mode='constant')
49
+
50
+
51
def calc_time(sample_len, sr):
    """Print the duration of `sample_len` samples at sample rate `sr`.

    Args:
        sample_len: number of audio samples.
        sr: sample rate in Hz.
    """
    whole_seconds = sample_len // sr
    fractional = (sample_len % sr) / sr
    minutes = whole_seconds // 60          # renamed: original shadowed builtin min()
    seconds = whole_seconds % 60 + fractional
    print('Time length : {}min {:.02f}sec'.format(minutes, seconds))
models/ailia-models/code/unet_source_separation_utils_ailia.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import numpy as np
3
+ import ailia.audio as ailia_audio
4
+
5
+
6
+ # ======================
7
+ # Pre/Post process
8
+ # ======================
9
def preemphasis(data, coeff=0.97):
    """First-order pre-emphasis: y[n] = x[n] - coeff * x[n-1] (ailia backend)."""
    # NOTE: `linerfilter` is the ailia.audio API spelling (sic).
    numer = np.array([1, -coeff])
    denom = np.array([1])
    return ailia_audio.linerfilter(numer, denom, data).astype(np.float32)
12
+
13
+
14
def inv_preemphasis(data, coeff=0.97):
    """Inverse of preemphasis: y[n] = x[n] + coeff * y[n-1] (ailia backend)."""
    # NOTE: `linerfilter` is the ailia.audio API spelling (sic).
    numer = np.array([1])
    denom = np.array([1, -coeff])
    return ailia_audio.linerfilter(numer, denom, data).astype(np.float32)
17
+
18
+
19
def lowpass(data, stop_freq, sample_freq, N=4):
    """Zero-phase low-pass filter using precomputed Butterworth coefficients.

    Only the two (cutoff, sample-rate) pairs used by this model are
    supported, because the filter coefficients are hard-coded. Both pairs
    share the same normalized cutoff (2*10000/22050 == 2*20000/44100), so a
    single coefficient set covers both.

    Args:
        data: input signal array.
        stop_freq: cutoff frequency in Hz (10000 or 20000).
        sample_freq: sample rate in Hz (22050 or 44100, respectively).
        N: unused; kept for signature compatibility with the scipy variant.

    Raises:
        ValueError: for any unsupported (stop_freq, sample_freq) pair.
    """
    if (stop_freq, sample_freq) not in {(10000, 22050), (20000, 44100)}:
        # Fixed typo in the original message: "freqency" -> "frequency".
        raise ValueError('illegal sample frequency.')
    # Presumably scipy.signal.butter(4, 2*stop_freq/sample_freq) output — verify.
    b = np.array([0.68166451, 2.72665802, 4.08998703, 2.72665802, 0.68166451])
    a = np.array([1. , 3.238043, 3.99120175, 2.21272074, 0.4646666 ])
    return ailia_audio.filterfilter(b, a, data)
28
+
29
+
30
def tfconvert(x, window_len, hop_len, mult, window='hann'):
    """Spectrogram via ailia.audio; returns a (log-magnitude, phase) pair.

    Both outputs are float32 and zero-padded along the frame axis (axis 2)
    to a multiple of `mult` via zero_pad.
    """
    spec = ailia_audio.spectrogram(
        x, fft_n=window_len, hop_n=hop_len, center_mode=2,
        norm_type="scipy", win_type=window)

    scale = window_len // 2 + 1
    real_part = np.real(spec) * scale
    imag_part = np.imag(spec) * scale

    magnitude = np.log(np.sqrt(real_part ** 2 + imag_part ** 2) + 1.0).astype(np.float32)
    phase = np.arctan2(imag_part, real_part).astype(np.float32)

    return zero_pad(magnitude, mult), zero_pad(phase, mult)
43
+
44
+
45
def zero_pad(x, mult):
    """Right-pad the last axis of `x` with zeros to the next multiple of `mult`."""
    remainder = x.shape[2] % mult
    if remainder:
        tail = np.zeros((x.shape[0], x.shape[1], mult - remainder), dtype=np.float32)
        x = np.concatenate((x, tail), axis=2)
    return x
51
+
52
+
53
def calc_time(sample_len, sr):
    """Print the duration of `sample_len` samples at sample rate `sr`.

    Args:
        sample_len: number of audio samples.
        sr: sample rate in Hz.
    """
    whole_seconds = sample_len // sr
    fractional = (sample_len % sr) / sr
    minutes = whole_seconds // 60          # renamed: original shadowed builtin min()
    seconds = whole_seconds % 60 + fractional
    print('Time length : {}min {:.02f}sec'.format(minutes, seconds))
models/ailia-models/second_voice_bank.best.opt.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1abf4fe6881666fb9466d20f1887f7035f95687a05c06ba0c23f9898e3424241
3
+ size 301236944
models/ailia-models/second_voice_bank.best.opt.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/source.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ https://github.com/axinc-ai/ailia-models/tree/master/audio_processing/unet_source_separation
2
+
3
+ https://storage.googleapis.com/ailia-models/unet_source_separation/second_voice_bank.best.opt.onnx
4
+ https://storage.googleapis.com/ailia-models/unet_source_separation/second_voice_bank.best.opt.onnx.prototxt
5
+
6
+ https://storage.googleapis.com/ailia-models/unet_source_separation/RefineSpectrogramUnet.best.opt.onnx
7
+ https://storage.googleapis.com/ailia-models/unet_source_separation/RefineSpectrogramUnet.best.opt.onnx.prototxt