niobures committed on
Commit
b508693
·
verified ·
1 Parent(s): 8b6325d

AudioSep (code, models, paper)

Browse files
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ models/ailia-models/code/input.wav filter=lfs diff=lfs merge=lfs -text
37
+ models/ailia-models/code/output_thunder.wav filter=lfs diff=lfs merge=lfs -text
38
+ models/ailia-models/code/output_waterdrops.wav filter=lfs diff=lfs merge=lfs -text
39
+ Separate[[:space:]]Anything[[:space:]]You[[:space:]]Describe.pdf filter=lfs diff=lfs merge=lfs -text
40
+ Separate[[:space:]]What[[:space:]]You[[:space:]]Describe.[[:space:]]Language-Queried[[:space:]]Audio[[:space:]]Source[[:space:]]Separation.pdf filter=lfs diff=lfs merge=lfs -text
Separate Anything You Describe.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9df16e42f8b2d68e6b14b15979189334a8fc05e63b52a1cba6b8da893dd6711a
3
+ size 13520281
Separate What You Describe. Language-Queried Audio Source Separation.pdf ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:298516c10117a2e9f0657219daf66ad1cb6892ff57965fb7605aa61aedcfc883
3
+ size 7601966
code/AudioSep.zip ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0680f2680e3ee93cc1451e24eb26e6a04fc17b5f75404248e0c7399dc9486350
3
+ size 20054078
models/AudioSep (audo)/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/AudioSep (audo)/audiosep_base_4M_steps.ckpt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f8cda01bfd0ebd141eef45d41db7a3ada23a56568465840d3cff04b8010ce82c
3
+ size 1264844076
models/AudioSep (audo)/audioset_textmap.npy ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bada103070d92f9eadd33e1b4f45ec8583f59080ef218c966b43294bd4c86d5b
3
+ size 84448
models/AudioSep (audo)/bpe_simple_vocab_16e6.txt.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:924691ac288e54409236115652ad4aa250f48203de50a9e4722a6ecd48d6804a
3
+ size 1356917
models/AudioSep (audo)/music_speech_audioset_epoch_15_esc_89.98.pt ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51c68f12f9d7ea25fdaaccf741ec7f81e93ee594455410f3bca4f47f88d8e006
3
+ size 2352471003
models/AudioSep (audo)/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/audo/AudioSep
models/ailia-models/audiosep_resunet.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:030b1ff2af237e753ed5441c3a11358defaf2a706d7e037af681e4282b0ec179
3
+ size 102583411
models/ailia-models/audiosep_resunet.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/audiosep_text.onnx ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f3cb68423900435937ff5c139f1c867ef8d1b0c05e5023c64f3446a24c08d3de
3
+ size 501433356
models/ailia-models/audiosep_text.onnx.prototxt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/code/LICENSE.txt ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ MIT License
2
+
3
+ Copyright (c) Xubo Liu
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
models/ailia-models/code/README.md ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # AudioSep: Separate Anything You Describe
2
+
3
+ ## Input
4
+
5
+ * **Mixed audio file**
6
+
7
+ Audio file in wav format with mixed sources. [input.wav](./input.wav)
8
+
9
+ https://github.com/axinc-ai/ailia-models/assets/53651931/4b761212-a1c7-46dc-b598-a08e4c5ab7ff
10
+
11
+ This audio file was adapted from the [official audiosep implementation](https://github.com/Audio-AGI/AudioSep)
12
+
13
+ https://audio-agi.github.io/Separate-Anything-You-Describe/demos/exp31_water/drops_mixture.wav
14
+
15
+ * **Text condition**
16
+
17
+ Text description of the sound source you want to separate.
18
+
19
+ ## Output
20
+
21
+ * **Audio file**
22
+
23
+ Separated audio source according to the text query.
24
+
25
+ Saves to ```./output.wav``` by default but it can be specified with the ```-s``` (```--savepath```) option
26
+
27
+ ## Usage
28
+ Internet connection is required when running the script for the first time, as the model files will be automatically downloaded.
29
+
30
+ Running this script will separate sound sources from the original input audio file, according to the language query.
31
+
32
+ #### Example1: Extract sound of thunder
33
+ ```bash
34
+ $ python3 audiosep.py -p "thunder" -i input.wav -s output_thunder.wav
35
+ ```
36
+ https://github.com/axinc-ai/ailia-models/assets/53651931/d0d016dd-a808-4eb6-a4b5-9791f8f1bd2f
37
+
38
+ #### Example2: Extract sound of waterdrops
39
+ ```bash
40
+ $ python3 audiosep.py -p "water drops" -i input.wav -s output_waterdrops.wav
41
+ ```
42
+ https://github.com/axinc-ai/ailia-models/assets/53651931/7710b6c9-49dc-4d2a-8489-ccbf7fb45591
43
+
44
+ ```.wav``` file containing the sound source separated from the original mixture will be created in both cases.
45
+
46
+ ## Reference
47
+
48
+ * [AudioSep](https://github.com/Audio-AGI/AudioSep)
49
+ * [Separate Anything You Describe](https://audio-agi.github.io/Separate-Anything-You-Describe/)
50
+
51
+ ## Framework
52
+
53
+ PyTorch
54
+
55
+ ## Model Format
56
+
57
+ ONNX opset=11
58
+
59
+ ## Netron
60
+
61
+ * [audiosep_text.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/audiosep/audiosep_text.onnx.prototxt)
62
+ * [audiosep_resunet.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/audiosep/audiosep_resunet.onnx.prototxt)
63
+
64
+
65
+
models/ailia-models/code/audiosep.py ADDED
@@ -0,0 +1,283 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import sys
2
+ import time
3
+ import math
4
+ from logging import getLogger
5
+
6
+ import scipy
7
+ import librosa
8
+ import numpy as np
9
+
10
+ import ailia
11
+
12
+ # import original modules
13
+ sys.path.append('../../util')
14
+ from arg_utils import get_base_parser, update_parser, get_savepath # noqa
15
+ from model_utils import check_and_download_models # noqa
16
+
17
+ logger = getLogger(__name__)
18
+
19
+ # ======================
20
+ # Parameters
21
+ # ======================
22
+
23
+ QUERY_WEIGHT_PATH = 'audiosep_text.onnx'
24
+ SEPNET_WEIGHT_PATH = 'audiosep_resunet.onnx'
25
+
26
+ QUERY_MODEL_PATH = 'audiosep_text.onnx.prototxt'
27
+ SEPNET_MODEL_PATH = 'audiosep_resunet.onnx.prototxt'
28
+
29
+ REMOTE_PATH = "https://storage.googleapis.com/ailia-models/audiosep/"
30
+
31
+ # ======================
32
+ # Arguemnt Parser Config
33
+ # ======================
34
+
35
+ parser = get_base_parser(
36
+ 'audiosep', "input.wav", None
37
+ )
38
+
39
+ parser.add_argument(
40
+ "-p", "--prompt", metavar="TEXT", type=str,
41
+ default="water drops",
42
+ help="Text query."
43
+ )
44
+
45
+ parser.add_argument(
46
+ '--disable_ailia_tokenizer',
47
+ action='store_true',
48
+ help='disable ailia tokenizer.'
49
+ )
50
+
51
+ args = update_parser(parser, check_input_type=False)
52
+
53
+ # ======================
54
+ # Helper functions
55
+ # ======================
56
+
57
+ """
58
+ Functions below are taken from https://github.com/Audio-AGI/AudioSep, which was released under MIT license.
59
+ Modified to be run with numpy arrays instead of torch tensors
60
+ """
61
+ def preprocess_mag(mag):
62
+ #batch normalize self.bn0 = nn.BatchNorm2d(window_size // 2 + 1, momentum=momentum)
63
+ mag = np.transpose(mag, (0,3,2,1))
64
+ mag = (mag - np.mean(mag, axis=(2,3), keepdims=True)) / (np.std(mag, axis=(2,3), keepdims=True) + 1e-5)
65
+ mag = np.transpose(mag, (0,3,2,1))
66
+ p = math.ceil(mag.shape[2] / 2**5) * 2**5 - mag.shape[2]
67
+ mag = np.pad(mag, ((0,0),(0,0),(0,p),(0,0)))
68
+ mag = mag[:,:,:,0:mag.shape[-1]-1]
69
+ return mag
70
+
71
+ def spectrogram_phase(input, eps=0.):
72
+ D = librosa.stft(
73
+ input,
74
+ n_fft=2048,
75
+ hop_length=320,
76
+ win_length=2048,
77
+ window='hann',
78
+ center=True,
79
+ pad_mode='reflect'
80
+ )
81
+ real = np.real(D)
82
+ imag = np.imag(D)
83
+ mag = np.clip(real ** 2 + imag ** 2, eps, np.inf) ** 0.5
84
+ cos = real / mag# normalize
85
+ sin = imag / mag# normalize
86
+ return mag, cos, sin
87
+
88
+ def wav_to_spectrogram(input, eps=1e-10):
89
+ """Waveform to spectrogram.
90
+
91
+ Args:
92
+ input: (batch_size, segment_samples, channels_num)
93
+
94
+ Outputs:
95
+ output: (batch_size, channels_num, time_steps, freq_bins)
96
+ """
97
+ sp_list = []
98
+ cos_list = []
99
+ sin_list = []
100
+ channels_num = input.shape[1]
101
+ for channel in range(channels_num):
102
+ mag, cos, sin = spectrogram_phase(input[:, channel, :], eps=eps)
103
+ sp_list.append(mag)
104
+ cos_list.append(cos)
105
+ sin_list.append(sin)
106
+
107
+ sps = np.concatenate(sp_list, axis=1)
108
+ coss = np.concatenate(cos_list, axis=1)
109
+ sins = np.concatenate(sin_list, axis=1)
110
+ return sps, coss, sins
111
+
112
+ def sigmoid(x):
113
+ return 1 / (1 + np.exp(-x))
114
+
115
+ def feature_maps_to_wav(
116
+ input_tensor,
117
+ sp,
118
+ sin_in,
119
+ cos_in,
120
+ audio_length,
121
+ ):
122
+ batch_size, _, time_steps, freq_bins = input_tensor.shape
123
+
124
+ x = input_tensor.reshape(
125
+ batch_size,
126
+ 1,
127
+ 1,
128
+ 3,
129
+ time_steps,
130
+ freq_bins,
131
+ )
132
+ # x: (batch_size, target_sources_num, output_channels, self.K, time_steps, freq_bins)
133
+
134
+ mask_mag = sigmoid(x[:, :, :, 0, :, :])
135
+ _mask_real = np.tanh(x[:, :, :, 1, :, :])
136
+ _mask_imag = np.tanh(x[:, :, :, 2, :, :])
137
+ # linear_mag = torch.tanh(x[:, :, :, 3, :, :])
138
+ _, phase = librosa.magphase(_mask_real + 1j*_mask_imag)
139
+ #norm = (np.real(phase)**2 + np.imag(phase)**2)**0.5
140
+ mask_cos = np.real(phase)
141
+ mask_sin = np.imag(phase)
142
+
143
+ # Y = |Y|cos∠Y + j|Y|sin∠Y
144
+ # = |Y|cos(∠X + ∠M) + j|Y|sin(∠X + ∠M)
145
+ # = |Y|(cos∠X cos∠M - sin∠X sin∠M) + j|Y|(sin∠X cos∠M + cos∠X sin∠M)
146
+ out_cos = (
147
+ cos_in[:, None, :, :, :] * mask_cos - sin_in[:, None, :, :, :] * mask_sin
148
+ )
149
+ out_sin = (
150
+ sin_in[:, None, :, :, :] * mask_cos + cos_in[:, None, :, :, :] * mask_sin
151
+ )
152
+ # out_cos: (batch_size, target_sources_num, output_channels, time_steps, freq_bins)
153
+ # out_sin: (batch_size, target_sources_num, output_channels, time_steps, freq_bins)
154
+
155
+ # Calculate |Y|.
156
+ out_mag = np.max(sp[:, None, :, :, :] * mask_mag, 0)
157
+ # out_mag = F.relu_(sp[:, None, :, :, :] * mask_mag + linear_mag)
158
+ # out_mag: (batch_size, target_sources_num, output_channels, time_steps, freq_bins)
159
+
160
+ # Calculate Y_{real} and Y_{imag} for ISTFT.
161
+ out_real = out_mag * out_cos
162
+ out_imag = out_mag * out_sin
163
+ # out_real, out_imag: (batch_size, target_sources_num, output_channels, time_steps, freq_bins)
164
+
165
+ # Reformat shape to (N, 1, time_steps, freq_bins) for ISTFT where
166
+ # N = batch_size * target_sources_num * output_channels
167
+ shape = (
168
+ batch_size,
169
+ 1,
170
+ time_steps,
171
+ freq_bins,
172
+ )
173
+ out_real = out_real.reshape(shape)
174
+ out_imag = out_imag.reshape(shape)
175
+
176
+ x = librosa.istft(
177
+ (out_real + 1j * out_imag)[0,0].astype('complex64').transpose((1,0)),
178
+ n_fft = 2048,
179
+ hop_length = 320,
180
+ win_length = 2048,
181
+ window = 'hann',
182
+ center = True,
183
+ length = audio_length,
184
+ )
185
+
186
+ return x
187
+
188
+ # ======================
189
+ # Main functions
190
+ # ======================
191
+
192
+ def inference(model, input_text, input_wav):
193
+ # tokenize
194
+ tokenizer = model['tokenizer']
195
+ text_prompt_tkn = dict(tokenizer(input_text, return_tensors = 'np', padding = True))
196
+ text_prompt_tkn = (text_prompt_tkn['input_ids'], text_prompt_tkn['attention_mask'])
197
+
198
+ # prepare audio input
199
+ mag, cosin, sinin = wav_to_spectrogram(input_wav)
200
+ orig_len = mag.shape[-1]
201
+ mag = mag.transpose((0,2,1))[None]
202
+ cosin = cosin.transpose((0,2,1))[None]
203
+ sinin = sinin.transpose((0,2,1))[None]
204
+
205
+ # preprocess
206
+ mag_in = preprocess_mag(mag)
207
+
208
+ # inference
209
+ query = model['querynet'].predict(text_prompt_tkn)[0]
210
+
211
+ output = model['sepnet'].predict((query, mag_in))[0]
212
+
213
+ # postprocess
214
+ output = output[:,:,:orig_len,:]# trim to original length
215
+
216
+ output_wav = feature_maps_to_wav(output, mag, sinin, cosin, input_wav.shape[-1])
217
+
218
+ return output_wav
219
+
220
+ def split_audio(model):
221
+ input_text = args.prompt
222
+ input_wav = librosa.load(args.input[0], sr=32000, mono=True)[0][None,None,:]
223
+
224
+ logger.info("input_text: %s" % input_text)
225
+
226
+ # inference
227
+ logger.info('inference has started...')
228
+ if args.benchmark:
229
+ logger.info('BENCHMARK mode')
230
+ total_time_estimation = 0
231
+ for i in range(args.benchmark_count):
232
+ start = int(round(time.time() * 1000))
233
+ output = inference(model, input_text, input_wav)
234
+ end = int(round(time.time() * 1000))
235
+ estimation_time = (end - start)
236
+
237
+ # Logging
238
+ logger.info(f'\tailia processing estimation time {estimation_time} ms')
239
+ if i != 0:
240
+ total_time_estimation = total_time_estimation + estimation_time
241
+
242
+ logger.info(f'\taverage time estimation {total_time_estimation / (args.benchmark_count - 1)} ms')
243
+ else:
244
+ output = inference(model, input_text, input_wav)
245
+
246
+ # save output
247
+ if args.savepath is None:
248
+ sp = 'output.wav'
249
+ else:
250
+ sp = args.savepath
251
+ scipy.io.wavfile.write(sp, 32000, np.round(output * 32767).astype(np.int16))
252
+
253
+ logger.info(f"Separated audio has been saved to {sp}")
254
+
255
+ logger.info('Script finished successfully.')
256
+
257
+
258
+ def main():
259
+ # model files check and download
260
+ check_and_download_models(QUERY_WEIGHT_PATH, QUERY_MODEL_PATH, REMOTE_PATH)
261
+ check_and_download_models(SEPNET_WEIGHT_PATH, SEPNET_MODEL_PATH, REMOTE_PATH)
262
+
263
+ env_id = args.env_id
264
+
265
+ # initialize
266
+ querynet = ailia.Net(None, QUERY_WEIGHT_PATH, env_id=env_id)
267
+ sepnet = ailia.Net(None, SEPNET_WEIGHT_PATH)
268
+ if args.disable_ailia_tokenizer:
269
+ from transformers import RobertaTokenizer
270
+ tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
271
+ else:
272
+ import ailia_tokenizer
273
+ tokenizer = ailia_tokenizer.RobertaTokenizer.from_pretrained('./tokenizer/')
274
+ model = {
275
+ 'querynet': querynet,
276
+ 'sepnet':sepnet,
277
+ 'tokenizer':tokenizer
278
+ }
279
+
280
+ split_audio(model)
281
+
282
+ if __name__ == '__main__':
283
+ main()
models/ailia-models/code/input.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:228089ef063480d966c1003b9c4fd363c972d92e28e8ae9f229b3ac5a293c25a
3
+ size 320044
models/ailia-models/code/output_thunder.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a9588903b7d37571be22cdd2aa7ebe92ca101a9336a34e43dce7505f6cd133ef
3
+ size 320044
models/ailia-models/code/output_waterdrops.wav ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cf50edbdce182145010a6fbe88f61231d468e2d71375a2ec39499159eb852c3b
3
+ size 320044
models/ailia-models/code/tokenizer/merges.txt ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/code/tokenizer/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/code/tokenizer/tokenizer_config.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"model_max_length": 512}
models/ailia-models/code/tokenizer/vocab.json ADDED
The diff for this file is too large to render. See raw diff
 
models/ailia-models/source.txt ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ https://github.com/axinc-ai/ailia-models/tree/master/audio_processing/audiosep
2
+
3
+ https://storage.googleapis.com/ailia-models/audiosep/audiosep_text.onnx
4
+ https://storage.googleapis.com/ailia-models/audiosep/audiosep_text.onnx.prototxt
5
+
6
+ https://storage.googleapis.com/ailia-models/audiosep/audiosep_resunet.onnx
7
+ https://storage.googleapis.com/ailia-models/audiosep/audiosep_resunet.onnx.prototxt
models/audiosep-demo/.gitattributes ADDED
@@ -0,0 +1,35 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
models/audiosep-demo/README.md ADDED
@@ -0,0 +1,9 @@
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+ ## AudioSep model
5
+
6
+ This model was proposed in ["Separate Anything You Describe"](https://audio-agi.github.io/Separate-Anything-You-Describe/).
7
+
8
+
9
+ ![image/png](https://cdn-uploads.huggingface.co/production/uploads/5f1158120c833276f61f1a84/X1Src5PFpfPvcEpj9yVih.png)
models/audiosep-demo/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:37f1691fb067e2575f1ad1cfbfe44b7b3da18e52f33fcb2b0937b72952f11ba1
3
+ size 957134817
models/audiosep-demo/source.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ https://huggingface.co/nielsr/audiosep-demo