U-Net (models_onnx)
Browse files- .gitattributes +2 -0
- models/ailia-models/RefineSpectrogramUnet.best.opt.onnx +3 -0
- models/ailia-models/RefineSpectrogramUnet.best.opt.onnx.prototxt +0 -0
- models/ailia-models/code/049 - Young Griffo - Facade.wav +3 -0
- models/ailia-models/code/LICENSE +25 -0
- models/ailia-models/code/README.md +71 -0
- models/ailia-models/code/doublenoble_k7rain_part.wav +3 -0
- models/ailia-models/code/requirements.txt +3 -0
- models/ailia-models/code/unet_source_separation.py +200 -0
- models/ailia-models/code/unet_source_separation_utils.py +56 -0
- models/ailia-models/code/unet_source_separation_utils_ailia.py +58 -0
- models/ailia-models/second_voice_bank.best.opt.onnx +3 -0
- models/ailia-models/second_voice_bank.best.opt.onnx.prototxt +0 -0
- models/ailia-models/source.txt +7 -0
.gitattributes
CHANGED
|
@@ -35,3 +35,5 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
|
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
Funnel[[:space:]]Deep[[:space:]]Complex[[:space:]]U-Net[[:space:]]for[[:space:]]Phase-Aware[[:space:]]Speech[[:space:]]Enhancement.pdf filter=lfs diff=lfs merge=lfs -text
|
| 37 |
Phase-aware[[:space:]]Speech[[:space:]]Enhancement[[:space:]]with[[:space:]]Deep[[:space:]]Complex[[:space:]]U-Net.pdf filter=lfs diff=lfs merge=lfs -text
|
|
|
|
|
|
|
|
|
| 35 |
*tfevents* filter=lfs diff=lfs merge=lfs -text
|
| 36 |
Funnel[[:space:]]Deep[[:space:]]Complex[[:space:]]U-Net[[:space:]]for[[:space:]]Phase-Aware[[:space:]]Speech[[:space:]]Enhancement.pdf filter=lfs diff=lfs merge=lfs -text
|
| 37 |
Phase-aware[[:space:]]Speech[[:space:]]Enhancement[[:space:]]with[[:space:]]Deep[[:space:]]Complex[[:space:]]U-Net.pdf filter=lfs diff=lfs merge=lfs -text
|
| 38 |
+
models/ailia-models/code/049[[:space:]]-[[:space:]]Young[[:space:]]Griffo[[:space:]]-[[:space:]]Facade.wav filter=lfs diff=lfs merge=lfs -text
|
| 39 |
+
models/ailia-models/code/doublenoble_k7rain_part.wav filter=lfs diff=lfs merge=lfs -text
|
models/ailia-models/RefineSpectrogramUnet.best.opt.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:6ab1e0af3c22250f626379ee4a367687a28547fe3aa186b6a614e1b9dee3b3da
|
| 3 |
+
size 381668080
|
models/ailia-models/RefineSpectrogramUnet.best.opt.onnx.prototxt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/ailia-models/code/049 - Young Griffo - Facade.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:d9701789b5d5d6dd89d82cbc146fc70edc127cbec5176be7816079cd06225c91
|
| 3 |
+
size 6867808
|
models/ailia-models/code/LICENSE
ADDED
|
@@ -0,0 +1,25 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
BSD 2-Clause License
|
| 2 |
+
|
| 3 |
+
Copyright (c) 2019, ILJI CHOI
|
| 4 |
+
All rights reserved.
|
| 5 |
+
|
| 6 |
+
Redistribution and use in source and binary forms, with or without
|
| 7 |
+
modification, are permitted provided that the following conditions are met:
|
| 8 |
+
|
| 9 |
+
1. Redistributions of source code must retain the above copyright notice, this
|
| 10 |
+
list of conditions and the following disclaimer.
|
| 11 |
+
|
| 12 |
+
2. Redistributions in binary form must reproduce the above copyright notice,
|
| 13 |
+
this list of conditions and the following disclaimer in the documentation
|
| 14 |
+
and/or other materials provided with the distribution.
|
| 15 |
+
|
| 16 |
+
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
|
| 17 |
+
AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
| 18 |
+
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
|
| 19 |
+
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE
|
| 20 |
+
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
| 21 |
+
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
|
| 22 |
+
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
|
| 23 |
+
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
|
| 24 |
+
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
|
| 25 |
+
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
|
models/ailia-models/code/README.md
ADDED
|
@@ -0,0 +1,71 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# source_separation
|
| 2 |
+
|
| 3 |
+
### input
|
| 4 |
+
|
| 5 |
+
- Noisy speech (audio file)
|
| 6 |
+
|
| 7 |
+
```
|
| 8 |
+
Audio from creative commons youtube videos
|
| 9 |
+
https://drive.google.com/drive/folders/19Sn6pe5-BtWXYa6OiLbYGH7iCU-mzB8j
|
| 10 |
+
doublenoble_k7rain_part.wav
|
| 11 |
+
(Original video : https://www.youtube.com/watch?v=vsjB1xTwZ20&t=536s)
|
| 12 |
+
```
|
| 13 |
+
|
| 14 |
+
- Music (audio file)
|
| 15 |
+
```
|
| 16 |
+
DSD100 dataset
|
| 17 |
+
https://sigsep.github.io/datasets/dsd100.html
|
| 18 |
+
049 - Young Griffo - Facade.wav
|
| 19 |
+
```
|
| 20 |
+
|
| 21 |
+
### output
|
| 22 |
+
|
| 23 |
+
Separated voice (audio file)
|
| 24 |
+
```
|
| 25 |
+
separated_voice.wav
|
| 26 |
+
```
|
| 27 |
+
|
| 28 |
+
### Usage
|
| 29 |
+
Automatically downloads the onnx and prototxt files on the first run. It is necessary to be connected to the Internet while downloading.
|
| 30 |
+
|
| 31 |
+
For the sample audio file,
|
| 32 |
+
```bash
|
| 33 |
+
$ python3 unet_source_separation.py
|
| 34 |
+
|
| 35 |
+
```
|
| 36 |
+
|
| 37 |
+
If you want to specify the input audio file, put the input path after the --input option.
|
| 38 |
+
You can use --savepath option to change the name of the output file to save.
|
| 39 |
+
```bash
|
| 40 |
+
$ python3 unet_source_separation.py --input WAV_PATH --savepath SAVE_WAV_PATH
|
| 41 |
+
```
|
| 42 |
+
|
| 43 |
+
You can select a pretrained model by specifying --arch base (default) or --arch large.
|
| 44 |
+
`base` is a model for general voice separation task, and `large` is a model for singing voice separation task.
|
| 45 |
+
```bash
|
| 46 |
+
$ python3 unet_source_separation.py --input WAV_PATH --savepath SAVE_WAV_PATH --arch base
|
| 47 |
+
```
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
### Reference
|
| 51 |
+
|
| 52 |
+
[source_separation](https://github.com/AppleHolic/source_separation)
|
| 53 |
+
|
| 54 |
+
[Singing Voice Separation Samples](https://www.youtube.com/playlist?list=PLQ4ukFz6Ieir5bZYOns08_2gMjt4hYP4I)
|
| 55 |
+
|
| 56 |
+
### Framework
|
| 57 |
+
|
| 58 |
+
PyTorch 1.6.0
|
| 59 |
+
|
| 60 |
+
### Model Format
|
| 61 |
+
|
| 62 |
+
ONNX opset = 11
|
| 63 |
+
|
| 64 |
+
### Netron
|
| 65 |
+
- General voice separation
|
| 66 |
+
|
| 67 |
+
[second_voice_bank.best.opt.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/unet_source_separation/second_voice_bank.best.opt.onnx.prototxt)
|
| 68 |
+
|
| 69 |
+
- Singing voice separation
|
| 70 |
+
|
| 71 |
+
[RefineSpectrogramUnet.best.opt.onnx.prototxt](https://netron.app/?url=https://storage.googleapis.com/ailia-models/unet_source_separation/RefineSpectrogramUnet.best.opt.onnx.prototxt)
|
models/ailia-models/code/doublenoble_k7rain_part.wav
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:ac658a1284aa28ea7c77f5126691ef02696fa1bfa41a0a5b41cd9906260bf8dd
|
| 3 |
+
size 11518066
|
models/ailia-models/code/requirements.txt
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
numpy==1.22.0
|
| 2 |
+
soundfile==0.10.3.post1
|
| 3 |
+
scipy==1.10.0
|
models/ailia-models/code/unet_source_separation.py
ADDED
|
@@ -0,0 +1,200 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import time
|
| 2 |
+
import sys
|
| 3 |
+
import argparse
|
| 4 |
+
|
| 5 |
+
import numpy as np
|
| 6 |
+
|
| 7 |
+
import ailia # noqa: E402
|
| 8 |
+
|
| 9 |
+
import soundfile as sf
|
| 10 |
+
|
| 11 |
+
# import original modules
|
| 12 |
+
sys.path.append('../../util')
|
| 13 |
+
from arg_utils import get_base_parser, update_parser, get_savepath # noqa: E402
|
| 14 |
+
from model_utils import check_and_download_models # noqa: E402
|
| 15 |
+
|
| 16 |
+
# logger
|
| 17 |
+
from logging import getLogger # noqa: E402
|
| 18 |
+
logger = getLogger(__name__)
|
| 19 |
+
|
| 20 |
+
|
# ======================
# Parameters 1
# ======================
WAV_PATH = 'doublenoble_k7rain_part.wav'  # noisy speech sample
# WAV_PATH = '049 - Young Griffo - Facade.wav'  # music sample (fixed: was misspelled WAVE_PATH)
SAVE_WAV_PATH = 'separated_voice.wav'
MODEL_LISTS = ['base', 'large']


# ======================
# Argument Parser Config
# ======================
parser = get_base_parser(
    'Source separation.',  # fixed typo: was 'RSource separation.'
    WAV_PATH,
    SAVE_WAV_PATH,
)
parser.add_argument(
    '-n', '--onnx',
    action='store_true',
    default=False,
    help='Use onnxruntime'
)
parser.add_argument(
    '-st', '--stereo',
    action='store_true',
    default=False,
    help='Use stereo mode'
)
parser.add_argument(
    '-a', '--arch',
    default='base', choices=MODEL_LISTS,
    help='model lists: ' + ' | '.join(MODEL_LISTS)
)
parser.add_argument(
    '--ailia_audio', action='store_true',
    help='use ailia audio library'
)
args = update_parser(parser)

# Select the DSP helper implementation: ailia.audio or scipy.
if args.ailia_audio:
    import ailia.audio as ailia_audio
    from unet_source_separation_utils_ailia import preemphasis, inv_preemphasis, lowpass, tfconvert, zero_pad, calc_time  # noqa: E402
else:
    from scipy import signal
    from unet_source_separation_utils import preemphasis, inv_preemphasis, lowpass, tfconvert, zero_pad, calc_time  # noqa: E402
| 67 |
+
|

# ======================
# Parameters 2
# ======================

if args.arch == 'base':  # for general voice separation
    # Fixed: was "second_voice_bank.best.opt2.onnx", which does not match the
    # model file shipped with this commit or the URLs listed in source.txt.
    WEIGHT_PATH = "second_voice_bank.best.opt.onnx"
else:  # for singing voice separation
    WEIGHT_PATH = "RefineSpectrogramUnet.best.opt.onnx"
MODEL_PATH = WEIGHT_PATH + ".prototxt"
REMOTE_PATH = "https://storage.googleapis.com/ailia-models/unet_source_separation/"

# fixed parameters for each model
if args.arch == 'base':
    DESIRED_SR = 22050   # sample rate the input is resampled to
    MULT = 2 ** 5        # frame count is zero-padded to a multiple of this
    WINDOW_LEN = 512     # STFT window length
    HOP_LEN = 64         # STFT hop length
else:
    DESIRED_SR = 44100
    MULT = 2 ** 6
    WINDOW_LEN = 1024
    HOP_LEN = 128

# adjustable parameters
if args.arch == 'base':
    LPF_CUTOFF = 10000   # post-filter cutoff in Hz (values <= 0 disable it)
else:
    LPF_CUTOFF = 20000
# ======================
# Main function
# ======================
def src_sep(data, session):
    """Run the separation model on the (log-magnitude, phase) feature pair.

    data: sequence holding the two network inputs.
    session: ailia.Net, or onnxruntime.InferenceSession when --onnx is set.
    Returns the first network output (the separated spectrogram).
    """
    if args.onnx:
        # onnxruntime requires explicit input/output tensor names.
        in0 = session.get_inputs()[0].name
        in1 = session.get_inputs()[1].name
        out0 = session.get_outputs()[0].name
        return session.run(
            [out0],
            {in0: data[0], in1: data[1]})[0]

    # ailia accepts the inputs positionally.
    return session.run(data)[0]
| 117 |
+
|
def recognize_one_audio(input_path):
    """Separate the voice from one audio file and write the result to disk.

    Loads `input_path`, resamples it to the selected model's rate, runs the
    U-Net separation model, then low-pass filters, undoes pre-emphasis, and
    saves the separated track.
    """
    # load audio
    logger.info('Loading wavfile...')
    audio, sr = sf.read(input_path)

    if audio.dtype != np.float32:
        audio = audio.astype(np.float32)

    # Normalize to shape (batch/channels, samples).
    if audio.ndim == 2:
        if args.stereo:
            audio = np.transpose(audio, (1, 0))  # channels become the batch axis
        else:
            # Downmix the first two channels to mono.
            audio = (audio[:, 0][np.newaxis, :] + audio[:, 1][np.newaxis, :]) / 2
    else:
        audio = audio[np.newaxis, :]

    calc_time(audio.shape[1], sr)

    # convert sample rate to what the selected model expects
    logger.info('Converting sample rate...')
    if sr != DESIRED_SR:
        if args.ailia_audio:
            audio = ailia.audio.resample(audio, sr, DESIRED_SR)
        else:
            audio = signal.resample_poly(audio, DESIRED_SR, sr, axis=1)

    # apply pre-emphasis filter, then build the (log-magnitude, phase) input
    logger.info('Generating input feature...')
    audio = preemphasis(audio)

    input_feature = tfconvert(audio, WINDOW_LEN, HOP_LEN, MULT)

    # create the inference session (ailia by default, onnxruntime with --onnx)
    if args.onnx:
        logger.info('Use onnxruntime')
        import onnxruntime
        session = onnxruntime.InferenceSession(WEIGHT_PATH)
    else:
        logger.info('Use ailia')
        env_id = args.env_id
        logger.info(f'env_id: {env_id}')
        memory_mode = ailia.get_memory_mode(reuse_interstage=True)
        session = ailia.Net(MODEL_PATH, WEIGHT_PATH, env_id=env_id, memory_mode=memory_mode)

    # inference
    logger.info('Start inference...')
    if args.benchmark:
        logger.info('BENCHMARK mode')
        for _ in range(5):
            t0 = int(round(time.time() * 1000))
            sep = src_sep(input_feature, session)
            t1 = int(round(time.time() * 1000))
            logger.info("\tprocessing time {} ms".format(t1 - t0))
    else:
        sep = src_sep(input_feature, session)

    # postprocessing: optional low-pass, undo pre-emphasis, clip to [-1, 1]
    logger.info('Start postprocessing...')
    if LPF_CUTOFF > 0:
        sep = lowpass(sep, LPF_CUTOFF, DESIRED_SR)

    out_wav = inv_preemphasis(sep).clip(-1., 1.)
    out_wav = out_wav.swapaxes(0, 1)  # back to (samples, channels) for soundfile

    # save separated signal
    savepath = get_savepath(args.savepath, input_path)
    logger.info(f'saved at : {savepath}')

    sf.write(savepath, out_wav, DESIRED_SR)

    logger.info('Saved separated signal. ')
    logger.info('Script finished successfully.')
def main():
    """Entry point: fetch model files if needed, then process every input file."""
    # model files check and download
    check_and_download_models(WEIGHT_PATH, MODEL_PATH, REMOTE_PATH)

    for audio_path in args.input:
        recognize_one_audio(audio_path)


if __name__ == "__main__":
    main()
models/ailia-models/code/unet_source_separation_utils.py
ADDED
|
@@ -0,0 +1,56 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import numpy as np
|
| 3 |
+
from scipy import signal
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# ======================
|
| 7 |
+
# Pre/Post process
|
| 8 |
+
# ======================
|
def preemphasis(data, coeff=0.97):
    """Pre-emphasis FIR filter: y[n] = x[n] - coeff * x[n-1].

    Boosts high frequencies before analysis; undone by inv_preemphasis.
    Returns float32.
    """
    emphasized = signal.lfilter([1, -coeff], [1], data)
    return emphasized.astype(np.float32)
def inv_preemphasis(data, coeff=0.97):
    """Inverse pre-emphasis IIR filter: y[n] = x[n] + coeff * y[n-1].

    Exact inverse of preemphasis for the same coeff. Returns float32.
    """
    restored = signal.lfilter([1], [1, -coeff], data)
    return restored.astype(np.float32)
def lowpass(data, stop_freq, sample_freq, N=4):
    """Zero-phase Butterworth low-pass filter.

    stop_freq: cutoff frequency in Hz; sample_freq: sampling rate in Hz;
    N: filter order. Uses filtfilt, so the result has no phase delay.
    """
    normalized_cutoff = 2.0 * stop_freq / sample_freq  # fraction of Nyquist
    b, a = signal.butter(N, normalized_cutoff, btype="low")
    return signal.filtfilt(b, a, data)
def tfconvert(x, window_len, hop_len, mult, window='hann'):
    """STFT front-end: return (log-magnitude, phase) feature pair.

    Both outputs are float32 and zero-padded along the frame axis to a
    multiple of `mult`.
    """
    overlap = window_len - hop_len
    _, _, spec = signal.stft(x, window=window, nperseg=window_len, noverlap=overlap)

    # Rescale real/imag parts by (window_len//2 + 1) — presumably undoing
    # scipy's STFT scaling to match the model's training convention; confirm
    # against the original source_separation repository.
    scale = window_len // 2 + 1
    re = np.real(spec) * scale
    im = np.imag(spec) * scale

    log_mag = np.log(np.sqrt(re ** 2 + im ** 2) + 1.0).astype(np.float32)
    phase = np.arctan2(im, re).astype(np.float32)

    return zero_pad(log_mag, mult), zero_pad(phase, mult)


def zero_pad(x, mult):
    """Zero-pad the last axis of (batch, freq, frames) x to a multiple of mult."""
    remainder = x.shape[2] % mult
    if remainder:
        pad_width = mult - remainder
        tail = np.zeros((x.shape[0], x.shape[1], pad_width), dtype=np.float32)
        x = np.concatenate((x, tail), axis=2)
    return x

def calc_time(sample_len, sr):
    """Print the duration of `sample_len` samples at rate `sr` as minutes/seconds.

    Fix: renamed the local variable `min`, which shadowed the builtin.
    """
    whole_sec = sample_len // sr           # whole seconds of audio
    frac_sec = (sample_len % sr) / sr      # fractional remainder in seconds
    minutes = whole_sec // 60
    seconds = whole_sec % 60 + frac_sec
    print('Time length : {}min {:.02f}sec'.format(minutes, seconds))
models/ailia-models/code/unet_source_separation_utils_ailia.py
ADDED
|
@@ -0,0 +1,58 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import sys
|
| 2 |
+
import numpy as np
|
| 3 |
+
import ailia.audio as ailia_audio
|
| 4 |
+
|
| 5 |
+
|
| 6 |
+
# ======================
|
| 7 |
+
# Pre/Post process
|
| 8 |
+
# ======================
|
def preemphasis(data, coeff=0.97):
    """Pre-emphasis FIR filter y[n] = x[n] - coeff * x[n-1] (ailia.audio backend)."""
    b = np.array([1, -coeff])
    a = np.array([1])
    return ailia_audio.linerfilter(b, a, data).astype(np.float32)

def inv_preemphasis(data, coeff=0.97):
    """Inverse pre-emphasis IIR filter y[n] = x[n] + coeff * y[n-1] (ailia.audio backend)."""
    b = np.array([1])
    a = np.array([1, -coeff])
    return ailia_audio.linerfilter(b, a, data).astype(np.float32)

def lowpass(data, stop_freq, sample_freq, N=4):
    """Zero-phase low-pass filter using precomputed coefficients (ailia.audio backend).

    Only the two (stop_freq, sample_freq) pairs this script actually uses are
    supported; the hard-coded b/a arrays are presumably the butter(4, ...)
    output of the scipy implementation precomputed for those pairs — confirm
    against unet_source_separation_utils.lowpass. `N` is accepted for
    signature compatibility but unused here.

    Raises:
        ValueError: for any unsupported frequency pair.
    """
    supported = (
        (stop_freq == 10000 and sample_freq == 22050)
        or (stop_freq == 20000 and sample_freq == 44100)
    )
    if not supported:
        # Fixed typo in the original message ('freqency').
        raise ValueError('illegal sample frequency.')

    b = np.array([0.68166451, 2.72665802, 4.08998703, 2.72665802, 0.68166451])
    a = np.array([1. , 3.238043, 3.99120175, 2.21272074, 0.4646666 ])
    return ailia_audio.filterfilter(b, a, data)
def tfconvert(x, window_len, hop_len, mult, window='hann'):
    """STFT front-end (ailia.audio backend): return (log-magnitude, phase).

    Both outputs are float32 and zero-padded along the frame axis to a
    multiple of `mult`.
    """
    spec = ailia_audio.spectrogram(
        x, fft_n=window_len, hop_n=hop_len,
        center_mode=2, norm_type="scipy", win_type=window)

    # Rescale real/imag parts by (window_len//2 + 1) — mirrors the scipy
    # implementation in unet_source_separation_utils.tfconvert.
    scale = window_len // 2 + 1
    re = np.real(spec) * scale
    im = np.imag(spec) * scale

    log_mag = np.log(np.sqrt(re ** 2 + im ** 2) + 1.0).astype(np.float32)
    phase = np.arctan2(im, re).astype(np.float32)

    return zero_pad(log_mag, mult), zero_pad(phase, mult)


def zero_pad(x, mult):
    """Zero-pad the last axis of (batch, freq, frames) x to a multiple of mult."""
    remainder = x.shape[2] % mult
    if remainder:
        pad_width = mult - remainder
        tail = np.zeros((x.shape[0], x.shape[1], pad_width), dtype=np.float32)
        x = np.concatenate((x, tail), axis=2)
    return x

def calc_time(sample_len, sr):
    """Print the duration of `sample_len` samples at rate `sr` as minutes/seconds.

    Fix: renamed the local variable `min`, which shadowed the builtin.
    """
    whole_sec = sample_len // sr           # whole seconds of audio
    frac_sec = (sample_len % sr) / sr      # fractional remainder in seconds
    minutes = whole_sec // 60
    seconds = whole_sec % 60 + frac_sec
    print('Time length : {}min {:.02f}sec'.format(minutes, seconds))
models/ailia-models/second_voice_bank.best.opt.onnx
ADDED
|
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
version https://git-lfs.github.com/spec/v1
|
| 2 |
+
oid sha256:1abf4fe6881666fb9466d20f1887f7035f95687a05c06ba0c23f9898e3424241
|
| 3 |
+
size 301236944
|
models/ailia-models/second_voice_bank.best.opt.onnx.prototxt
ADDED
|
The diff for this file is too large to render.
See raw diff
|
|
|
models/ailia-models/source.txt
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
https://github.com/axinc-ai/ailia-models/tree/master/audio_processing/unet_source_separation
|
| 2 |
+
|
| 3 |
+
https://storage.googleapis.com/ailia-models/unet_source_separation/second_voice_bank.best.opt.onnx
|
| 4 |
+
https://storage.googleapis.com/ailia-models/unet_source_separation/second_voice_bank.best.opt.onnx.prototxt
|
| 5 |
+
|
| 6 |
+
https://storage.googleapis.com/ailia-models/unet_source_separation/RefineSpectrogramUnet.best.opt.onnx
|
| 7 |
+
https://storage.googleapis.com/ailia-models/unet_source_separation/RefineSpectrogramUnet.best.opt.onnx.prototxt
|