Spaces:
Running
Running
Commit
·
9e538da
1
Parent(s):
83d6d79
tenth commit
Browse files- app.py +10 -6
- prepro/save_musdb_XL_train_numpy.py +148 -0
- prepro/save_musdb_XL_train_wave.py +145 -0
app.py
CHANGED
|
@@ -207,7 +207,7 @@ with gr.Blocks() as demo:
|
|
| 207 |
</div>
|
| 208 |
<p style="margin-bottom: 10px; font-size: 94%">
|
| 209 |
A demo for "Music De-limiter via Sample-wise Gain Inversion" to appear in WASPAA 2023.
|
| 210 |
-
You can first upload a music (.wav or .mp3) file and then press "De-limit" button to apply the De-limiter. Since we use a CPU instead of a GPU, it may require a few minute.
|
| 211 |
Then, you can apply a Parallel Mix technique, which is a simple linear mixing technique of "loudness normalized input" and the "de-limiter output".
|
| 212 |
You can modify the mixing coefficient by yourself.
|
| 213 |
If the coefficient is 0.3 then the output will be "loudness_normalized_input * 0.3 + de-limiter_output * 0.7"
|
|
@@ -221,11 +221,15 @@ with gr.Blocks() as demo:
|
|
| 221 |
btn = gr.Button("De-limit")
|
| 222 |
with gr.Column():
|
| 223 |
with gr.Box():
|
| 224 |
-
loud_norm_input = gr.Audio(
|
| 225 |
-
|
|
|
|
|
|
|
| 226 |
with gr.Box():
|
| 227 |
-
output_audio = gr.Audio(
|
| 228 |
-
|
|
|
|
|
|
|
| 229 |
with gr.Box():
|
| 230 |
output_audio_parallel = gr.Audio(
|
| 231 |
label="Parallel Mix of the Input and its De-limiter Output",
|
|
@@ -278,6 +282,6 @@ with gr.Blocks() as demo:
|
|
| 278 |
],
|
| 279 |
outputs=plot,
|
| 280 |
)
|
| 281 |
-
|
| 282 |
if __name__ == "__main__":
|
| 283 |
demo.launch(debug=True)
|
|
|
|
| 207 |
</div>
|
| 208 |
<p style="margin-bottom: 10px; font-size: 94%">
|
| 209 |
A demo for "Music De-limiter via Sample-wise Gain Inversion" to appear in WASPAA 2023.
|
| 210 |
+
You can first upload a music (.wav or .mp3, 44.1kHz) file and then press "De-limit" button to apply the De-limiter. Since we use a CPU instead of a GPU, it may require a few minute.
|
| 211 |
Then, you can apply a Parallel Mix technique, which is a simple linear mixing technique of "loudness normalized input" and the "de-limiter output".
|
| 212 |
You can modify the mixing coefficient by yourself.
|
| 213 |
If the coefficient is 0.3 then the output will be "loudness_normalized_input * 0.3 + de-limiter_output * 0.7"
|
|
|
|
| 221 |
btn = gr.Button("De-limit")
|
| 222 |
with gr.Column():
|
| 223 |
with gr.Box():
|
| 224 |
+
loud_norm_input = gr.Audio(
|
| 225 |
+
label="Loudness Normalized Input (-14LUFS)",
|
| 226 |
+
show_download_button=True,
|
| 227 |
+
)
|
| 228 |
with gr.Box():
|
| 229 |
+
output_audio = gr.Audio(
|
| 230 |
+
label="De-limiter Output",
|
| 231 |
+
show_download_button=True,
|
| 232 |
+
)
|
| 233 |
with gr.Box():
|
| 234 |
output_audio_parallel = gr.Audio(
|
| 235 |
label="Parallel Mix of the Input and its De-limiter Output",
|
|
|
|
| 282 |
],
|
| 283 |
outputs=plot,
|
| 284 |
)
|
| 285 |
+
|
| 286 |
if __name__ == "__main__":
|
| 287 |
demo.launch(debug=True)
|
prepro/save_musdb_XL_train_numpy.py
ADDED
|
@@ -0,0 +1,148 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import os
|
| 2 |
+
import glob
|
| 3 |
+
import argparse
|
| 4 |
+
import csv
|
| 5 |
+
|
| 6 |
+
import numpy as np
|
| 7 |
+
import librosa
|
| 8 |
+
import soundfile as sf
|
| 9 |
+
import tqdm
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
# Sample rate (Hz) every audio file is loaded at.
_SAMPLE_RATE = 44100

# Stem names, in the order their column groups appear in the random-segment CSVs.
_SOURCES = ("vocals", "bass", "drums", "other")


def _read_rows_without_header(path_csv):
    """Return all rows of the CSV at *path_csv* except the first (header) row.

    An empty file yields an empty list.
    """
    # newline="" lets the csv module do its own newline handling, as the
    # csv documentation recommends.
    with open(path_csv, "r", encoding="utf-8", newline="") as f:
        rdr = csv.reader(f)
        next(rdr, None)  # drop the header row, tolerate an empty file
        return list(rdr)


def _mix_fixed_song(root, song_name):
    """Load the four full-length stems of *song_name* and return their sum."""
    audio_sources = []
    for source in _SOURCES:
        audio, _ = librosa.load(
            f"{root}/train/{song_name}/{source}.wav", sr=_SAMPLE_RATE, mono=False
        )
        audio_sources.append(audio)
    return np.stack(audio_sources, axis=0).sum(0)


def _mix_random_segment(root, row):
    """Rebuild the 4-second mixture described by one random-segment CSV row.

    For stem index k the row holds, starting at column ``3 + 4 * k``:
    the song name, the start offset in seconds, the gain and the
    channel-swap flag (the literal text "true", case-insensitive, enables it).
    """
    audio_sources = []
    for k, source in enumerate(_SOURCES):
        base = 3 + k * 4
        audio, _ = librosa.load(
            f"{root}/train/{row[base]}/{source}.wav",
            sr=_SAMPLE_RATE,
            mono=False,
            offset=float(row[base + 1]),  # 'inst_start_sec'
            duration=4.0,
        )
        audio = audio * float(row[base + 2])  # 'inst_gain'
        if row[base + 3].lower() == "true":  # 'inst_channelswap'
            # assumes axis 0 is the channel axis (librosa mono=False layout)
            audio = np.flip(audio, axis=0)
        audio_sources.append(audio)
    return np.stack(audio_sources, axis=0).sum(0)


def _save_gain_ratio(mixture, path_limited_wav, path_npy):
    """Save the sample-wise ratio ``limited / mixture`` to *path_npy*.

    *mixture* is modified in place: exact zeros are replaced with the
    float32 machine epsilon before dividing.
    """
    ozone_mixture, _ = librosa.load(path_limited_wav, sr=_SAMPLE_RATE, mono=False)
    mixture[mixture == 0.0] = np.finfo(np.float32).eps  # to avoid 'divided by zero'
    ratio = ozone_mixture / mixture
    # NOTE(review): the cast to float16 turns ratios whose magnitude exceeds
    # the float16 range (about 65504) into inf; kept as in the original.
    np.save(path_npy, ratio.astype(np.float16))  # 16bit is enough...


def main():
    """Compute and save sample-wise gain ratios for musdb-XL-train.

    Reads the song lists from ``ozone_train_fixed.csv`` and every
    ``ozone_train_random_*.csv`` under ``--musdb_XL_train_root``, rebuilds
    each unprocessed mixture from the stems under ``--root``, divides the
    limiter-applied wave file by that mixture and stores the result as a
    float16 ``.npy`` file under ``--output``.
    """
    parser = argparse.ArgumentParser(
        description="Save sample-wise gain parameters for dataset distribution"
    )
    parser.add_argument(
        "--root",
        type=str,
        default="/path/to/musdb18hq",
        help="Root directory",
    )
    parser.add_argument(
        "--musdb_XL_train_root",
        type=str,
        default="/path/to/musdb-XL-train",
        help="Directory of musdb-XL-train dataset",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="/path/to/musdb-XL-train/np_ratio",
        help="Directory to save sample-wise gain ratio",
    )

    args = parser.parse_args()

    path_csv_fixed = f"{args.musdb_XL_train_root}/ozone_train_fixed.csv"
    list_path_csv_random = sorted(
        glob.glob(f"{args.musdb_XL_train_root}/ozone_train_random_*.csv")
    )

    # read ozone_train_fixed list (columns: song_name, max_threshold, max_character)
    os.makedirs(f"{args.output}/ozone_train_fixed", exist_ok=True)
    fixed_list = _read_rows_without_header(path_csv_fixed)

    # save numpy files of ozone_train_fixed,
    # which is the limiter-applied version of 100 songs from musdb-HQ train set;
    # each numpy file contains sample-wise gain ratio parameters
    for fixed_song in tqdm.tqdm(fixed_list):
        song_name = fixed_song[0]
        _save_gain_ratio(
            _mix_fixed_song(args.root, song_name),
            f"{args.musdb_XL_train_root}/ozone_train_fixed/{song_name}.wav",
            f"{args.output}/ozone_train_fixed/{song_name}.npy",
        )

    # read ozone_train_random list
    os.makedirs(f"{args.output}/ozone_train_random", exist_ok=True)
    random_list = []
    for path_csv_random in list_path_csv_random:
        random_list.extend(_read_rows_without_header(path_csv_random))

    # save numpy files of ozone_train_random,
    # which is the limiter-applied version of 4-sec 300,000 segments randomly
    # created from musdb-HQ train subset
    for random_song in tqdm.tqdm(random_list):
        segment_name = random_song[0]
        _save_gain_ratio(
            _mix_random_segment(args.root, random_song),
            f"{args.musdb_XL_train_root}/ozone_train_random/{segment_name}.wav",
            f"{args.output}/ozone_train_random/{segment_name}.npy",
        )


if __name__ == "__main__":
    main()
|
prepro/save_musdb_XL_train_wave.py
ADDED
|
@@ -0,0 +1,145 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# Save musdb-XL-train dataset from numpy
|
| 2 |
+
import os
|
| 3 |
+
import glob
|
| 4 |
+
import argparse
|
| 5 |
+
import csv
|
| 6 |
+
|
| 7 |
+
import numpy as np
|
| 8 |
+
import librosa
|
| 9 |
+
import soundfile as sf
|
| 10 |
+
import tqdm
|
| 11 |
+
|
| 12 |
+
|
| 13 |
+
# Sample rate (Hz) used for loading stems and writing the output waves.
_SAMPLE_RATE = 44100

# Stem names, in the order their column groups appear in the random-segment CSVs.
_SOURCES = ("vocals", "bass", "drums", "other")


def _read_rows_without_header(path_csv):
    """Return all rows of the CSV at *path_csv* except the first (header) row.

    An empty file yields an empty list.
    """
    # newline="" lets the csv module do its own newline handling, as the
    # csv documentation recommends.
    with open(path_csv, "r", encoding="utf-8", newline="") as f:
        rdr = csv.reader(f)
        next(rdr, None)  # drop the header row, tolerate an empty file
        return list(rdr)


def _mix_fixed_song(root, song_name):
    """Load the four full-length stems of *song_name* and return their sum."""
    audio_sources = []
    for source in _SOURCES:
        audio, _ = librosa.load(
            f"{root}/train/{song_name}/{source}.wav", sr=_SAMPLE_RATE, mono=False
        )
        audio_sources.append(audio)
    return np.stack(audio_sources, axis=0).sum(0)


def _mix_random_segment(root, row):
    """Rebuild the 4-second mixture described by one random-segment CSV row.

    For stem index k the row holds, starting at column ``3 + 4 * k``:
    the song name, the start offset in seconds, the gain and the
    channel-swap flag (the literal text "true", case-insensitive, enables it).
    """
    audio_sources = []
    for k, source in enumerate(_SOURCES):
        base = 3 + k * 4
        audio, _ = librosa.load(
            f"{root}/train/{row[base]}/{source}.wav",
            sr=_SAMPLE_RATE,
            mono=False,
            offset=float(row[base + 1]),  # 'inst_start_sec'
            duration=4.0,
        )
        audio = audio * float(row[base + 2])  # 'inst_gain'
        if row[base + 3].lower() == "true":  # 'inst_channelswap'
            # assumes axis 0 is the channel axis (librosa mono=False layout)
            audio = np.flip(audio, axis=0)
        audio_sources.append(audio)
    return np.stack(audio_sources, axis=0).sum(0)


def _write_limited_wave(mixture, path_npy, path_wav):
    """Multiply *mixture* by the stored gain ratio and write a 16-bit PCM wave."""
    ratio = np.load(path_npy)
    # NOTE(review): exact zeros in `mixture` are left untouched here, so those
    # samples come out as 0 (or NaN if the stored ratio is inf). The ratio
    # files were presumably computed against an eps-substituted mixture;
    # confirm this asymmetry is intended.
    output = mixture * ratio
    # Transposed so that frames come first, the layout soundfile expects;
    # assumes `output` is (channels, samples).
    sf.write(path_wav, output.T, _SAMPLE_RATE, subtype="PCM_16")


def main():
    """Rebuild the musdb-XL-train wave files from sample-wise gain ratios.

    Reads the song lists from ``ozone_train_fixed.csv`` and every
    ``ozone_train_random_*.csv`` under ``--musdb_XL_train_npy_root``, rebuilds
    each unprocessed mixture from the stems under ``--root``, multiplies it by
    the matching ratio array from ``np_ratio/`` and writes the result as a
    44.1 kHz PCM_16 wave file under ``--output``.
    """
    parser = argparse.ArgumentParser(
        description="Save musdb-XL-train wave files from the downloaded sample-wise gain parameters"
    )
    parser.add_argument(
        "--root",
        type=str,
        default="/path/to/musdb18hq",
        help="Root directory",
    )
    parser.add_argument(
        "--musdb_XL_train_npy_root",
        type=str,
        default="/path/to/musdb-XL-train",
        help="Directory of numpy arrays of musdb-XL-train's sample-wise ratio ",
    )
    parser.add_argument(
        "--output",
        type=str,
        default="/path/to/musdb-XL-train",
        help="Directory to save musdb-XL-train wave data",
    )

    args = parser.parse_args()

    path_csv_fixed = f"{args.musdb_XL_train_npy_root}/ozone_train_fixed.csv"
    list_path_csv_random = sorted(
        glob.glob(f"{args.musdb_XL_train_npy_root}/ozone_train_random_*.csv")
    )

    # read ozone_train_fixed list (columns: song_name, max_threshold, max_character)
    os.makedirs(f"{args.output}/ozone_train_fixed", exist_ok=True)
    fixed_list = _read_rows_without_header(path_csv_fixed)

    # save wave files of ozone_train_fixed,
    # which is the limiter-applied version of 100 songs from musdb-HQ train set
    for fixed_song in tqdm.tqdm(fixed_list):
        song_name = fixed_song[0]
        _write_limited_wave(
            _mix_fixed_song(args.root, song_name),
            f"{args.musdb_XL_train_npy_root}/np_ratio/ozone_train_fixed/{song_name}.npy",
            f"{args.output}/ozone_train_fixed/{song_name}.wav",
        )

    # read ozone_train_random list
    os.makedirs(f"{args.output}/ozone_train_random", exist_ok=True)
    random_list = []
    for path_csv_random in list_path_csv_random:
        random_list.extend(_read_rows_without_header(path_csv_random))

    # save wave files of ozone_train_random,
    # which is the limiter-applied version of 4-sec 300,000 segments randomly
    # created from musdb-HQ train subset
    for random_song in tqdm.tqdm(random_list):
        segment_name = random_song[0]
        _write_limited_wave(
            _mix_random_segment(args.root, random_song),
            f"{args.musdb_XL_train_npy_root}/np_ratio/ozone_train_random/{segment_name}.npy",
            f"{args.output}/ozone_train_random/{segment_name}.wav",
        )


if __name__ == "__main__":
    main()
|