Spaces:

jeonchangbin49
/

De-limiter

Running

App Files Files Community

jeonchangbin49 committed on Aug 2, 2023

Commit

9e538da

1 Parent(s): 83d6d79

tenth commit

Browse files

Files changed (3) hide show

app.py +10 -6
prepro/save_musdb_XL_train_numpy.py +148 -0
prepro/save_musdb_XL_train_wave.py +145 -0

app.py CHANGED Viewed

@@ -207,7 +207,7 @@ with gr.Blocks() as demo:
               </div>
               <p style="margin-bottom: 10px; font-size: 94%">
                 A demo for "Music De-limiter via Sample-wise Gain Inversion" to appear in WASPAA 2023.
-                You can first upload a music (.wav or .mp3) file and then press "De-limit" button to apply the De-limiter. Since we use a CPU instead of a GPU, it may require a few minute.
                 Then, you can apply a Parallel Mix technique, which is a simple linear mixing technique of "loudness normalized input" and the "de-limiter output".
                 You can modify the mixing coefficient by yourself.
                 If the coefficient is 0.3 then the output will be "loudness_normalized_input * 0.3 + de-limiter_output * 0.7"
@@ -221,11 +221,15 @@ with gr.Blocks() as demo:
                 btn = gr.Button("De-limit")
         with gr.Column():
             with gr.Box():
-                loud_norm_input = gr.Audio(label="Loudness Normalized Input (-14LUFS)",
-                                           show_download_button=True)
             with gr.Box():
-                output_audio = gr.Audio(label="De-limiter Output",
-                                        show_download_button=True,)
             with gr.Box():
                 output_audio_parallel = gr.Audio(
                     label="Parallel Mix of the Input and its De-limiter Output",
@@ -278,6 +282,6 @@ with gr.Blocks() as demo:
                     ],
                     outputs=plot,
                 )
 if __name__ == "__main__":
     demo.launch(debug=True)

               </div>
               <p style="margin-bottom: 10px; font-size: 94%">
                 A demo for "Music De-limiter via Sample-wise Gain Inversion" to appear in WASPAA 2023.
+                You can first upload a music (.wav or .mp3, 44.1kHz) file and then press "De-limit" button to apply the De-limiter. Since we use a CPU instead of a GPU, it may require a few minute.
                 Then, you can apply a Parallel Mix technique, which is a simple linear mixing technique of "loudness normalized input" and the "de-limiter output".
                 You can modify the mixing coefficient by yourself.
                 If the coefficient is 0.3 then the output will be "loudness_normalized_input * 0.3 + de-limiter_output * 0.7"
                 btn = gr.Button("De-limit")
         with gr.Column():
             with gr.Box():
+                loud_norm_input = gr.Audio(
+                    label="Loudness Normalized Input (-14LUFS)",
+                    show_download_button=True,
+                )
             with gr.Box():
+                output_audio = gr.Audio(
+                    label="De-limiter Output",
+                    show_download_button=True,
+                )
             with gr.Box():
                 output_audio_parallel = gr.Audio(
                     label="Parallel Mix of the Input and its De-limiter Output",
                     ],
                     outputs=plot,
                 )
 if __name__ == "__main__":
     demo.launch(debug=True)

prepro/save_musdb_XL_train_numpy.py ADDED Viewed

	@@ -0,0 +1,148 @@

+import os
+import glob
+import argparse
+import csv
+import numpy as np
+import librosa
+import soundfile as sf
+import tqdm
+def main():
+    parser = argparse.ArgumentParser(
+        description="Save sample-wise gain parameters for dataset distribution"
+    )
+    parser.add_argument(
+        "--root",
+        type=str,
+        default="/path/to/musdb18hq",
+        help="Root directory",
+    )
+    parser.add_argument(
+        "--musdb_XL_train_root",
+        type=str,
+        default="/path/to/musdb-XL-train",
+        help="Directory of musdb-XL-train dataset",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="/path/to/musdb-XL-train/np_ratio",
+        help="Directory to save sample-wise gain ratio",
+    )
+    args = parser.parse_args()
+    sources = ["vocals", "bass", "drums", "other"]
+    path_csv_fixed = f"{args.musdb_XL_train_root}/ozone_train_fixed.csv"
+    list_path_csv_random = sorted(
+        glob.glob(f"{args.musdb_XL_train_root}/ozone_train_random_*.csv")
+    )
+    # read ozone_train_fixed list
+    fixed_list = []
+    os.makedirs(f"{args.output}/ozone_train_fixed", exist_ok=True)
+    with open(path_csv_fixed, "r", encoding="utf-8") as f:
+        rdr = csv.reader(f)
+        for k, line in enumerate(rdr):
+            if k == 0:  # song_name, max_threshold, max_character
+                pass
+            else:
+                fixed_list.append(line)
+    # save numpy files of ozone_train_fixed
+    # which is the limiter-applied version of 100 songs from musdb-HQ train set
+    # each numpy file contain sample-wise gain ratio parameters
+    for fixed_song in tqdm.tqdm(fixed_list):
+        audio_sources = []
+        for source in sources:
+            audio, sr = librosa.load(
+                f"{args.root}/train/{fixed_song[0]}/{source}.wav", sr=44100, mono=False
+            )
+            audio_sources.append(audio)
+        stems = np.stack(audio_sources, axis=0)
+        mixture = stems.sum(0)
+        ozone_mixture, sr = librosa.load(
+            f"{args.musdb_XL_train_root}/ozone_train_fixed/{fixed_song[0]}.wav",
+            sr=44100,
+            mono=False,
+        )
+        mixture[mixture == 0.0] = np.finfo(np.float32).eps  # to avoid 'divided by zero'
+        ratio = ozone_mixture / mixture
+        np.save(
+            f"{args.output}/ozone_train_fixed/{fixed_song[0]}.npy",
+            ratio.astype(np.float16),  # 16bit is enough...
+        )
+    # read ozone_train_random list
+    random_list = []
+    os.makedirs(f"{args.output}/ozone_train_random", exist_ok=True)
+    for path_csv_random in list_path_csv_random:
+        with open(path_csv_random, "r", encoding="utf-8") as f:
+            rdr = csv.reader(f)
+            for k, line in enumerate(rdr):
+                if k == 0:
+                    # ['song_name',
+                    #  'max_threshold',
+                    #  'max_character',
+                    #  'vocals_name',
+                    #  'vocals_start_sec',
+                    #  'vocals_gain',
+                    #  'vocals_channelswap',
+                    #  'bass_name',
+                    #  'bass_start_sec',
+                    #  'bass_gain',
+                    #  'bass_channelswap',
+                    #  'drums_name',
+                    #  'drums_start_sec',
+                    #  'drums_gain',
+                    #  'drums_channelswap',
+                    #  'other_name',
+                    #  'other_start_sec',
+                    #  'other_gain',
+                    #  'other_channelswap']
+                    pass
+                else:
+                    random_list.append(line)
+    # save wave files of ozone_train_random,
+    # which is the limiter-applied version of 4-sec 300,000 segments randomly created from musdb-HQ train subset
+    for random_song in tqdm.tqdm(random_list):
+        audio_sources = []
+        for k, source in enumerate(sources):
+            audio, sr = librosa.load(
+                f"{args.root}/train/{random_song[3 + k * 4]}/{source}.wav",
+                sr=44100,
+                mono=False,
+                offset=float(random_song[4 + k * 4]),  # 'inst_start_sec'
+                duration=4.0,
+            )
+            audio = audio * float(random_song[5 + k * 4])  # 'inst_gain'
+            if random_song[6 + k * 4].lower() == "true":  # 'inst_channelswap'
+                audio = np.flip(audio, axis=0)
+            audio_sources.append(audio)
+        stems = np.stack(audio_sources, axis=0)
+        mixture = stems.sum(0)
+        ozone_mixture, sr = librosa.load(
+            f"{args.musdb_XL_train_root}/ozone_train_random/{random_song[0]}.wav",
+            sr=44100,
+            mono=False,
+        )
+        mixture[mixture == 0.0] = np.finfo(np.float32).eps  # to avoid 'divided by zero'
+        ratio = ozone_mixture / mixture
+        np.save(
+            f"{args.output}/ozone_train_random/{random_song[0]}.npy",
+            ratio.astype(np.float16),  # 16bit is enough...
+        )
+if __name__ == "__main__":
+    main()

prepro/save_musdb_XL_train_wave.py ADDED Viewed

	@@ -0,0 +1,145 @@

+# Save musdb-XL-train dataset from numpy
+import os
+import glob
+import argparse
+import csv
+import numpy as np
+import librosa
+import soundfile as sf
+import tqdm
+def main():
+    parser = argparse.ArgumentParser(
+        description="Save musdb-XL-train wave files from the downloaded sample-wise gain parameters"
+    )
+    parser.add_argument(
+        "--root",
+        type=str,
+        default="/path/to/musdb18hq",
+        help="Root directory",
+    )
+    parser.add_argument(
+        "--musdb_XL_train_npy_root",
+        type=str,
+        default="/path/to/musdb-XL-train",
+        help="Directory of numpy arrays of musdb-XL-train's sample-wise ratio ",
+    )
+    parser.add_argument(
+        "--output",
+        type=str,
+        default="/path/to/musdb-XL-train",
+        help="Directory to save musdb-XL-train wave data",
+    )
+    args = parser.parse_args()
+    sources = ["vocals", "bass", "drums", "other"]
+    path_csv_fixed = f"{args.musdb_XL_train_npy_root}/ozone_train_fixed.csv"
+    list_path_csv_random = sorted(
+        glob.glob(f"{args.musdb_XL_train_npy_root}/ozone_train_random_*.csv")
+    )
+    # read ozone_train_fixed list
+    fixed_list = []
+    os.makedirs(f"{args.output}/ozone_train_fixed", exist_ok=True)
+    with open(path_csv_fixed, "r", encoding="utf-8") as f:
+        rdr = csv.reader(f)
+        for k, line in enumerate(rdr):
+            if k == 0:  # song_name, max_threshold, max_character
+                pass
+            else:
+                fixed_list.append(line)
+    # save wave files of ozone_train_fixed,
+    # which is the limiter-applied version of 100 songs from musdb-HQ train set
+    for fixed_song in tqdm.tqdm(fixed_list):
+        audio_sources = []
+        for source in sources:
+            audio, sr = librosa.load(
+                f"{args.root}/train/{fixed_song[0]}/{source}.wav", sr=44100, mono=False
+            )
+            audio_sources.append(audio)
+        stems = np.stack(audio_sources, axis=0)
+        mixture = stems.sum(0)
+        ratio = np.load(
+            f"{args.musdb_XL_train_npy_root}/np_ratio/ozone_train_fixed/{fixed_song[0]}.npy"
+        )
+        output = mixture * ratio
+        sf.write(
+            f"{args.output}/ozone_train_fixed/{fixed_song[0]}.wav",
+            output.T,
+            44100,
+            subtype="PCM_16",
+        )
+    # read ozone_train_random list
+    random_list = []
+    os.makedirs(f"{args.output}/ozone_train_random", exist_ok=True)
+    for path_csv_random in list_path_csv_random:
+        with open(path_csv_random, "r", encoding="utf-8") as f:
+            rdr = csv.reader(f)
+            for k, line in enumerate(rdr):
+                if k == 0:
+                    # ['song_name',
+                    #  'max_threshold',
+                    #  'max_character',
+                    #  'vocals_name',
+                    #  'vocals_start_sec',
+                    #  'vocals_gain',
+                    #  'vocals_channelswap',
+                    #  'bass_name',
+                    #  'bass_start_sec',
+                    #  'bass_gain',
+                    #  'bass_channelswap',
+                    #  'drums_name',
+                    #  'drums_start_sec',
+                    #  'drums_gain',
+                    #  'drums_channelswap',
+                    #  'other_name',
+                    #  'other_start_sec',
+                    #  'other_gain',
+                    #  'other_channelswap']
+                    pass
+                else:
+                    random_list.append(line)
+    # save wave files of ozone_train_random,
+    # which is the limiter-applied version of 4-sec 300,000 segments randomly created from musdb-HQ train subset
+    for random_song in tqdm.tqdm(random_list):
+        audio_sources = []
+        for k, source in enumerate(sources):
+            audio, sr = librosa.load(
+                f"{args.root}/train/{random_song[3 + k * 4]}/{source}.wav",
+                sr=44100,
+                mono=False,
+                offset=float(random_song[4 + k * 4]),  # 'inst_start_sec'
+                duration=4.0,
+            )
+            audio = audio * float(random_song[5 + k * 4])  # 'inst_gain'
+            if random_song[6 + k * 4].lower() == "true":  # 'inst_channelswap'
+                audio = np.flip(audio, axis=0)
+            audio_sources.append(audio)
+        stems = np.stack(audio_sources, axis=0)
+        mixture = stems.sum(0)
+        ratio = np.load(
+            f"{args.musdb_XL_train_npy_root}/np_ratio/ozone_train_random/{random_song[0]}.npy"
+        )
+        output = mixture * ratio
+        sf.write(
+            f"{args.output}/ozone_train_random/{random_song[0]}.wav",
+            output.T,
+            44100,
+            subtype="PCM_16",
+        )
+if __name__ == "__main__":
+    main()