inoryQwQ committed
Commit ab5bd26 · verified · 1 Parent(s): 3cd0042

Upload folder using huggingface_hub

Files changed (8)
  1. .gitignore +2 -0
  2. MelBandRoformer.py +293 -0
  3. config.json +0 -0
  4. gradio_app.py +87 -0
  5. main.py +119 -0
  6. mel_band_roformer.axmodel +3 -0
  7. requirements.txt +9 -0
  8. screenshot.png +0 -0
.gitignore ADDED
@@ -0,0 +1,2 @@
+__pycache__
+*.wav
MelBandRoformer.py ADDED
@@ -0,0 +1,293 @@
+import axengine as axe
+import numpy as np
+import soundfile as sf
+import librosa
+import torch
+import tqdm
+from librosa import filters
+from einops import rearrange, reduce, repeat
+from typing import Union
+
+
+class MelBandRoformer:
+    def __init__(
+        self,
+        model_path,
+        *,
+        stft_n_fft=2048,
+        stft_win_length=2048,
+        stft_hop_length=441,
+        stft_normalized=False,
+        sample_rate=44100,
+        num_bands=60,
+        stereo=True
+    ):
+        self.stft_kwargs = dict(
+            n_fft=stft_n_fft,
+            hop_length=stft_hop_length,
+            win_length=stft_win_length,
+            normalized=stft_normalized,
+        )
+        self.sample_rate = sample_rate
+        self.num_bands = num_bands
+        self.stereo = stereo
+        self.num_channels = 2 if stereo else 1
+
+        self.freq_indices, _, _, self.num_bands_per_freq = self.calc_freq_indices()
+
+        self.model = axe.InferenceSession(
+            model_path,
+            providers=["AxEngineExecutionProvider", "AXCLRTExecutionProvider"],
+        )
+
+    def calc_freq_indices(self):
+        # run a dummy STFT just to read off the number of frequency bins
+        freqs = torch.stft(
+            torch.randn(1, 4096),
+            **self.stft_kwargs,
+            window=torch.ones(self.stft_kwargs["n_fft"]),
+            return_complex=True
+        ).shape[1]
+
+        # create mel filter bank
+        # with librosa.filters.mel as in section 2 of paper
+        mel_filter_bank_numpy = filters.mel(
+            sr=self.sample_rate, n_fft=self.stft_kwargs["n_fft"], n_mels=self.num_bands
+        )
+
+        mel_filter_bank = torch.from_numpy(mel_filter_bank_numpy)
+
+        # for some reason, it doesn't include the first freq? just force a value for now
+        mel_filter_bank[0][0] = 1.0
+
+        # In some systems/envs we get 0.0 instead of ~1.9e-18 in the last position,
+        # so let's force a positive value
+        mel_filter_bank[-1, -1] = 1.0
+
+        # binary as in paper (then estimated masks are averaged for overlapping regions)
+        freqs_per_band = mel_filter_bank > 0
+        assert freqs_per_band.any(
+            dim=0
+        ).all(), "every frequency needs to be covered by at least one band"
+
+        repeated_freq_indices = repeat(
+            torch.arange(freqs), "f -> b f", b=self.num_bands
+        )
+        freq_indices = repeated_freq_indices[freqs_per_band]
+
+        if self.stereo:
+            # interleave left/right bins so the bands index the stereo-merged spectrum
+            freq_indices = repeat(freq_indices, "f -> f s", s=2)
+            freq_indices = freq_indices * 2 + torch.arange(2)
+            freq_indices = rearrange(freq_indices, "f s -> (f s)")
+
+        num_freqs_per_band = reduce(freqs_per_band, "b f -> b", "sum")
+        num_bands_per_freq = reduce(freqs_per_band, "b f -> f", "sum")
+
+        return freq_indices, freqs_per_band, num_freqs_per_band, num_bands_per_freq
+
+    def infer(
+        self, audio: Union[str, np.ndarray], chunk_size=88200, overlap=0.25, num_stems=4
+    ):
+        if isinstance(audio, str):
+            wav, _ = librosa.load(audio, sr=self.sample_rate, mono=not self.stereo)
+        else:
+            wav = audio
+
+        if self.stereo and wav.shape[0] != 2:
+            wav = wav.transpose()
+
+        # standardize by the mono reference statistics; undone after separation
+        ref = wav.mean(0)
+        ref_mean = ref.mean()
+        ref_std = ref.std()
+        preprocessed_wav = (wav - ref_mean) / (ref_std + 1e-8)
+
+        out = self.apply_model(
+            self.model,
+            preprocessed_wav[None],
+            self.freq_indices,
+            self.num_bands_per_freq,
+            segment=chunk_size,
+            overlap=overlap,
+            len_model_sources=num_stems,
+        )
+
+        out *= ref_std + 1e-8
+        out += ref_mean
+
+        return out
+
+    def preprocess(self, mix):
+        device = torch.device("cpu")
+
+        if isinstance(mix, np.ndarray):
+            mix = torch.from_numpy(mix)
+        b, c, l = mix.shape
+        mix = mix.view(-1, l)
+
+        stft_window = torch.hann_window(self.stft_kwargs["win_length"], device=device)
+
+        stft_repr = torch.stft(
+            mix, **self.stft_kwargs, window=stft_window, return_complex=True
+        )
+        stft_repr = torch.view_as_real(stft_repr)
+
+        # merge stereo / mono into the frequency, with frequency leading dimension,
+        # for band splitting; equivalent to:
+        # stft_repr = rearrange(stft_repr, 'b s f t c -> b (f s) t c')
+        s, f, t, c = stft_repr.shape
+        stft_repr = (
+            stft_repr.unsqueeze(0)
+            .reshape(b, s, f, t, c)
+            .transpose(2, 1)
+            .reshape(b, -1, t, c)
+        )
+
+        return stft_repr.numpy()
+
+    def postprocess(
+        self,
+        masks,
+        stft_repr,
+        freq_indices,
+        num_bands_per_freq,
+        audio_len,
+        num_stems=4,
+        channels=2,
+    ):
+        masks = torch.from_numpy(masks)
+        stft_repr = torch.from_numpy(stft_repr)
+        batch = 1
+        istft_length = audio_len
+
+        device = torch.device("cpu")
+        stft_window = torch.hann_window(self.stft_kwargs["win_length"], device=device)
+
+        # modulate frequency representation
+        stft_repr = rearrange(stft_repr, "b f t c -> b 1 f t c")
+
+        # complex number multiplication
+        stft_repr = torch.view_as_complex(stft_repr)
+        masks = torch.view_as_complex(masks)
+
+        masks = masks.type(stft_repr.dtype)
+
+        # need to average the estimated mask for the overlapped frequencies
+        scatter_indices = repeat(
+            freq_indices, "f -> b n f t", b=batch, n=num_stems, t=stft_repr.shape[-1]
+        )
+
+        stft_repr_expanded_stems = repeat(stft_repr, "b 1 ... -> b n ...", n=num_stems)
+        masks_summed = torch.zeros_like(stft_repr_expanded_stems).scatter_add_(
+            2, scatter_indices, masks
+        )
+
+        denom = repeat(num_bands_per_freq, "f -> (f r) 1", r=channels)
+
+        masks_averaged = masks_summed / denom.clamp(min=1e-8)
+
+        # modulate stft repr with estimated mask
+        stft_repr = stft_repr * masks_averaged
+
+        # istft
+        stft_repr = rearrange(stft_repr, "b n (f s) t -> (b n s) f t", s=2)
+
+        recon_audio = torch.istft(
+            stft_repr,
+            **self.stft_kwargs,
+            window=stft_window,
+            return_complex=False,
+            length=istft_length
+        )
+
+        recon_audio = rearrange(
+            recon_audio, "(b n s) t -> b n s t", b=batch, s=2, n=num_stems
+        )
+
+        if num_stems == 1:
+            recon_audio = rearrange(recon_audio, "b 1 s t -> b s t")
+
+        return recon_audio.numpy()
+
+    def apply_model(
+        self,
+        model,
+        mix,
+        freq_indices,
+        num_bands_per_freq,
+        segment,
+        overlap: float = 0.25,
+        len_model_sources=4,
+    ):
+        model_weights = [1.0] * len_model_sources
+        totals = [0.0] * len_model_sources
+        batch, channels, length = mix.shape
+
+        stride = int((1 - overlap) * segment)
+        futures = []
+
+        for offset in tqdm.tqdm(range(0, length, stride)):
+            # zero-pad the final chunk up to the fixed segment length the model expects
+            chunk = mix[..., offset : offset + segment]
+            audio_len = chunk.shape[-1]
+            if chunk.shape[-1] < segment:
+                chunk = np.concatenate(
+                    [
+                        chunk,
+                        np.zeros(
+                            (batch, channels, segment - chunk.shape[-1]),
+                            dtype=np.float32,
+                        ),
+                    ],
+                    axis=-1,
+                )
+
+            stft_input = self.preprocess(chunk)
+            masks = model.run(None, {"stft_input": stft_input})[0]
+            future = self.postprocess(
+                masks,
+                stft_input,
+                freq_indices,
+                num_bands_per_freq,
+                audio_len,
+                num_stems=len_model_sources,
+            )
+            future = future[..., :audio_len]
+
+            futures.append((future, offset))
+
+        out = np.zeros((batch, len_model_sources, channels, length))
+        sum_weight = np.zeros((length,))
+        # triangular window used to cross-fade overlapping chunks (overlap-add)
+        weight = np.concatenate(
+            [
+                np.arange(1, segment // 2 + 1),
+                np.arange(segment - segment // 2, 0, -1),
+            ],
+            axis=-1,
+        )
+        weight = weight / weight.max()
+        for future, offset in futures:
+            chunk_out = future
+            chunk_length = chunk_out.shape[-1]
+            out[..., offset : offset + segment] += weight[:chunk_length] * chunk_out
+            sum_weight[offset : offset + segment] += weight[:chunk_length]
+        out /= sum_weight
+
+        for k, inst_weight in enumerate(model_weights):
+            out[:, k, :, :] *= inst_weight
+            totals[k] += inst_weight
+        for k in range(out.shape[1]):
+            out[:, k, :, :] /= totals[k]
+        return out[0]
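
For reference, a minimal usage sketch of the class above (the model path matches the mel_band_roformer.axmodel shipped in this commit; song.wav stands in for any 44.1 kHz input, and main.py below wraps the same calls in a CLI):

    import numpy as np
    import soundfile as sf
    from MelBandRoformer import MelBandRoformer

    model = MelBandRoformer("./mel_band_roformer.axmodel")

    # infer() accepts a path (or a float32 array) and returns an array of shape
    # (num_stems, channels, samples), ordered drums, bass, other, vocals
    stems = model.infer("song.wav", chunk_size=88200, overlap=0.25, num_stems=4)

    for name, source in zip(["drums", "bass", "other", "vocals"], stems):
        # peak-normalize with 1% headroom, then write as (samples, channels)
        source = source / max(1.01 * np.abs(source).max(), 1)
        sf.write(f"{name}.wav", source.T, samplerate=model.sample_rate)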
config.json ADDED
File without changes
gradio_app.py ADDED
@@ -0,0 +1,87 @@
+import gradio as gr
+import numpy as np
+import soundfile as sf
+import os
+import shutil
+from MelBandRoformer import MelBandRoformer
+
+model = MelBandRoformer("./mel_band_roformer.axmodel")
+print("Model loaded")
+
+
+def cleanup_temp_files(files):
+    for file in files:
+        if os.path.exists(file):
+            if os.path.isdir(file):
+                shutil.rmtree(file)  # safer than shelling out to `rm -rf`
+            else:
+                os.remove(file)
+
+
+def process_audio(input_file, pr=gr.Progress(track_tqdm=True)):
+    global model
+
+    output_path = "output"
+    cleanup_temp_files([output_path])
+
+    print("Running model")
+    out = model.infer(input_file)
+
+    audio_name = os.path.splitext(os.path.basename(input_file))[0]
+    os.makedirs(os.path.join(output_path, audio_name), exist_ok=True)
+
+    stem_names = ["drums", "bass", "other", "vocals"]
+    output_files = []
+    print("Saving audio...")
+    for i in range(out.shape[0]):
+        source = out[i]
+        # peak-normalize only when the signal would clip (1% headroom)
+        source = source / max(1.01 * np.abs(source).max(), 1)
+
+        # soundfile expects (samples, channels)
+        if source.shape[1] != 2:
+            source = source.transpose()
+
+        audio_path = os.path.join(
+            output_path,
+            audio_name,
+            f"{stem_names[i]}.wav",
+        )
+        print(f"Save {stem_names[i]} to {audio_path}")
+
+        sf.write(audio_path, source, samplerate=model.sample_rate)
+        output_files.append(audio_path)
+
+    return [
+        gr.Audio(output_files[0], type="filepath", sources=None, editable=False),
+        gr.Audio(output_files[1], type="filepath", sources=None, editable=False),
+        gr.Audio(output_files[2], type="filepath", sources=None, editable=False),
+        gr.Audio(output_files[3], type="filepath", sources=None, editable=False),
+    ]
+
+
+with gr.Blocks() as demo:
+    gr.Markdown("## Stem Separation")
+    gr.Markdown(
+        "Upload a WAV file and the model will split it into four stems: "
+        "drums, bass, other, and vocals."
+    )
+
+    audio_input = gr.Audio(type="filepath", label="Upload WAV file", editable=False)
+
+    with gr.Tab("Drums"):
+        drums_audio = gr.Audio(type="filepath", label="drums")
+
+    with gr.Tab("Bass"):
+        bass_audio = gr.Audio(type="filepath", label="bass")
+
+    with gr.Tab("Other"):
+        other_audio = gr.Audio(type="filepath", label="other")
+
+    with gr.Tab("Vocals"):
+        vocals_audio = gr.Audio(type="filepath", label="vocals")
+
+    submit_btn = gr.Button("Process audio")
+
+    submit_btn.click(
+        fn=process_audio,
+        inputs=[audio_input],
+        outputs=[drums_audio, bass_audio, other_audio, vocals_audio],
+    )
+
+if __name__ == "__main__":
+    demo.launch(server_name="0.0.0.0")
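
As written, demo.launch(server_name="0.0.0.0") binds the demo to all network interfaces, so running python gradio_app.py serves the UI on Gradio's default port (7860 unless overridden) and makes it reachable from other machines on the network.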
main.py ADDED
@@ -0,0 +1,119 @@
+import numpy as np
+import argparse
+import os
+import soundfile as sf
+import glob
+from MelBandRoformer import MelBandRoformer
+
+
+def get_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--input_audio", "-i", type=str, required=True, help="Input audio file (.wav)"
+    )
+    parser.add_argument(
+        "--output_path",
+        "-o",
+        type=str,
+        required=False,
+        default="./output",
+        help="Separated wav output path",
+    )
+    parser.add_argument(
+        "--model_path",
+        "-m",
+        type=str,
+        required=False,
+        default="./mel_band_roformer.axmodel",
+    )
+    parser.add_argument("--overlap", type=float, required=False, default=0.25)
+    parser.add_argument(
+        "--segment",
+        type=int,  # a sample count used for array slicing, so it must be an integer
+        required=False,
+        default=88200,
+        help="number of samples per model chunk",
+    )
+    parser.add_argument(
+        "--num_stems", type=int, default=4, help="number of stems the model outputs"
+    )
+    parser.add_argument("--sample_rate", type=int, default=44100)
+    parser.add_argument("--n_fft", type=int, default=2048)
+    parser.add_argument("--hop_len", type=int, default=441)
+    return parser.parse_args()
+
+
+def main():
+    args = get_args()
+    assert os.path.exists(args.input_audio), f"Input audio {args.input_audio} does not exist"
+    assert os.path.exists(args.model_path), f"Model {args.model_path} does not exist"
+    os.makedirs(args.output_path, exist_ok=True)
+
+    input_audio = args.input_audio
+    output_path = args.output_path
+    model_path = args.model_path
+    segment = args.segment
+    num_stems = args.num_stems
+    target_sr = args.sample_rate
+
+    print(f"Input audio: {input_audio}")
+    print(f"Output path: {output_path}")
+    print(f"Model: {model_path}")
+    print(f"Overlap: {args.overlap}")
+
+    if os.path.isdir(input_audio):
+        types = ("*.wav", "*.mp3", "*.flac")  # the tuple of supported file types
+        input_audios = []
+        for files in types:
+            input_audios.extend(glob.glob(f"{input_audio}/**/{files}", recursive=True))
+    else:
+        input_audios = [input_audio]
+
+    mel_band = MelBandRoformer(
+        model_path,
+        stft_n_fft=args.n_fft,
+        stft_win_length=args.n_fft,
+        stft_hop_length=args.hop_len,
+        sample_rate=target_sr,
+    )
+
+    for input_audio in input_audios:
+        out = mel_band.infer(
+            input_audio,
+            chunk_size=segment,
+            overlap=args.overlap,
+            num_stems=num_stems,
+        )
+
+        audio_name = os.path.splitext(os.path.basename(input_audio))[0]
+        os.makedirs(os.path.join(output_path, audio_name), exist_ok=True)
+
+        stem_names = ["drums", "bass", "other", "vocals"]
+        print("Saving audio...")
+        for i in range(out.shape[0]):
+            source = out[i]
+            # peak-normalize only when the signal would clip (1% headroom)
+            source = source / max(1.01 * np.abs(source).max(), 1)
+
+            # soundfile expects (samples, channels)
+            if source.shape[1] != 2:
+                source = source.transpose()
+
+            if num_stems == 4:
+                audio_path = os.path.join(
+                    output_path,
+                    audio_name,
+                    f"{stem_names[i]}.wav",
+                )
+                print(f"Save {stem_names[i]} to {audio_path}")
+            else:
+                audio_path = os.path.join(
+                    output_path,
+                    audio_name,
+                    f"stem_{i}.wav",
+                )
+                print(f"Save stem {i} to {audio_path}")
+
+            sf.write(audio_path, source, samplerate=target_sr)
+
+
+if __name__ == "__main__":
+    main()
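
Typical usage, per the arguments defined in get_args() above: python main.py -i song.wav separates a single file into ./output/song/, while passing a directory to -i batch-processes every *.wav, *.mp3, and *.flac found under it recursively (song.wav is a placeholder input name).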
mel_band_roformer.axmodel ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:24a10bcac63b6a90d00de19063a20660a599b961bb56ede0089fe4bfacd464b3
+size 95657444
requirements.txt ADDED
@@ -0,0 +1,9 @@
+numpy<2.0
+soundfile==0.13.1
+librosa==0.9.1
+tqdm
+onnxruntime
+einops
+torch
+axengine @ git+https://github.com/AXERA-TECH/pyaxengine.git@0.1.3.rc1
+gradio
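
The stack installs with pip install -r requirements.txt; axengine is pulled straight from the pinned pyaxengine tag rather than a package index, and numpy is capped below 2.0, presumably to stay compatible with the older librosa pin.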
screenshot.png ADDED