Spaces:

masszhou
/

bgmseparatorgpu

Sleeping

App Files Files Community

masszhou committed on Apr 13, 2025

Commit

1218764

1 Parent(s): 1e9f689

Add application file

Browse files

Files changed (1) hide show

app.py +104 -146

app.py CHANGED Viewed

@@ -71,21 +71,19 @@ def convert_to_stereo_and_wav(audio_path: Path) -> Path:
         return stereo_path
     else:
         return Path(audio_path)
 class MDXModel:
-    def __init__(
-        self,
-        device,
-        dim_f,
-        dim_t,
-        n_fft,
-        hop=1024,
-        stem_name=None,
-        compensation=1.000,
-    ):
-        self.dim_f = dim_f
-        self.dim_t = dim_t
         self.dim_c = 4
         self.n_fft = n_fft
         self.hop = hop
@@ -105,6 +103,9 @@ class MDXModel:
         ).to(device)
     def stft(self, x):
         x = x.reshape([-1, self.chunk_size])
         x = torch.stft(
             x,
@@ -122,6 +123,9 @@ class MDXModel:
         return x[:, :, : self.dim_f]
     def istft(self, x, freq_pad=None):
         freq_pad = (
             self.freq_pad.repeat([x.shape[0], 1, 1, 1])
             if freq_pad is None
@@ -143,17 +147,15 @@ class MDXModel:
             center=True,
         )
         return x.reshape([-1, 2, self.chunk_size])
 class MDX:
-    DEFAULT_SR = 44100
     # Multipliers are in seconds; the resulting sizes are in samples (seconds * DEFAULT_SR)
     DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR
     DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR
-    def __init__(
-        self, model_path: str, params: MDXModel, processor=0
-    ):
         # Set the device and the provider (CPU or CUDA)
         self.device = (
             torch.device(f"cuda:{processor}")
@@ -182,7 +184,7 @@ class MDX:
         self.prog = None
     @staticmethod
-    def get_hash(model_path):
         try:
             with open(model_path, "rb") as f:
                 f.seek(-10000 * 1024, 2)
@@ -193,20 +195,21 @@ class MDX:
         return model_hash
     @staticmethod
-    def segment(
-        wave,
-        combine=True,
-        chunk_size=DEFAULT_CHUNK_SIZE,
-        margin_size=DEFAULT_MARGIN_SIZE,
-    ):
         """
         Segment or join segmented wave array
         Args:
             wave: (np.array) Wave array to be segmented or joined
             combine: (bool) If True, combines segmented wave array.
                 If False, segments wave array.
             chunk_size: (int) Size of each segment (in samples)
             margin_size: (int) Size of margin between segments (in samples)
         Returns:
             numpy array: Segmented or joined wave array
         """
@@ -251,11 +254,13 @@ class MDX:
         return processed_wave
-    def pad_wave(self, wave):
         """
         Pad the wave array to match the required chunk size
         Args:
             wave: (np.array) Wave array to be padded
         Returns:
             tuple: (padded_wave, pad, trim)
                 - padded_wave: Padded wave array
@@ -283,21 +288,21 @@ class MDX:
             waves = np.array(wave_p[:, i:i + self.model.chunk_size])
             mix_waves.append(waves)
-        mix_waves = torch.tensor(mix_waves, dtype=torch.float32).to(
-            self.device
-        )
         return mix_waves, pad, trim
-    def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int):
         """
         Process each wave segment in a multi-threaded environment
         Args:
             mix_waves: (torch.Tensor) Wave segments to be processed
             trim: (int) Number of samples trimmed during padding
             pad: (int) Number of samples padded during padding
             q: (queue.Queue) Queue to hold the processed wave segments
             _id: (int) Identifier of the processed wave segment
         Returns:
             numpy array: Processed wave segment
         """
@@ -323,12 +328,14 @@ class MDX:
         q.put({_id: processed_signal})
         return processed_signal
-    def process_wave(self, wave: np.array, mt_threads=1):
         """
         Process the wave array in a multi-threaded environment
         Args:
             wave: (np.array) Wave array to be processed
             mt_threads: (int) Number of threads to be used for processing
         Returns:
             numpy array: Processed wave array
         """
@@ -367,21 +374,17 @@ class MDX:
 @spaces.GPU()
-def run_mdx(
-    model_params,
-    output_dir,
-    model_path,
-    filename,
-    exclude_main=False,
-    exclude_inversion=False,
-    suffix=None,
-    invert_suffix=None,
-    denoise=False,
-    keep_orig=True,
-    m_threads=2,
-    device_base="cuda",
-):
     if device_base == "cuda":
         device = torch.device("cuda:0")
         processor_num = 0
@@ -392,8 +395,9 @@ def run_mdx(
         device = torch.device("cpu")
         processor_num = -1
         m_threads = 1
-    model_hash = MDX.get_hash(model_path)
     mp = model_params.get(model_hash)
     model = MDXModel(
         device,
@@ -405,51 +409,26 @@ def run_mdx(
     )
     mdx_sess = MDX(model_path, model, processor=processor_num)
-    wave, sr = librosa.load(filename, mono=False, sr=44100)
     # normalizing input wave gives better output
     peak = max(np.max(wave), abs(np.min(wave)))
     wave /= peak
     if denoise:
-        wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (
-            mdx_sess.process_wave(wave, m_threads)
-        )
         wave_processed *= 0.5
     else:
         wave_processed = mdx_sess.process_wave(wave, m_threads)
     # return to previous peak
     wave_processed *= peak
-    stem_name = model.stem_name if suffix is None else suffix
-    main_filepath = None
-    if not exclude_main:
-        main_filepath = os.path.join(
-            output_dir,
-            f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav",
-        )
-        sf.write(main_filepath, wave_processed.T, sr)
-    invert_filepath = None
-    if not exclude_inversion:
-        diff_stem_name = (
-            stem_naming.get(stem_name)
-            if invert_suffix is None
-            else invert_suffix
-        )
-        stem_name = (
-            f"{stem_name}_diff" if diff_stem_name is None else diff_stem_name
-        )
-        invert_filepath = os.path.join(
-            output_dir,
-            f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav",
-        )
-        sf.write(
-            invert_filepath,
-            (-wave_processed.T * model.compensation) + wave.T,
-            sr,
-        )
-    if not keep_orig:
-        os.remove(filename)
     del mdx_sess, wave_processed, wave
     gc.collect()
@@ -457,31 +436,30 @@ def run_mdx(
     return main_filepath, invert_filepath
-def run_mdx_beta(
-    model_params,
-    output_dir,
-    model_path,
-    filename,
-    exclude_main=False,
-    exclude_inversion=False,
-    suffix=None,
-    invert_suffix=None,
-    denoise=False,
-    keep_orig=True,
-    m_threads=2,
-    device_base="",
-):
-    m_threads = 1
-    duration = librosa.get_duration(filename=filename)
-    if duration >= 60 and duration <= 120:
-        m_threads = 8
-    elif duration > 120:
-        m_threads = 16
-    model_hash = MDX.get_hash(model_path)
-    device = torch.device("cpu")
-    processor_num = -1
     mp = model_params.get(model_hash)
     model = MDXModel(
         device,
@@ -493,56 +471,26 @@ def run_mdx_beta(
     )
     mdx_sess = MDX(model_path, model, processor=processor_num)
-    wave, sr = librosa.load(filename, mono=False, sr=44100)
     # normalizing input wave gives better output
     peak = max(np.max(wave), abs(np.min(wave)))
     wave /= peak
     if denoise:
-        wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (
-            mdx_sess.process_wave(wave, m_threads)
-        )
         wave_processed *= 0.5
     else:
         wave_processed = mdx_sess.process_wave(wave, m_threads)
     # return to previous peak
     wave_processed *= peak
-    stem_name = model.stem_name if suffix is None else suffix
-    main_filepath = None
-    if not exclude_main:
-        main_filepath = os.path.join(
-            output_dir,
-            f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav",
-        )
-        sf.write(main_filepath, wave_processed.T, sr)
-    invert_filepath = None
-    if not exclude_inversion:
-        diff_stem_name = (
-            stem_naming.get(stem_name)
-            if invert_suffix is None
-            else invert_suffix
-        )
-        stem_name = (
-            f"{stem_name}_diff" if diff_stem_name is None else diff_stem_name
-        )
-        invert_filepath = os.path.join(
-            output_dir,
-            f"{os.path.basename(os.path.splitext(filename)[0])}_{stem_name}.wav",
-        )
-        sf.write(
-            invert_filepath,
-            (-wave_processed.T * model.compensation) + wave.T,
-            sr,
-        )
-    if not keep_orig:
-        os.remove(filename)
-    del mdx_sess, wave_processed, wave
-    gc.collect()
-    torch.cuda.empty_cache()
-    return main_filepath, invert_filepath
 def extract_bgm(mdx_model_params: Dict,
@@ -592,10 +540,20 @@ def extract_vocal(mdx_model_params: Dict,
                                                        device_base=device_base,
                                                        )
         vocals_path = main_vocals_path
     return vocals_path
 def process_uvr_task(input_file_path: Path,
                      output_dir: Path,
                      models_path: Dict[str, Path],

         return stereo_path
     else:
         return Path(audio_path)
 class MDXModel:
+    def __init__(self,
+                 device: torch.device,
+                 dim_f: int,
+                 dim_t: int,
+                 n_fft: int,
+                 hop: int = 1024,
+                 stem_name: str = "Vocals",
+                 compensation: float = 1.000,):
+        self.dim_f = dim_f  # frequency bins
+        self.dim_t = dim_t
         self.dim_c = 4
         self.n_fft = n_fft
         self.hop = hop
         ).to(device)
     def stft(self, x):
+        """
+        computes the Fourier transform of short overlapping windows of the input
+        """
         x = x.reshape([-1, self.chunk_size])
         x = torch.stft(
             x,
         return x[:, :, : self.dim_f]
     def istft(self, x, freq_pad=None):
+        """
+        computes the inverse Fourier transform of short overlapping windows of the input
+        """
         freq_pad = (
             self.freq_pad.repeat([x.shape[0], 1, 1, 1])
             if freq_pad is None
             center=True,
         )
         return x.reshape([-1, 2, self.chunk_size])
 class MDX:
+    DEFAULT_SR = 44100  # unit: Hz
     # Multipliers are in seconds; the resulting sizes are in samples (seconds * DEFAULT_SR)
     DEFAULT_CHUNK_SIZE = 0 * DEFAULT_SR
     DEFAULT_MARGIN_SIZE = 1 * DEFAULT_SR
+    def __init__(self, model_path: Path, params: MDXModel, processor: int = 0):
         # Set the device and the provider (CPU or CUDA)
         self.device = (
             torch.device(f"cuda:{processor}")
         self.prog = None
     @staticmethod
+    def get_hash(model_path: Path) -> str:
         try:
             with open(model_path, "rb") as f:
                 f.seek(-10000 * 1024, 2)
         return model_hash
     @staticmethod
+    def segment(wave: np.array,
+                combine: bool = True,
+                chunk_size: int = DEFAULT_CHUNK_SIZE,
+                margin_size: int = DEFAULT_MARGIN_SIZE,
+                ) -> np.array:
         """
         Segment or join segmented wave array
         Args:
             wave: (np.array) Wave array to be segmented or joined
             combine: (bool) If True, combines segmented wave array.
                 If False, segments wave array.
             chunk_size: (int) Size of each segment (in samples)
             margin_size: (int) Size of margin between segments (in samples)
         Returns:
             numpy array: Segmented or joined wave array
         """
         return processed_wave
+    def pad_wave(self, wave: np.array) -> Tuple[np.array, int, int]:
         """
         Pad the wave array to match the required chunk size
         Args:
             wave: (np.array) Wave array to be padded
         Returns:
             tuple: (padded_wave, pad, trim)
                 - padded_wave: Padded wave array
             waves = np.array(wave_p[:, i:i + self.model.chunk_size])
             mix_waves.append(waves)
+        mix_waves = torch.tensor(np.array(mix_waves), dtype=torch.float32).to(self.device)
         return mix_waves, pad, trim
+    def _process_wave(self, mix_waves, trim, pad, q: queue.Queue, _id: int) -> np.array:
         """
         Process each wave segment in a multi-threaded environment
         Args:
             mix_waves: (torch.Tensor) Wave segments to be processed
             trim: (int) Number of samples trimmed during padding
             pad: (int) Number of samples padded during padding
             q: (queue.Queue) Queue to hold the processed wave segments
             _id: (int) Identifier of the processed wave segment
         Returns:
             numpy array: Processed wave segment
         """
         q.put({_id: processed_signal})
         return processed_signal
+    def process_wave(self, wave: np.array, mt_threads=1) -> np.array:
         """
         Process the wave array in a multi-threaded environment
         Args:
             wave: (np.array) Wave array to be processed
             mt_threads: (int) Number of threads to be used for processing
         Returns:
             numpy array: Processed wave array
         """
 @spaces.GPU()
+def run_mdx(model_params: Dict,
+            input_filename: Path,
+            output_dir: Path,
+            model_path: Path,
+            denoise: bool = False,
+            m_threads: int = 2,
+            device_base: str = "cuda",
+            ) -> Tuple[str, str]:
+    """
+    Separate vocals using MDX model
+    """
     if device_base == "cuda":
         device = torch.device("cuda:0")
         processor_num = 0
         device = torch.device("cpu")
         processor_num = -1
         m_threads = 1
+    print(f"device: {device}")
+    model_hash = MDX.get_hash(model_path)  # type: str
     mp = model_params.get(model_hash)
     model = MDXModel(
         device,
     )
     mdx_sess = MDX(model_path, model, processor=processor_num)
+    wave, sr = librosa.load(input_filename, mono=False, sr=44100)
     # normalizing input wave gives better output
     peak = max(np.max(wave), abs(np.min(wave)))
     wave /= peak
     if denoise:
+        wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (mdx_sess.process_wave(wave, m_threads))  # type: np.array
         wave_processed *= 0.5
     else:
         wave_processed = mdx_sess.process_wave(wave, m_threads)
     # return to previous peak
     wave_processed *= peak
+    stem_name = model.stem_name
+    # output main track
+    main_filepath = output_dir / input_filename.with_name(f"{input_filename.stem}_{stem_name}.wav")
+    sf.write(main_filepath, wave_processed.T, sr)
+    # output reverse track
+    invert_filepath = output_dir / input_filename.with_name(f"{input_filename.stem}_{stem_name}_reverse.wav")
+    sf.write(invert_filepath, (-wave_processed.T * model.compensation) + wave.T, sr)
     del mdx_sess, wave_processed, wave
     gc.collect()
     return main_filepath, invert_filepath
+@spaces.GPU()
+def run_mdx_return_np(model_params: Dict,
+                      input_filename: Path,
+                      model_path: Path,
+                      denoise: bool = False,
+                      m_threads: int = 2,
+                      device_base: str = "cuda",
+                      ) -> Tuple[np.ndarray, np.ndarray]:
+    """
+    Separate vocals using MDX model
+    """
+    if device_base == "cuda":
+        device = torch.device("cuda:0")
+        processor_num = 0
+        device_properties = torch.cuda.get_device_properties(device)
+        vram_gb = device_properties.total_memory / 1024**3
+        m_threads = 1 if vram_gb < 8 else (8 if vram_gb > 32 else 2)
+    else:
+        device = torch.device("cpu")
+        processor_num = -1
+        m_threads = 1
+    print(f"device: {device}")
+    model_hash = MDX.get_hash(model_path)  # type: str
     mp = model_params.get(model_hash)
     model = MDXModel(
         device,
     )
     mdx_sess = MDX(model_path, model, processor=processor_num)
+    wave, sr = librosa.load(input_filename, mono=False, sr=44100)
     # normalizing input wave gives better output
     peak = max(np.max(wave), abs(np.min(wave)))
     wave /= peak
     if denoise:
+        wave_processed = -(mdx_sess.process_wave(-wave, m_threads)) + (mdx_sess.process_wave(wave, m_threads))  # type: np.array
         wave_processed *= 0.5
     else:
         wave_processed = mdx_sess.process_wave(wave, m_threads)
     # return to previous peak
     wave_processed *= peak
+    stem_name = model.stem_name
+    # output main track
+    main_track = wave_processed.T
+    # output reverse track
+    invert_track = (-wave_processed.T * model.compensation) + wave.T
+    return main_track, invert_track
 def extract_bgm(mdx_model_params: Dict,
                                                        device_base=device_base,
                                                        )
         vocals_path = main_vocals_path
+    # If "dereverb_flag" is enabled, use Reverb_HQ_By_FoxJoy.onnx for dereverberation
+    # deactivated because the model's license is unknown
+    # if dereverb_flag:
+    #     time.sleep(2)
+    #     _, vocals_dereverb_path = run_mdx(mdx_model_params,
+    #                                       output_dir,
+    #                                       mdxnet_models_dir/"Reverb_HQ_By_FoxJoy.onnx",
+    #                                       vocals_path,
+    #                                       denoise=True,
+    #                                       device_base=device_base,
+    #                                       )
+    #     vocals_path = vocals_dereverb_path
     return vocals_path
 def process_uvr_task(input_file_path: Path,
                      output_dir: Path,
                      models_path: Dict[str, Path],