lllindsey0615 committed
Commit 1035bfa · 1 Parent(s): f750bef

Supports mono audio input

Files changed (1): app.py (+15 -6)
app.py CHANGED
@@ -7,10 +7,9 @@ from pyharp import ModelCard, LabelList, build_endpoint, save_audio
 from audiotools import AudioSignal
 
 
-def separate_instrumental(audio_file_path: str, model_name: str = 'mdx_extra_q'):
-    """
-    Separates an audio file into a instrumental stem using a Demucs model.
-    """
+DEMUX_MODELS = ["mdx_extra_q", "mdx_extra", "htdemucs", "mdx_q"]
+
+def separate_instrumental(audio_file_path: str, model_name: str):
     # Load Demucs model
     model = pretrained.get_model(model_name)
     model.to('cuda' if torch.cuda.is_available() else 'cpu')
@@ -19,6 +18,13 @@ def separate_instrumental(audio_file_path: str, model_name: str = 'mdx_extra_q'
     # Load audio file (waveform shape: (channels, samples))
     waveform, sr = torchaudio.load(audio_file_path)
 
+    # Check if the input is mono
+    is_mono = waveform.shape[0] == 1
+
+    # If mono, duplicate to stereo
+    if is_mono:
+        waveform = waveform.repeat(2, 1)
+
     # Run Demucs — returns a list (batch, stems, channels, samples)
     with torch.no_grad():
         stems_batch = apply_model(
@@ -28,12 +34,15 @@ def separate_instrumental(audio_file_path: str, model_name: str = 'mdx_extra_q'
             shifts=1,
             split=True
         )
-
+
     stems = stems_batch[0]  # Extract stems from batch
 
-    # Extract the vocal stem (stems[0] is vocals in most models)
+    # Extract the instrumental stem (stems[0] is vocals in most models)
     instrumental = stems[0]
 
+    if is_mono:
+        instrumental = instrumental.mean(dim=0, keepdim=True)  # Stereo → Mono
+
     # Convert to an AudioSignal object
     instrumental_signal = AudioSignal(instrumental.cpu().numpy(), sample_rate=sr)
     return instrumental_signal
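
For readers following along, here is a minimal standalone sketch of the mono-handling round trip this commit introduces: a single-channel waveform is duplicated to two channels so the stereo Demucs model can process it, then the separated stem is averaged back down to one channel. The tensor sizes and the helper name mono_round_trip_demo are illustrative assumptions, not part of the commit.

import torch

def mono_round_trip_demo():
    # Stand-in for torchaudio.load(): a fake 1-channel (mono) waveform,
    # shape (channels, samples); one second at 44.1 kHz of random samples.
    waveform = torch.randn(1, 44100)

    is_mono = waveform.shape[0] == 1  # True for this input

    # Duplicate the single channel so Demucs sees a stereo input.
    if is_mono:
        waveform = waveform.repeat(2, 1)  # (1, N) -> (2, N)
    assert waveform.shape == (2, 44100)

    # ... apply_model() would run here; the separated stem keeps (2, N) ...
    stem = waveform

    # Fold the stereo stem back to mono so the output matches the input layout.
    if is_mono:
        stem = stem.mean(dim=0, keepdim=True)  # (2, N) -> (1, N)
    assert stem.shape == (1, 44100)
    return stem

mono_round_trip_demo()

With the patch applied, a call such as separate_instrumental("input.wav", "mdx_extra_q") (filename hypothetical) should therefore return a one-channel AudioSignal for a one-channel input file, matching the input's channel layout.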