Spaces:

anvitax
/

wham

Sleeping

Anvita Pandit committed on Mar 6

Commit

2440fbf

1 Parent(s): 80334fc

Fall back to librosa onset detection when madmom is unavailable

madmom can't build on HF Spaces: its build requires Cython to be present
at build time, which only works with pip's --no-build-isolation. onset_mask
now tries madmom first and falls back to librosa's onset_detect, which is
already installed. Removed madmom from requirements.txt since it can't be
pip-installed under build isolation.

Made-with: Cursor

Files changed (2) hide show

requirements.txt +0 -1
vampnet/vampnet/mask.py +33 -12

requirements.txt CHANGED Viewed

@@ -14,4 +14,3 @@ wavebeat @ git+https://github.com/hugofloresgarcia/wavebeat
 lac @ git+https://github.com/hugofloresgarcia/lac.git
 descript-audiotools @ git+https://github.com/hugofloresgarcia/audiotools.git
 pyharp
-madmom

 lac @ git+https://github.com/hugofloresgarcia/lac.git
 descript-audiotools @ git+https://github.com/hugofloresgarcia/audiotools.git
 pyharp

vampnet/vampnet/mask.py CHANGED Viewed

@@ -189,6 +189,26 @@ def time_stretch_mask(
     mask = periodic_mask(x, stretch_factor, width=1)
     return mask
 def onset_mask(
     sig: AudioSignal,
     z: torch.Tensor,
@@ -196,24 +216,26 @@ def onset_mask(
     width: int = 1
 ):
     import librosa
-    import madmom
-    from madmom.features.onsets import RNNOnsetProcessor, OnsetPeakPickingProcessor
     import tempfile
     import numpy as np
     with tempfile.NamedTemporaryFile(suffix='.wav') as f:
         sig = sig.clone()
         sig.write(f.name)
-        proc = RNNOnsetProcessor(online=False)
-        onsetproc = OnsetPeakPickingProcessor(threshold=0.3,
-                                              fps=sig.sample_rate/interface.codec.hop_length)
-        act = proc(f.name)
-        onset_times = onsetproc(act)
-        # convert to indices for z array
-        onset_indices = librosa.time_to_frames(onset_times, sr=sig.sample_rate, hop_length=interface.codec.hop_length)
         if onset_indices.shape[0] == 0:
             mask = empty_mask(z)
@@ -223,7 +245,6 @@ def onset_mask(
             print("onset indices: ", onset_indices)
             print("onset times: ", onset_times)
-            # create a mask, set onset
             mask = torch.ones_like(z)
             n_timesteps = z.shape[-1]

     mask = periodic_mask(x, stretch_factor, width=1)
     return mask
+def _onset_times_madmom(wav_path, sample_rate, hop_length):
+    from madmom.features.onsets import RNNOnsetProcessor, OnsetPeakPickingProcessor
+    proc = RNNOnsetProcessor(online=False)
+    onsetproc = OnsetPeakPickingProcessor(
+        threshold=0.3, fps=sample_rate / hop_length
+    )
+    act = proc(wav_path)
+    return onsetproc(act)
+def _onset_times_librosa(wav_path, sample_rate, hop_length):
+    import librosa
+    y, sr = librosa.load(wav_path, sr=sample_rate)
+    onset_env = librosa.onset.onset_strength(y=y, sr=sr, hop_length=hop_length)
+    onset_frames = librosa.onset.onset_detect(
+        onset_envelope=onset_env, sr=sr, hop_length=hop_length, backtrack=False
+    )
+    return librosa.frames_to_time(onset_frames, sr=sr, hop_length=hop_length)
 def onset_mask(
     sig: AudioSignal,
     z: torch.Tensor,
     width: int = 1
 ):
     import librosa
     import tempfile
     import numpy as np
+    try:
+        import madmom  # noqa: F401
+        _get_onset_times = _onset_times_madmom
+    except ImportError:
+        print("madmom not installed, falling back to librosa for onset detection")
+        _get_onset_times = _onset_times_librosa
+    hop_length = interface.codec.hop_length
     with tempfile.NamedTemporaryFile(suffix='.wav') as f:
         sig = sig.clone()
         sig.write(f.name)
+        onset_times = _get_onset_times(f.name, sig.sample_rate, hop_length)
+        onset_indices = librosa.time_to_frames(
+            onset_times, sr=sig.sample_rate, hop_length=hop_length
+        )
         if onset_indices.shape[0] == 0:
             mask = empty_mask(z)
             print("onset indices: ", onset_indices)
             print("onset times: ", onset_times)
             mask = torch.ones_like(z)
             n_timesteps = z.shape[-1]