Spaces:

krislette
/

bach-or-bot

Sleeping

App Files Community

krislette committed on Oct 12, 2025

Commit

e26dafd

1 Parent(s): e1ee8d1

Auto-deploy from GitHub: af943986a2919fba83018f48c4261db4a72f4cee

Browse files

Files changed (3) hide show

src/musiclime/factorization.py +6 -0
src/preprocessing/audio_preprocessor.py +3 -3
src/spectttra/spectttra_trainer.py +18 -8

src/musiclime/factorization.py CHANGED Viewed

@@ -128,16 +128,22 @@ class OpenUnmixFactorization:
         # Specify targets
         targets = ["vocals", "bass", "drums", "other"]
         # Then load openunmix files to openunmix' method
         prediction = predict.separate(
             torch.as_tensor(waveform).float(),
             rate=44100,
             model_str_or_path=model_path,
             targets=targets,
         )
         components = [prediction[key][0].mean(dim=0).numpy() for key in prediction]
         names = list(prediction.keys())
         return components, names
     def _prepare_temporal_components(self):

         # Specify targets
         targets = ["vocals", "bass", "drums", "other"]
+        # Specify device based on availability
+        device = "cuda" if torch.cuda.is_available() else "cpu"
+        print(f"[MusicLIME] Using device for source separation: {device}")
         # Then load openunmix files to openunmix' method
         prediction = predict.separate(
             torch.as_tensor(waveform).float(),
             rate=44100,
             model_str_or_path=model_path,
             targets=targets,
+            device=device,
         )
         components = [prediction[key][0].mean(dim=0).numpy() for key in prediction]
         names = list(prediction.keys())
         return components, names
     def _prepare_temporal_components(self):

src/preprocessing/audio_preprocessor.py CHANGED Viewed

@@ -259,14 +259,14 @@ class AudioPreprocessor:
         """
         waveform, sample_rate = self.load_audio(file)
-        # Resample the audio to 16kHz
-        waveform = self.resample_audio(original_sr=sample_rate, waveform=waveform)
         # Convert the audio into mono
         if waveform.shape[0] > 1:
             # print("Current audio is stereo. Converting to mono.")
             waveform = waveform.mean(dim=0, keepdim=True)
         # If there is a skip value provided, trim it
         if skip_time is not None and skip_time > 0:
             # print(f"Skipping first {skip_time:.2f} seconds.")

         """
         waveform, sample_rate = self.load_audio(file)
         # Convert the audio into mono
         if waveform.shape[0] > 1:
             # print("Current audio is stereo. Converting to mono.")
             waveform = waveform.mean(dim=0, keepdim=True)
+        # Resample the audio to 16kHz
+        waveform = self.resample_audio(original_sr=sample_rate, waveform=waveform)
         # If there is a skip value provided, trim it
         if skip_time is not None and skip_time > 0:
             # print(f"Skipping first {skip_time:.2f} seconds.")

src/spectttra/spectttra_trainer.py CHANGED Viewed

@@ -4,7 +4,11 @@ import numpy as np
 from types import SimpleNamespace
 from src.spectttra.feature import FeatureExtractor
-from src.spectttra.spectttra import SpecTTTra, build_spectttra_from_cfg, load_frozen_spectttra
 # Shared variables for the model and setup, loaded only once and reused (cache)
 _PREDICTOR_LOCK = threading.Lock()
@@ -19,7 +23,9 @@ def build_spectttra(cfg, device):
     Wrapper that builds SpecTTTra + FeatureExtractor and loads frozen checkpoint.
     """
     feat_ext, model = build_spectttra_from_cfg(cfg, device)
-    model = load_frozen_spectttra(model, "models/spectttra/spectttra_frozen.pth", device)
     return feat_ext, model
@@ -107,7 +113,7 @@ def spectttra_predict(audio_tensor):
     cfg = _CFG
     # Move waveform to device but keep float for mel extraction
-    waveform = audio_tensor.to(device).float()
     with torch.no_grad():
         # Extract mel-spectrogram
@@ -162,17 +168,21 @@ def spectttra_train(audio_tensors):
     # Refactors the loop to be a much faster single-batch operation
     try:
-        waveforms_batch = torch.cat(audio_tensors, dim=0).to(device).float()
     except Exception as e:
-        print(f"[INFO] Error during tensor concatenation, falling back to loop. Fix preprocessing for speed. Error: {e}")
         batch_list = [spectttra_predict(w) for w in audio_tensors]
         return np.array(batch_list)
     with torch.no_grad():
         melspec = feat_ext(waveforms_batch)
-        # Ensure melspec shape matches model's expectation
-        expected_frames = model.input_temp_dim # expected_frames is 3744
         if melspec.shape[2] > expected_frames:
             melspec = melspec[:, :, :expected_frames]
         elif melspec.shape[2] < expected_frames:
@@ -187,4 +197,4 @@ def spectttra_train(audio_tensors):
             tokens = model(melspec)
             pooled = tokens.mean(dim=1)
-    return pooled.cpu().numpy()

 from types import SimpleNamespace
 from src.spectttra.feature import FeatureExtractor
+from src.spectttra.spectttra import (
+    SpecTTTra,
+    build_spectttra_from_cfg,
+    load_frozen_spectttra,
+)
 # Shared variables for the model and setup, loaded only once and reused (cache)
 _PREDICTOR_LOCK = threading.Lock()
     Wrapper that builds SpecTTTra + FeatureExtractor and loads frozen checkpoint.
     """
     feat_ext, model = build_spectttra_from_cfg(cfg, device)
+    model = load_frozen_spectttra(
+        model, "models/spectttra/spectttra_frozen.pth", device
+    )
     return feat_ext, model
     cfg = _CFG
     # Move waveform to device but keep float for mel extraction
+    waveform = audio_tensor.to(device, dtype=torch.float32)
     with torch.no_grad():
         # Extract mel-spectrogram
     # Refactors the loop to be a much faster single-batch operation
     try:
+        waveforms_batch = torch.cat(audio_tensors, dim=0).to(
+            device, dtype=torch.float32
+        )
     except Exception as e:
+        print(
+            f"[INFO] Error during tensor concatenation, falling back to loop. Fix preprocessing for speed. Error: {e}"
+        )
         batch_list = [spectttra_predict(w) for w in audio_tensors]
         return np.array(batch_list)
     with torch.no_grad():
         melspec = feat_ext(waveforms_batch)
+        # Ensure melspec shape matches model's expectation
+        expected_frames = model.input_temp_dim  # expected_frames is 3744
         if melspec.shape[2] > expected_frames:
             melspec = melspec[:, :, :expected_frames]
         elif melspec.shape[2] < expected_frames:
             tokens = model(melspec)
             pooled = tokens.mean(dim=1)
+    return pooled.cpu().numpy()