Spaces:

krislette
/

bach-or-bot

Sleeping

App Files Community

krislette committed on Oct 13, 2025

Commit

61f21af

1 Parent(s): 6530321

Auto-deploy from GitHub: 7c591156b27da3e33cf2a35fbb1d3fdf593c7e3f

Browse files

Files changed (3) hide show

Dockerfile +1 -0
src/musiclime/wrapper.py +12 -1
src/spectttra/spectttra_trainer.py +40 -25

Dockerfile CHANGED Viewed

@@ -51,6 +51,7 @@ ENV NUMBA_CACHE_DIR="/tmp/numba_cache"
 ENV NUMBA_DISABLE_JIT=0
 ENV MUSICLIME_NUM_SAMPLES=1000
 ENV MUSICLIME_NUM_FEATURES=10
 # Hugging Face Spaces specific, expose port 7860
 EXPOSE 7860

 ENV NUMBA_DISABLE_JIT=0
 ENV MUSICLIME_NUM_SAMPLES=1000
 ENV MUSICLIME_NUM_FEATURES=10
+ENV PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True
 # Hugging Face Spaces specific, expose port 7860
 EXPOSE 7860

src/musiclime/wrapper.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import time
 import joblib
 import numpy as np
 from src.preprocessing.preprocessor import single_preprocessing
 from src.spectttra.spectttra_trainer import spectttra_train
@@ -86,7 +87,12 @@ class MusicLIMEPredictor:
         # Step 2: Batch feature extraction
         start_time = time.time()
         print("[MusicLIME] Extracting audio features (batch)...")
-        audio_features_batch = spectttra_train(processed_audios)  # (batch, 384)
         audio_time = time.time() - start_time
         print(
             green_bold(
@@ -99,6 +105,11 @@ class MusicLIMEPredictor:
         lyrics_features_batch = l2vec_train(
             self.llm2vec_model, processed_lyrics
         )  # (batch, 2048)
         lyrics_time = time.time() - start_time
         print(
             green_bold(

 import time
 import joblib
 import numpy as np
+import torch
 from src.preprocessing.preprocessor import single_preprocessing
 from src.spectttra.spectttra_trainer import spectttra_train
         # Step 2: Batch feature extraction
         start_time = time.time()
         print("[MusicLIME] Extracting audio features (batch)...")
+        audio_features_batch = spectttra_train(processed_audios)
+        # Clear GPU cache after audio processing
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
         audio_time = time.time() - start_time
         print(
             green_bold(
         lyrics_features_batch = l2vec_train(
             self.llm2vec_model, processed_lyrics
         )  # (batch, 2048)
+        # Clear GPU cache after lyrics processing
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
         lyrics_time = time.time() - start_time
         print(
             green_bold(

src/spectttra/spectttra_trainer.py CHANGED Viewed

@@ -166,35 +166,50 @@ def spectttra_train(audio_tensors):
     model = _MODEL
     device = _DEVICE
-    # Refactors the loop to be a much faster single-batch operation
-    try:
-        waveforms_batch = torch.cat(audio_tensors, dim=0).to(
-            device, dtype=torch.float32
-        )
-    except Exception as e:
         print(
-            f"[INFO] Error during tensor concatenation, falling back to loop. Fix preprocessing for speed. Error: {e}"
         )
-        batch_list = [spectttra_predict(w) for w in audio_tensors]
-        return np.array(batch_list)
-    with torch.no_grad():
-        melspec = feat_ext(waveforms_batch)
-        # Ensure melspec shape matches model's expectation
-        expected_frames = model.input_temp_dim  # expected_frames is 3744
-        if melspec.shape[2] > expected_frames:
-            melspec = melspec[:, :, :expected_frames]
-        elif melspec.shape[2] < expected_frames:
-            padding = expected_frames - melspec.shape[2]
-            melspec = torch.nn.functional.pad(melspec, (0, padding))
         if device.type == "cuda":
-            with torch.cuda.amp.autocast(enabled=True):
-                tokens = model(melspec)
-                pooled = tokens.mean(dim=1)
-        else:
-            tokens = model(melspec)
-            pooled = tokens.mean(dim=1)
-    return pooled.cpu().numpy()

     model = _MODEL
     device = _DEVICE
+    # Chunk processing: Process in smaller batches
+    chunk_size = 50
+    all_embeddings = []
+    for i in range(0, len(audio_tensors), chunk_size):
+        chunk = audio_tensors[i : i + chunk_size]
         print(
+            f"[INFO] Processing chunk {i//chunk_size + 1}/{(len(audio_tensors)-1)//chunk_size + 1} ({len(chunk)} samples)"
         )
+        try:
+            waveforms_batch = torch.cat(chunk, dim=0).to(device).float()
+        except Exception as e:
+            print(
+                f"[INFO] Error during tensor concatenation, falling back to loop. Error: {e}"
+            )
+            batch_list = [spectttra_predict(w) for w in chunk]
+            all_embeddings.extend(batch_list)
+            continue
+        with torch.no_grad():
+            melspec = feat_ext(waveforms_batch)
+            # Ensure melspec shape matches model's expectation
+            expected_frames = model.input_temp_dim
+            if melspec.shape[2] > expected_frames:
+                melspec = melspec[:, :, :expected_frames]
+            elif melspec.shape[2] < expected_frames:
+                padding = expected_frames - melspec.shape[2]
+                melspec = torch.nn.functional.pad(melspec, (0, padding))
+            if device.type == "cuda":
+                with torch.cuda.amp.autocast(enabled=True):
+                    tokens = model(melspec)
+                    pooled = tokens.mean(dim=1)
+            else:
+                tokens = model(melspec)
+                pooled = tokens.mean(dim=1)
+        chunk_embeddings = pooled.cpu().numpy()
+        all_embeddings.append(chunk_embeddings)
+        # Clear GPU cache after each chunk
         if device.type == "cuda":
+            torch.cuda.empty_cache()
+    return np.vstack(all_embeddings)