Auto-deploy from GitHub: 5ac21603a8274a2350875ec7db1bd58cbf2ee539
- config/data_config.yml +7 -3
- config/model_config.yml +5 -5
- scripts/explain_test.py +1 -1
- scripts/predict.py +38 -27
- scripts/train.py +111 -66
- src/llm2vectrain/llm2vec_trainer.py +3 -107
- src/llm2vectrain/model.py +1 -0
- src/models/mlp.py +12 -6
- src/musiclime/factorization.py +14 -1
- src/musiclime/wrapper.py +25 -27
- src/preprocessing/audio_preprocessor.py +41 -10
- src/preprocessing/preprocessor.py +70 -12
- src/spectttra/spectttra.py +99 -1
- src/spectttra/spectttra_trainer.py +43 -74
- src/utils/config_loader.py +5 -1
- src/utils/dataset.py +193 -23
config/data_config.yml
CHANGED
@@ -1,8 +1,12 @@
 base_dir: "."
 
 paths:
-  dataset_npz: "data/processed/…
-  dataset_csv: "data/external/…
+  dataset_npz: "data/processed/training_data_40k.npz"
+  dataset_csv: "data/external/dataset_40000.csv"
   raw_dir: "data/raw"
   processed_dir: "data/processed"
-  pca_path: "…
+  pca_path: "models/fusion/pca.pkl"
+  lyrics_scaler: "models/fusion/lyrics_scaler.pkl"
+  pca_scaler: "models/fusion/pca_scaler.pkl"
+  audio_scaler: "models/fusion/audio_scaler.pkl"
+  raw_dataset_npz: "data/processed/raw_training_data_40k.npz"
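These keys are consumed through src/utils/config_loader.py, whose diff is truncated at the bottom of this page. A minimal sketch of how that module likely resolves them, assuming it mirrors the constants imported elsewhere in this commit (DATASET_NPZ, RAW_DATASET_NPZ, DATASET_CSV, PCA_MODEL are assumptions beyond the one visible line):

# Hypothetical sketch of src/utils/config_loader.py; only BASE_DIR is visible in the diff below.
import yaml
from pathlib import Path

with open("config/data_config.yml") as f:
    config = yaml.safe_load(f)

BASE_DIR = Path(config["base_dir"]).resolve()  # matches the visible context line

# Assumed: each path key is resolved against BASE_DIR
DATASET_NPZ = str(BASE_DIR / config["paths"]["dataset_npz"])
RAW_DATASET_NPZ = str(BASE_DIR / config["paths"]["raw_dataset_npz"])
DATASET_CSV = str(BASE_DIR / config["paths"]["dataset_csv"])
PCA_MODEL = str(BASE_DIR / config["paths"]["pca_path"])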
config/model_config.yml
CHANGED
@@ -1,11 +1,11 @@
 mlp:
-  hidden_layers: […
-  dropout: [0.…
+  hidden_layers: [512, 256, 128] # 3 hidden layers
+  dropout: [0.5, 0.4, 0.3] # Dropout rates for each layer
   learning_rate: 0.0001 # Adam optimizer
   batch_size: 128 # Number of samples processed together
   epochs: 200 # Maximum training iterations
-  patience: …
+  patience: 15 # Early stopping patience
 
-  weight_decay: 0.…
+  weight_decay: 0.01 # L2 regularization
   gradient_clipping: 0.5 # Prevent exploding gradients
-  mixup_alpha: 0.…
+  mixup_alpha: 0.1 # For data augmentation during training, 0 disables MixUp
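mixup_alpha parameterizes a Beta distribution that blends pairs of training samples. The repo's trainer is not shown in this commit, so the following is only an illustrative sketch of how such a value is typically consumed:

# Illustrative MixUp step (not the repo's code): blend each sample with a random partner.
import numpy as np

def mixup_batch(x, y, alpha=0.1):
    """Return a convex combination of the batch with a shuffled copy of itself."""
    if alpha <= 0:
        return x, y  # alpha of 0 disables MixUp, as the config comment says
    rng = np.random.default_rng()
    lam = rng.beta(alpha, alpha)
    perm = rng.permutation(len(x))
    return lam * x + (1 - lam) * x[perm], lam * y + (1 - lam) * y[perm]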
scripts/explain_test.py
CHANGED
@@ -34,7 +34,7 @@ def explain():
         audio=y,
         lyrics=lyrics_text,
         predict_fn=predictor,
-        num_samples=…
+        num_samples=5,
         labels=(1,),
     )
 
scripts/predict.py
CHANGED
@@ -3,16 +3,13 @@ from src.spectttra.spectttra_trainer import spectttra_predict
 from src.llm2vectrain.model import load_llm2vec_model
 from src.llm2vectrain.llm2vec_trainer import l2vec_single_train, load_pca_model
 from src.models.mlp import build_mlp, load_config
-from pathlib import Path
-from src.utils.config_loader import DATASET_NPZ
 from src.utils.dataset import instance_scaler
 
-from pathlib import Path
 import numpy as np
-import …
+import pandas as pd
 
 
-def predict_pipeline(…
+def predict_pipeline(audio_file, lyrics):
     """
     Predict script which includes preprocessing, feature extraction, and
     training the MLP model for a single data sample.
@@ -34,49 +31,63 @@ def predict_pipeline(audio, lyrics: str):
     A numerical representation of the prediction
     """
 
-    # Instantiate…
-    X, Y = None, None
-
-    # Instantiate LLM2Vec Model
+    # 1.) Instantiate LLM2Vec Model
    llm2vec_model = load_llm2vec_model()
 
-    # Preprocess both audio and lyrics
-    audio, lyrics = single_preprocessing(…
+    # 2.) Preprocess both audio and lyrics
+    audio, lyrics = single_preprocessing(audio_file, lyrics)
 
-    # Call the train method for both models
+    # 3.) Call the train method for both models
    audio_features = spectttra_predict(audio)
    lyrics_features = l2vec_single_train(llm2vec_model, lyrics)
 
-    # …
+    # 4.) Scale the vectors using Z-Score
+    audio_features, lyrics_features = instance_scaler(audio_features, lyrics_features)
+
+    # 5.) Reduce the lyrics using saved PCA model
    reduced_lyrics = load_pca_model(lyrics_features)
 
-    # Scale the vectors using Z-Score
+    # Scale the vectors using Z-Score again
    audio_features, reduced_lyrics = instance_scaler(audio_features, reduced_lyrics)
 
-    # Concatenate the vectors of audio_features + lyrics_features
+    # 6.) Concatenate the vectors of audio_features + lyrics_features
    results = np.concatenate([audio_features, reduced_lyrics], axis=1)
 
    # ---- Load MLP Classifier ----
    config = load_config("config/model_config.yml")
    classifier = build_mlp(input_dim=results.shape[1], config=config)
 
-    # Load trained weights (make sure this path matches where you saved your model)
-    model_path = "models/mlp/…
+    # 7.) Load trained weights (make sure this path matches where you saved your model)
+    model_path = "models/mlp/mlp_best.pth"
    classifier.load_model(model_path)
    classifier.model.eval()
 
-    # Run prediction
-    probability, prediction, label = classifier.predict_single(results)
+    # 8.) Run prediction
+    probability, prediction, label = classifier.predict_single(results.flatten())
 
-    return {
-        "probability": probability,
-        "label": label,
-        "prediction": "AI-Generated" if prediction == 0 else "Human-Composed",
-    }
+    return {"probability": probability, "prediction": prediction, "label": label}
 
 
 if __name__ == "__main__":
     # Example usage (replace with real inputs, place song inside data/raw.)
-    …
-    …
-    …
+    data = pd.read_csv("data/raw/predict_data_final.csv")
+
+    result = []
+    label = []
+    for row in data.itertuples():
+        prediction = predict_pipeline(row.song, row.lyrics)
+        result.append(
+            {
+                "song": row.song,
+                "label": row.label,
+                "predicted_label": prediction["label"],
+                "probability": prediction["probability"],
+            }
+        )
+
+    for r in result:
+        print(f"Song: {r['song']}")
+        print(f"Actual Label: {r['label']}")
+        print(f"Predicted: {r['predicted_label']}")
+        print(f"Confidence: {r['probability']: .8f}%")
+        print("-" * 50)
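The pipeline calls instance_scaler twice: once on the raw audio/lyrics embeddings and again after PCA reduction. The function lives in src/utils/dataset.py, whose diff is not shown on this page; a minimal sketch of per-instance z-scoring, assuming it standardizes each vector by its own statistics:

# Hypothetical sketch of instance_scaler (assumed behavior, not the repo's code):
# z-score each feature vector independently so no fitted scaler is needed per sample.
import numpy as np

def instance_scaler(audio_features, lyrics_features, eps=1e-8):
    def zscore(v):
        v = np.asarray(v, dtype=np.float32).reshape(1, -1)
        return (v - v.mean()) / (v.std() + eps)
    return zscore(audio_features), zscore(lyrics_features)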
scripts/train.py
CHANGED
@@ -4,14 +4,14 @@ from src.llm2vectrain.model import load_llm2vec_model
 from src.llm2vectrain.llm2vec_trainer import l2vec_train
 from src.models.mlp import build_mlp, load_config
 
-from src.utils.config_loader import DATASET_NPZ
-from src.utils.dataset import dataset_scaler, dataset_splitter
-from sklearn.decomposition import PCA
+from src.utils.config_loader import DATASET_NPZ
 
 from pathlib import Path
+from src.utils.config_loader import DATASET_NPZ, RAW_DATASET_NPZ
+from src.utils.dataset import scale_pca
+
 import numpy as np
 import logging
-import joblib
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
@@ -21,6 +21,10 @@ def train_mlp_model(data : dict):
     """
     Train the MLP model with extracted features.
 
+    Parameters
+    ----------
+    data : dict{np.array}
+        A dictionary of np.arrays, containing the train/test/val split.
     Parameters
     ----------
     data : dict{np.array}
@@ -31,6 +35,11 @@ def train_mlp_model(data : dict):
     # Load MLP configuration
     config = load_config("config/model_config.yml")
 
+    # Destructure the dictionary to get data split
+    X_train, y_train = data["train"]
+    X_val, y_val = data["val"]
+    X_test, y_test = data["test"]
+
     # Destructure the dictionary to get data split
     X_train, y_train = data["train"]
     X_val, y_val = data["val"]
@@ -47,6 +56,7 @@ def train_mlp_model(data : dict):
 
     # Load best model and evaluate on test set
     try:
+        mlp_classifier.load_model("models/mlp/mlp_best.pth")
         mlp_classifier.load_model("models/mlp/mlp_best.pth")
         logger.info("Loaded best model for final evaluation")
     except FileNotFoundError:
@@ -55,8 +65,10 @@ def train_mlp_model(data : dict):
     # Final evaluation
     test_results = mlp_classifier.evaluate(X_test, y_test)
 
+
     # Save final model
     mlp_classifier.save_model("models/mlp/mlp_multimodal.pth")
+    mlp_classifier.save_model("models/mlp/mlp_multimodal.pth")
 
     logger.info("MLP training completed successfully!")
     logger.info(f"Final test accuracy: {test_results['test_accuracy']:.2f}%")
@@ -64,6 +76,7 @@ def train_mlp_model(data : dict):
     return mlp_classifier
 
 
+
 def train_pipeline():
     """
     Training script which includes preprocessing, feature extraction, and training the MLP model.
@@ -79,82 +92,114 @@ def train_pipeline():
     None
     """
 
-    # …
-    …
+    # Set constant sizes
+    BATCH_SIZE = 200
+    AUDIO_SIZE = 384
+    LYRIC_SIZE = 2048
 
-    dataset_path = Path(…
+    dataset_path = Path(RAW_DATASET_NPZ)
 
     if dataset_path.exists():
         logger.info("Training dataset already exists. Loading file...")
 
-        loaded_data = np.load(…
-        …
-        …
+        loaded_data = np.load(RAW_DATASET_NPZ)
+        data = {
+            "train": (loaded_data["X_train"], loaded_data["y_train"]),
+            "test": (loaded_data["X_test"], loaded_data["y_test"]),
+            "val": (loaded_data["X_val"], loaded_data["y_val"]),
+        }
     else:
+        logger.info("Training dataset does not exist. Processing data...")
         logger.info("Training dataset does not exist. Processing data...")
         # Get batches from dataset and return full Y labels
-        …
+        splits, split_lengths = dataset_read(batch_size=BATCH_SIZE)
         batch_count = 1
 
-        # Instantiate LLM2Vec…
-        …
-
-        # Preallocate…
-        …
-
-        # Save…
-        logger.info("Saving dataset…
-        np.savez(…
-
-    …
-    …
+        # Instantiate LLM2Vec Model
+        l2v = load_llm2vec_model()
+
+        # Preallocate arrays
+        X_train = np.zeros((split_lengths[0], AUDIO_SIZE + LYRIC_SIZE), dtype=np.float32)
+        X_test = np.zeros((split_lengths[1], AUDIO_SIZE + LYRIC_SIZE), dtype=np.float32)
+        X_val = np.zeros((split_lengths[2], AUDIO_SIZE + LYRIC_SIZE), dtype=np.float32)
+
+        y_train = np.zeros(split_lengths[0], dtype=np.int32)
+        y_test = np.zeros(split_lengths[1], dtype=np.int32)
+        y_val = np.zeros(split_lengths[2], dtype=np.int32)
+
+        X_splits = [X_train, X_test, X_val]
+        y_splits = [y_train, y_test, y_val]
+
+        # Loop through the three splits
+        for split_idx, split in enumerate(splits):
+            start_idx = 0
+
+            # Loop through batches for each split
+            for batch in split:
+                if len(batch) == 0:
+                    continue  # skip empty batch safely
+
+                logger.info(f"Bulk Preprocessing batch {batch_count}...")
+                audio, lyrics = bulk_preprocessing(batch, batch_count)
+                batch_labels = batch['target'].values
+
+                # Extract audio features
+                logger.info("Starting SpecTTTra feature extraction...")
+                audio_features = spectttra_train(audio)
+
+                # Call the train method for LLM2Vec
+                logger.info(f"\nStarting LLM2Vec feature extraction...")
+                lyric_features = l2vec_train(l2v, lyrics)
+
+                # Concatenate the two features
+                batch_feature = np.concatenate([audio_features, lyric_features], axis=1)
+
+                # Allocate them to the preallocated blocks
+                bsz = batch_feature.shape[0]
+                X_splits[split_idx][start_idx:start_idx + bsz, :] = batch_feature
+                y_splits[split_idx][start_idx:start_idx + bsz] = batch_labels
+
+                logger.info(f"Batch {batch_count}: {bsz} samples, start_idx={start_idx}")
+
+                batch_count += 1
+                start_idx += bsz
+
+        # Save raw (unscaled) dataset
+        logger.info("Saving raw dataset...")
+        np.savez(
+            RAW_DATASET_NPZ,
+            X_train=X_train, y_train=y_train,
+            X_val=X_val, y_val=y_val,
+            X_test=X_test, y_test=y_test,
+        )
+
+        # Run scaling
+        logger.info("Running standard scaling...")
+        data = {
+            "train": (X_train, y_train),
+            "val": (X_val, y_val),
+            "test": (X_test, y_test),
+        }
+
+    # Scale and use PCA fitting for all raw data
+    logger.info("Scaling and applying PCA...")
+    data = scale_pca(data)
+
+    # Save scaled dataset
+    X_train, y_train = data["train"]
+    X_val, y_val = data["val"]
+    X_test, y_test = data["test"]
 
+    logger.info("Saving scaled dataset...")
+    np.savez(
+        DATASET_NPZ,
+        X_train=X_train, y_train=y_train,
+        X_val=X_val, y_val=y_val,
+        X_test=X_test, y_test=y_test,
+    )
 
     logger.info("Starting MLP training...")
     train_mlp_model(data)
 
-
 if __name__ == "__main__":
     train_pipeline()
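scale_pca is imported from src.utils.dataset, whose full diff is not on this page. A hedged sketch of what it plausibly does, inferred from the artifacts this commit loads elsewhere (audio_scaler.pkl, lyrics_scaler.pkl, pca.pkl, pca_scaler.pkl) and the 384/2048-dimension constants above; the 512-component PCA matches the shape comments in wrapper.py but is still an assumption:

# Hypothetical sketch of scale_pca (assumed behavior): fit scalers and PCA on the
# train split only, apply them to every split, and persist them to models/fusion/.
import joblib
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

AUDIO_SIZE = 384  # matches the constant in train.py

def scale_pca(data, n_components=512):
    (X_tr, y_tr), (X_va, y_va), (X_te, y_te) = data["train"], data["val"], data["test"]

    def split_cols(X):
        # audio columns come first, per the concatenation order in train.py
        return X[:, :AUDIO_SIZE], X[:, AUDIO_SIZE:]

    a_tr, l_tr = split_cols(X_tr)
    audio_scaler = StandardScaler().fit(a_tr)
    lyrics_scaler = StandardScaler().fit(l_tr)
    pca = PCA(n_components=n_components).fit(lyrics_scaler.transform(l_tr))
    pca_scaler = StandardScaler().fit(pca.transform(lyrics_scaler.transform(l_tr)))

    def transform(X):
        a, l = split_cols(X)
        reduced = pca_scaler.transform(pca.transform(lyrics_scaler.transform(l)))
        return np.concatenate([audio_scaler.transform(a), reduced], axis=1)

    for name, obj in [("audio_scaler", audio_scaler), ("lyrics_scaler", lyrics_scaler),
                      ("pca", pca), ("pca_scaler", pca_scaler)]:
        joblib.dump(obj, f"models/fusion/{name}.pkl")

    return {
        "train": (transform(X_tr), y_tr),
        "val": (transform(X_va), y_va),
        "test": (transform(X_te), y_te),
    }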
src/llm2vectrain/llm2vec_trainer.py
CHANGED
@@ -1,115 +1,11 @@
-from sklearn.decomposition import IncrementalPCA
-from sklearn.preprocessing import StandardScaler
 from pathlib import Path
+from src.utils.config_loader import PCA_MODEL
 
-import numpy as np
-import pickle
-import torch
-import os
 import joblib
+import torch
 
-# Initialize PCA and StandardScaler globally for training
-_pca_trainer = None
-
-class SimplePCATrainer:
-    """
-    A simple PCA trainer that uses IncrementalPCA to fit data in batches.
-    It saves checkpoints every 5 batches and can save the final model.
-
-    Args:
-        None
-
-    Returns:
-        None
-
-    Attributes:
-        pca: The IncrementalPCA model.
-        scaler: StandardScaler for normalizing data.
-        fitted: Boolean indicating if the model has been initialized.
-        batch_count_pca: Counter for the number of batches processed.
-
-    Methods:
-        process_batch(vectors): Processes a batch of vectors, fits the PCA model incrementally.
-        save_final(model_path): Saves the final PCA model to the specified path.
-    """
-
-    # Initialize the trainer
-    def __init__(self):
-        self.pca = None
-        self.scaler = StandardScaler()
-        self.fitted = False
-        self.batch_count_pca = 0
-
-    def _determine_optimal_components(self, vectors):
-        """
-        Determine the optimal number of PCA components to retain 95% variance.
-
-        Args:
-            vectors: The input data to analyze.
-        Returns:
-            n_components: The optimal number of components.
-        """
-        temp_pca = IncrementalPCA()
-        temp_pca.fit(vectors)
-        cumsum_var = np.cumsum(temp_pca.explained_variance_ratio_)
-        n_comp_95 = np.argmax(cumsum_var >= 0.95) + 1
-        return min(n_comp_95, vectors.shape[1] // 2)
-
-    def process_batch(self, vectors):
-        """
-        Process a batch of vectors, fitting the PCA model incrementally.
-
-        Args:
-            vectors: The input data batch to process.
-        Returns:
-            reduced_vectors: The PCA-transformed data.
-
-        Note: This method saves a checkpoint every 5 batches.
-        """
-        if not self.fitted:
-            # First batch - initialize everything
-            n_components = self._determine_optimal_components(vectors)
-            self.pca = IncrementalPCA(n_components=n_components, batch_size=1000)
-            self.scaler.fit(vectors)
-            self.fitted = True
-            print(f"Initialized PCA with {n_components} components")
-
-        # Process batch
-        vectors_scaled = self.scaler.transform(vectors)
-        self.pca.partial_fit(vectors_scaled)
-        reduced_vectors = self.pca.transform(vectors_scaled)
-
-        self.batch_count_pca += 1
-
-        # Save checkpoint every 5 batches
-        if self.batch_count_pca % 5 == 0:
-            os.makedirs("pca_checkpoints", exist_ok=True)
-            with open(f"pca_checkpoints/checkpoint_batch_{self.batch_count_pca}.pkl", 'wb') as f:
-                pickle.dump({'pca': self.pca, 'scaler': self.scaler}, f)
-            print(f"Saved checkpoint at batch {self.batch_count_pca}")
-
-        print(f"Processed batch {self.batch_count_pca}, shape: {vectors.shape} -> {reduced_vectors.shape}")
-        return reduced_vectors
-
-    def save_final(self, model_path):
-        """
-        Save the final PCA model to the specified path.
-
-        Args:
-            model_path: The file path to save the PCA model.
-
-        Returns:
-            None
-
-        Note: Change the model path as needed in the data_config.yml file.
-        """
-        os.makedirs(os.path.dirname(model_path), exist_ok=True)
-        with open(model_path, 'wb') as f:
-            pickle.dump({'pca': self.pca, 'scaler': self.scaler}, f)
-        print(f"Final model saved to {model_path}. Total variance explained: {np.sum(self.pca.explained_variance_ratio_):.4f}")
 
 ## For Single Input
-def load_pca_model(vectors, model_path=…
+def load_pca_model(vectors, model_path=PCA_MODEL):
     """
     Load a pre-trained PCA model and transform the input vectors.
 
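The rest of load_pca_model is cut off above. A plausible body for the slimmed-down version, assuming the pickle at PCA_MODEL is now the fitted PCA object persisted by the training pipeline (rather than the old {'pca', 'scaler'} dict that SimplePCATrainer saved):

# Hypothetical sketch of the new load_pca_model body (assumed, not confirmed by the diff):
import joblib
import numpy as np

def load_pca_model(vectors, model_path="models/fusion/pca.pkl"):  # PCA_MODEL in the repo
    """Load a pre-trained PCA model and transform the input vectors."""
    pca = joblib.load(model_path)
    vectors = np.atleast_2d(np.asarray(vectors, dtype=np.float32))
    return pca.transform(vectors)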
src/llm2vectrain/model.py
CHANGED
@@ -16,6 +16,7 @@ def load_llm2vec_model():
     tokenizer = AutoTokenizer.from_pretrained(
         model_id, padding=True, truncation=True, max_length=512, cache_dir=cache_dir
     )
+
     config = AutoConfig.from_pretrained(
         model_id, trust_remote_code=True, cache_dir=cache_dir
     )
src/models/mlp.py
CHANGED
@@ -52,6 +52,7 @@ import torch.nn as nn
 import torch.optim as optim
 import numpy as np
 import yaml
+import torch.nn.functional as F
 
 logger = logging.getLogger(__name__)
@@ -441,7 +442,7 @@ class MLPClassifier:
 
         return probabilities, predictions
 
-    def predict_single(self, features: np.ndarray) -> Tuple[float, int, str]:
+    def predict_single(self, features: np.ndarray, temperature: float = 2.5) -> Tuple[float, int, str]:
         """
         Predict whether a single song is AI-generated or human-composed.
@@ -482,14 +483,19 @@ class MLPClassifier:
                 f"Expected features for 1 song, got {features.shape[0]} songs. Use predict_batch() instead."
             )
 
-        …
-        …
+        self.model.eval()
+        with torch.no_grad():
+            features_tensor = torch.FloatTensor(features).to(self.device)
+            outputs = self.model(features_tensor)
+            logit = torch.logit(outputs.clamp(1e-6, 1 - 1e-6))
+            probabilities = torch.sigmoid(logit / temperature).item()
+            probabilities = np.clip(probabilities, 0.01, 0.99)
 
         # Extract single results
-        …
-        prediction = int(predictions[0])
+        prediction = int(probabilities >= 0.5)
         label = "Human-Composed" if prediction == 1 else "AI-Generated"
-        …
+        probability = probabilities*100 if prediction == 1 else (1 - probabilities)*100
+
         return probability, prediction, label
 
     def predict_batch(self, features: np.ndarray, return_details: bool = False) -> Dict:
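The new predict_single applies temperature scaling: it recovers the logit from the sigmoid output, divides it by a temperature greater than 1, and re-applies the sigmoid, which pulls overconfident probabilities toward 0.5. A standalone illustration of the same arithmetic:

# Standalone illustration of the temperature scaling used in predict_single above.
import torch

raw = torch.tensor([0.98])                      # overconfident model output
logit = torch.logit(raw.clamp(1e-6, 1 - 1e-6))  # ln(0.98/0.02) ≈ 3.89
softened = torch.sigmoid(logit / 2.5)           # temperature 2.5 -> ≈ 0.83
print(float(raw[0]), float(softened[0]))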
src/musiclime/factorization.py
CHANGED
@@ -61,7 +61,20 @@ class OpenUnmixFactorization:
 
     def _separate_sources(self):
         waveform = np.expand_dims(self.audio, axis=1)
-        …
+
+        # Load openunmix .pth files from local dir
+        model_path = "models/musiclime"
+
+        # Specify targets
+        targets = ["vocals", "bass", "drums", "other"]
+
+        # Then load openunmix files to openunmix' method
+        prediction = predict.separate(
+            torch.as_tensor(waveform).float(),
+            rate=44100,
+            model_str_or_path=model_path,
+            targets=targets,
+        )
 
         components = [prediction[key][0].mean(dim=0).numpy() for key in prediction]
         names = list(prediction.keys())
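The comprehension on the last lines assumes openunmix's predict.separate returns a dict mapping each target name to a tensor of shape (nb_samples, nb_channels, nb_timesteps); indexing [0] takes the first item and mean(dim=0) folds the channels to mono. A self-contained shape check under that assumption, using dummy tensors in place of real estimates:

# Illustrative check of the shapes this code assumes from predict.separate.
import torch

prediction = {t: torch.randn(1, 2, 44100) for t in ["vocals", "bass", "drums", "other"]}
components = [prediction[key][0].mean(dim=0).numpy() for key in prediction]  # mono per target
print([c.shape for c in components])  # [(44100,), (44100,), (44100,), (44100,)]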
src/musiclime/wrapper.py
CHANGED
@@ -38,8 +38,6 @@ class MusicLIMEPredictor:
         processed_lyrics = []
 
         for i, (text, audio) in enumerate(zip(texts, audios)):
-            # if i % 100 == 0:
-            #     print(f"  Preprocessing {i+1}/{len(texts)}")
             processed_audio, processed_lyric = single_preprocessing(audio, text)
             processed_audios.append(processed_audio)
             processed_lyrics.append(processed_lyric)
@@ -74,44 +72,49 @@ class MusicLIMEPredictor:
             )
         )
 
-        # Step 3: …
+        # Step 3: Scale and reduce in batch
         start_time = time.time()
-        print("[MusicLIME] …")
-        pca_model = joblib.load("models/fusion/pca.pkl")
-        reduced_lyrics_batch = pca_model.transform(
-            lyrics_features_batch
-        )  # (batch, 256)
-        pca_time = time.time() - start_time
-        print(green_bold(f"[MusicLIME] PCA completed in {pca_time:.2f}s"))
-
-        # …
-        start_time = time.time()
-        print("[MusicLIME] Scaling features (batch)...")
+        print("[MusicLIME] Scaling and reducing features (batch)...")
+
+        # Load the trained scalers
         audio_scaler = joblib.load("models/fusion/audio_scaler.pkl")
-        lyric_scaler = joblib.load("models/fusion/…
+        lyric_scaler = joblib.load("models/fusion/lyrics_scaler.pkl")
 
+        # Then apply scaling to the batch
         scaled_audio_batch = audio_scaler.transform(
             audio_features_batch
         )  # (batch, 384)
         scaled_lyrics_batch = lyric_scaler.transform(
-            reduced_lyrics_batch
-        )  # (batch, …
+            lyrics_features_batch
+        )  # (batch, 2048)
+
+        # Step 4: Apply PCA to lyrics batch
+        print("[MusicLIME] Applying PCA to lyrics (batch)")
+        pca_model = joblib.load("models/fusion/pca.pkl")
+        reduced_lyrics_batch = pca_model.transform(scaled_lyrics_batch)  # (batch, 512)
+
+        # Step 5: Apply scaler to PCA-scaled lyrics batch
+        print("[MusicLIME] Reapplying scaler to PCA-scaled batch")
+        pca_scaler = joblib.load("models/fusion/pca_scaler.pkl")
+        reduced_lyrics_batch = pca_scaler.transform(
+            reduced_lyrics_batch
+        )  # (batch, 512)
 
-        # Step …
+        # Step 6: Concatenate features
         combined_features_batch = np.concatenate(
-            [scaled_audio_batch, …
-        )
+            [scaled_audio_batch, reduced_lyrics_batch], axis=1
+        )  # (batch, sum of lyrics & audio vector dims)
         scaling_time = time.time() - start_time
         print(green_bold(f"[MusicLIME] Scaling completed in {scaling_time:.2f}s"))
 
-        # Step …
+        # Step 7: Batch MLP prediction
         start_time = time.time()
         print("[MusicLIME] Running MLP predictions (batch)...")
         if self.classifier is None:
             self.classifier = build_mlp(
                 input_dim=combined_features_batch.shape[1], config=self.config
             )
-        self.classifier.load_model("models/mlp/…
+        self.classifier.load_model("models/mlp/mlp_best.pth")
 
         probabilities, predictions = self.classifier.predict(combined_features_batch)
@@ -122,17 +125,12 @@ class MusicLIMEPredictor:
 
         # Total time summary
         total_time = (
-            preprocessing_time
-            + audio_time
-            + lyrics_time
-            + pca_time
-            + scaling_time
-            + mlp_time
+            preprocessing_time + audio_time + lyrics_time + scaling_time + mlp_time
         )
         print(f"[MusicLIME] Batch processing complete!")
         print(
             green_bold(
-                f"[MusicLIME] Total time: {total_time:.2f}s (Preprocessing: {preprocessing_time:.2f}s, Audio: {audio_time:.2f}s, Lyrics: {lyrics_time:.2f}s, …
+                f"[MusicLIME] Total time: {total_time:.2f}s (Preprocessing: {preprocessing_time:.2f}s, Audio: {audio_time:.2f}s, Lyrics: {lyrics_time:.2f}s, Scaling: {scaling_time:.2f}s, MLP: {mlp_time:.2f}s)"
             )
         )
 
src/preprocessing/audio_preprocessor.py
CHANGED
@@ -39,7 +39,7 @@ class AudioPreprocessor:
 
     """
 
-    def __init__(self, script="train", waveform_norm="…
+    def __init__(self, script="train", waveform_norm="peak"):
        self.SCRIPT = script
        self.INPUT_SAMPLING = 48000
        self.TARGET_SAMPLING = 16000
@@ -71,7 +71,27 @@ class AudioPreprocessor:
             audiofile = f"{audiofile}.mp3"
             file = self.INPUT_PATH / audiofile
 
-            …
+            # FIXED: Force librosa to load properly
+            # Load at native sample rate first, then we will resample later
+            y, sr = librosa.load(str(file), sr=None, mono=False, dtype=np.float32)
+
+            # If loading fails (all zeros), try with explicit sample rate
+            if np.abs(y).max() < 0.0001:
+                print(f"Warning: First load failed, trying with sr=48000")
+                y, sr = librosa.load(
+                    str(file), sr=48000, mono=False, dtype=np.float32
+                )
+
+            # Last resort: use soundfile instead
+            if np.abs(y).max() < 0.0001:
+                print(f"Warning: Librosa failed, trying soundfile")
+                import soundfile as sf
+
+                y, sr = sf.read(str(file), dtype="float32")
+                if y.ndim == 2:
+                    y = y.T  # soundfile returns (samples, channels)
+                else:
+                    y = y[None, :]  # make it (1, samples)
 
         elif isinstance(audiofile, (bytes, io.BytesIO)):
             file = (
@@ -90,13 +110,20 @@ class AudioPreprocessor:
             else:
                 raise ValueError(f"Unsupported audiofile type: {type(audiofile)}")
 
-            # …
-            if y.…
-                …
+            # Verify we actually loaded audio
+            if np.abs(y).max() < 0.0001:
+                raise RuntimeError(
+                    f"Audio file appears to be silent or corrupted: {audiofile}"
+                )
+
+            # Ensure consistent shape
+            if y.ndim == 1:
+                y = y[None, :]
             else:
-                y = y.T
+                y = y.T if y.shape[0] > y.shape[1] else y
 
             waveform = torch.from_numpy(y).float()
+
             return waveform, sr
 
         except Exception as e:
@@ -182,7 +209,11 @@ class AudioPreprocessor:
         waveform : tensor
             Normalized audio waveform.
         """
-        if method == "…
+        if method == "peak":
+            # Normalize to [-1, 1] based on max absolute value to preserve relative dynamics
+            peak = waveform.abs().max()
+            return waveform / max(peak, 1e-6)
+        elif method == "std":
             std = waveform.std()
             return waveform / max(std, 1e-6)
         elif method == "minmax":
@@ -202,7 +233,7 @@ class AudioPreprocessor:
             Base filename to use.
         """
         self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
-        print(f"Saving {filename} to {self.OUTPUT_PATH}.")
+        # print(f"Saving {filename} to {self.OUTPUT_PATH}.")
 
         output_path = self.OUTPUT_PATH / f"{filename}"
@@ -233,7 +264,7 @@ class AudioPreprocessor:
 
         # Convert the audio into mono
         if waveform.shape[0] > 1:
-            print("Current audio is stereo. Converting to mono.")
+            # print("Current audio is stereo. Converting to mono.")
             waveform = waveform.mean(dim=0, keepdim=True)
 
         # If there is a skip value provided, trim it
@@ -245,7 +276,7 @@ class AudioPreprocessor:
         # Trim if more than 120 seconds, pad if less than
         waveform = self.pad_trim(waveform=waveform, random_crop=train)
 
-        # Normalize waveform (…
+        # Normalize waveform (used PEAK)
         waveform = self.normalize_waveform(waveform, method=self.WAVEFORM_NORM)
 
         # Add some gaussian noise to the waveform during training
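The commit flips the default from std to peak normalization. A standalone comparison of the two modes as implemented in normalize_waveform: peak scaling bounds the signal to [-1, 1], while std scaling gives unit variance but can exceed [-1, 1] on spiky audio.

# Standalone comparison of the two normalization modes above.
import torch

wave = torch.randn(1, 16000) * 0.3
peak_norm = wave / max(wave.abs().max(), 1e-6)  # the new "peak" default
std_norm = wave / max(wave.std(), 1e-6)         # the previous "std" behavior
print(peak_norm.abs().max().item(), std_norm.std().item())  # ~1.0 and ~1.0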
src/preprocessing/preprocessor.py
CHANGED
@@ -1,5 +1,6 @@
 import pandas as pd
 import numpy as np
+import math
 
 from src.preprocessing.audio_preprocessor import AudioPreprocessor
 from src.preprocessing.lyrics_preprocessor import LyricsPreprocessor
@@ -51,6 +52,43 @@ def bulk_preprocessing(batch: pd.DataFrame, batch_count: int):
     return audio_list, lyric_list
 
 
+def bulk_preprocessing_lyrics(batch: pd.DataFrame, batch_count: int):
+    """
+    Applies lyrics preprocessing to a training batch
+
+    Parameters
+    ----------
+    batch : pd.dataframe
+        Dataframe containing the batch data.
+
+    batch_count : int
+        Batch count value.
+
+    Returns
+    -------
+    lyric_list : list
+        List of loaded lyrics in string form.
+    """
+
+    lyric_preprocessor = LyricsPreprocessor()
+
+    lyric_list = []
+    count, batch_length = 1, len(batch)
+
+    print(f"Preprocessing training data with length {batch_length}\n")
+
+    for row in batch.itertuples():
+        print(f"Batch {batch_count} - {count}/{batch_length}")
+
+        # Preprocess lyric and append to lyric list
+        processed_lyric = lyric_preprocessor(lyrics=row.lyrics)
+        lyric_list.append(processed_lyric)
+
+        count += 1
+
+    return lyric_list
+
+
 def single_preprocessing(audio, lyric: str):
     """
     Preprocesses a single record of audio and lyric data
@@ -82,26 +120,46 @@ def single_preprocessing(audio, lyric: str):
     return processed_song, processed_lyric
 
 
-def dataset_read(batch_size…
+def dataset_read(batch_size=20):
     """
-    Reads the …
+    Reads the main dataset, splits it into the train/test/valid split, and computes
+    optimal number of samples per batch.
 
     Parameters
     ----------
-    …
+    batch_size : int
+        Number of data per batch
 
     Returns
     -------
-    …
-        List of …
+    split: list[splits]
+        A collection of the three splits
+
+    split_lengths : list[int]
+        List of the split lengths
     """
     dataset = pd.read_csv(DATASET_CSV)
-    label = dataset['target'].tolist()
 
-    …
-    …
+    train = dataset[dataset["split"] == "train"]
+    test = dataset[dataset["split"] == "test"]
+    val = dataset[dataset["split"] == "valid"]
+
+    # Find the minimum split size (ignoring empty splits)
+    min_split_size = min([len(train), len(test), len(val)])
+    # Clamp batch_size so it never exceeds the smallest split
+    effective_batch_size = min(batch_size, min_split_size if min_split_size > 0 else batch_size)
+
+    def make_splits(df, batch_size):
+        if len(df) == 0:
+            return []
+        n_splits = math.ceil(len(df) / batch_size)
+        return np.array_split(df, n_splits)
+
+    train_splits = make_splits(train, effective_batch_size)
+    test_splits = make_splits(test, effective_batch_size)
+    val_splits = make_splits(val, effective_batch_size)
+
+    splits = [train_splits, test_splits, val_splits]
+    split_lengths = [len(train), len(test), len(val)]
 
-    return …
+    return splits, split_lengths
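Note that np.array_split produces nearly equal chunks rather than chunks of exactly batch_size rows; each batch is at most batch_size. A quick demonstration of the same ceil-divide strategy on a toy frame:

# Demonstration of the batching math used in dataset_read above.
import math
import numpy as np
import pandas as pd

df = pd.DataFrame({"target": range(45)})
batch_size = 20
n_splits = math.ceil(len(df) / batch_size)  # 3 batches
batches = np.array_split(df, n_splits)
print([len(b) for b in batches])            # [15, 15, 15] -- roughly equal, each <= 20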
src/spectttra/spectttra.py
CHANGED
@@ -1,6 +1,9 @@
+import torch
 import torch.nn as nn
+from pathlib import Path
 from .transformer import Transformer
 from .tokenizer import STTokenizer
+from src.spectttra.feature import FeatureExtractor
 
 
 class SpecTTTra(nn.Module):
@@ -112,4 +115,99 @@ class SpecTTTra(nn.Module):
         # Transformer
         output = self.transformer(spectro_temporal_tokens)  # shape: (B, T/t + F/f, dim)
 
-        return output
+        return output
+
+
+def build_spectttra_from_cfg(cfg, device):
+    """
+    Constructs the SpecTTTra model and its associated FeatureExtractor from a given configuration.
+
+    Args:
+        cfg (SimpleNamespace): Configuration object containing model and feature extraction parameters. Expected attributes include:
+            - cfg.melspec.n_mels: Number of mel frequency bins.
+            - cfg.model: Model-specific parameters (e.g., embed_dim, t_clip, f_clip, etc.).
+        device (torch.device): The device on which the model and feature extractor will be allocated (e.g., 'cpu' or 'cuda').
+
+    Returns:
+        tuple:
+            FeatureExtractor: Initialized feature extraction module moved to the specified device.
+            SpecTTTra: Constructed SpecTTTra model moved to the specified device.
+    """
+
+    feat_ext = FeatureExtractor(cfg).to(device)
+
+    # The pre-trained model expects specific, fixed input dimensions.
+    # Hardcoded to ensure the model architecture matches the checkpoint weights exactly.
+    # The expected number of frames (n_frames) is taken directly from the RuntimeError message.
+    n_mels = cfg.melspec.n_mels  # n_mels should be 128
+    n_frames = 3744  # n_frames match the checkpoint's expectation
+
+    print(f"[INFO] Initializing SpecTTTra with fixed dimensions: n_mels={n_mels}, n_frames={n_frames}")
+
+    model_cfg = cfg.model
+    model = SpecTTTra(
+        input_spec_dim=n_mels,
+        input_temp_dim=n_frames,
+        embed_dim=model_cfg.embed_dim,
+        t_clip=model_cfg.t_clip,
+        f_clip=model_cfg.f_clip,
+        num_heads=model_cfg.num_heads,
+        num_layers=model_cfg.num_layers,
+        pre_norm=model_cfg.pre_norm,
+        pe_learnable=model_cfg.pe_learnable,
+        pos_drop_rate=model_cfg.pos_drop_rate,
+        attn_drop_rate=model_cfg.attn_drop_rate,
+        proj_drop_rate=model_cfg.proj_drop_rate,
+        mlp_ratio=model_cfg.mlp_ratio,
+    ).to(device)
+
+    return feat_ext, model
+
+
+def load_frozen_spectttra(model, ckpt_path, device):
+    """
+    Loads pretrained SpecTTTra weights from a frozen checkpoint file.
+
+    Args:
+        model (torch.nn.Module): An initialized SpecTTTra model instance to load weights into.
+        ckpt_path (str or Path): Path to the pretrained model checkpoint file (e.g., 'spectttra_frozen.pth').
+        device (torch.device): The device to map the loaded weights to (e.g., 'cpu' or 'cuda').
+
+    Returns:
+        model (torch.nn.Module): The SpecTTTra model with loaded pretrained weights, set to evaluation mode.
+
+    Raises:
+        FileNotFoundError: If the specified checkpoint file does not exist at `ckpt_path`.
+    """
+    ckpt_path = Path(ckpt_path)
+    if not ckpt_path.exists():
+        raise FileNotFoundError(
+            f"Pre-trained model not found at {ckpt_path}. "
+            "Please download 'pytorch_model.bin', rename to 'spectttra_frozen.pth', "
+            "and place it in the correct directory."
+        )
+
+    print(f"[INFO] Found SpecTTTra checkpoint at {ckpt_path}. Loading weights...")
+    state = torch.load(ckpt_path, map_location=device)
+
+    new_state_dict = {}
+    for k, v in state.items():
+        if k.startswith("encoder."):
+            new_key = k[len("encoder."):]
+            new_state_dict[new_key] = v
+        else:
+            new_state_dict[k] = v
+
+    # Now that the shapes match, this should load without a size mismatch error.
+    missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
+    if missing_keys:
+        # Might see a few missing keys if your SpecTTTra class is slightly different, but the core should load.
+        print(f"[WARNING] Missing keys in model: {missing_keys}")
+    if unexpected_keys:
+        # Seeing 'classifier' or 'ft_extractor' keys here is NORMAL and SAFE.
+        print(f"[INFO] Unused keys in checkpoint: {unexpected_keys}")
+
+    print("[INFO] Successfully loaded pre-trained SpecTTTra weights.")
+
+    model.eval()
+    return model
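The key-renaming loop in load_frozen_spectttra exists because the published checkpoint nests the backbone under an "encoder." prefix while the standalone SpecTTTra module uses bare names. A self-contained illustration of that remapping (the key names here are hypothetical, for demonstration only):

# Standalone illustration of the prefix stripping performed in load_frozen_spectttra.
state = {"encoder.transformer.blocks.0.attn.weight": 1, "classifier.weight": 2}
new_state_dict = {}
for k, v in state.items():
    new_key = k[len("encoder."):] if k.startswith("encoder.") else k
    new_state_dict[new_key] = v
print(new_state_dict)  # {'transformer.blocks.0.attn.weight': 1, 'classifier.weight': 2}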
src/spectttra/spectttra_trainer.py
CHANGED
|
@@ -1,11 +1,10 @@
|
|
| 1 |
import threading
|
| 2 |
import torch
|
| 3 |
import numpy as np
|
| 4 |
-
from pathlib import Path
|
| 5 |
from types import SimpleNamespace
|
| 6 |
|
| 7 |
from src.spectttra.feature import FeatureExtractor
|
| 8 |
-
from src.spectttra.spectttra import SpecTTTra
|
| 9 |
|
| 10 |
# Shared variables for the model and setup, loaded only once and reused (cache)
|
| 11 |
_PREDICTOR_LOCK = threading.Lock()
|
|
@@ -17,54 +16,10 @@ _DEVICE = None
|
|
| 17 |
|
| 18 |
def build_spectttra(cfg, device):
|
| 19 |
"""
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
Args:
|
| 23 |
-
cfg (SimpleNamespace): Configuration containing audio, mel-spectrogram, and model parameters.
|
| 24 |
-
device (torch.device): Target device for model and feature extractor.
|
| 25 |
-
|
| 26 |
-
Returns:
|
| 27 |
-
tuple:
|
| 28 |
-
FeatureExtractor: Module for converting raw audio into mel-spectrogram features.
|
| 29 |
-
SpecTTTra: Spectro-temporal transformer model initialized with checkpoint weights.
|
| 30 |
"""
|
| 31 |
-
feat_ext =
|
| 32 |
-
|
| 33 |
-
# Build model once using placeholder input to infer mel and frame dimensions
|
| 34 |
-
with torch.no_grad():
|
| 35 |
-
dummy_wave = torch.zeros(1, cfg.audio.max_len, device=device)
|
| 36 |
-
dummy_mel = feat_ext(dummy_wave.float())
|
| 37 |
-
_, n_mels, n_frames = dummy_mel.shape
|
| 38 |
-
|
| 39 |
-
model_cfg = cfg.model
|
| 40 |
-
model = SpecTTTra(
|
| 41 |
-
input_spec_dim=n_mels,
|
| 42 |
-
input_temp_dim=n_frames,
|
| 43 |
-
embed_dim=model_cfg.embed_dim,
|
| 44 |
-
t_clip=model_cfg.t_clip,
|
| 45 |
-
f_clip=model_cfg.f_clip,
|
| 46 |
-
num_heads=model_cfg.num_heads,
|
| 47 |
-
num_layers=model_cfg.num_layers,
|
| 48 |
-
pre_norm=model_cfg.pre_norm,
|
| 49 |
-
pe_learnable=model_cfg.pe_learnable,
|
| 50 |
-
pos_drop_rate=model_cfg.pos_drop_rate,
|
| 51 |
-
attn_drop_rate=model_cfg.attn_drop_rate,
|
| 52 |
-
proj_drop_rate=model_cfg.proj_drop_rate,
|
| 53 |
-
mlp_ratio=model_cfg.mlp_ratio,
|
| 54 |
-
).to(device)
|
| 55 |
-
|
| 56 |
-
# Load frozen checkpoint if it exists; otherwise, save initial state
|
| 57 |
-
ckpt_path = Path("models/spectttra/spectttra_frozen.pth")
|
| 58 |
-
if ckpt_path.exists():
|
| 59 |
-
state = torch.load(ckpt_path, map_location=device)
|
| 60 |
-
model.load_state_dict(state)
|
| 61 |
-
print(f"[INFO] Loaded frozen SpecTTTra checkpoint from {ckpt_path}")
|
| 62 |
-
else:
|
| 63 |
-
ckpt_path.parent.mkdir(parents=True, exist_ok=True)
|
| 64 |
-
torch.save(model.state_dict(), ckpt_path)
|
| 65 |
-
print(f"[INFO] Saved frozen SpecTTTra checkpoint to {ckpt_path}")
|
| 66 |
-
|
| 67 |
-
model.eval()
|
| 68 |
return feat_ext, model
|
| 69 |
|
| 70 |
|
|
@@ -118,20 +73,14 @@ def _init_predictor_once():
|
|
| 118 |
)
|
| 119 |
|
| 120 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 121 |
-
|
| 122 |
feat_ext, model = build_spectttra(cfg, device)
|
| 123 |
-
|
| 124 |
feat_ext.to(device)
|
| 125 |
|
| 126 |
# Move model to device (GPU if available) and allow faster inference with mixed precision
|
| 127 |
-
model.to(device)
|
| 128 |
-
model.eval()
|
| 129 |
|
| 130 |
# Cache
|
| 131 |
-
_FEAT_EXT = feat_ext
|
| 132 |
-
_MODEL = model
|
| 133 |
-
_CFG = cfg
|
| 134 |
-
_DEVICE = device
|
| 135 |
|
| 136 |
|
| 137 |
def spectttra_predict(audio_tensor):
|
|
@@ -147,6 +96,7 @@ def spectttra_predict(audio_tensor):
|
|
| 147 |
1D embedding vector of shape (embed_dim,). The embedding is obtained
|
| 148 |
by mean-pooling the transformer token outputs.
|
| 149 |
"""
|
|
|
|
| 150 |
global _FEAT_EXT, _MODEL, _CFG, _DEVICE
|
| 151 |
|
| 152 |
_init_predictor_once()
|
|
@@ -161,18 +111,25 @@ def spectttra_predict(audio_tensor):
|
|
| 161 |
|
| 162 |
with torch.no_grad():
|
| 163 |
# Extract mel-spectrogram
|
| 164 |
-
melspec = feat_ext(waveform)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
if device.type == "cuda":
|
| 167 |
with torch.cuda.amp.autocast(enabled=True):
|
| 168 |
-
tokens = model(melspec)
|
| 169 |
-
pooled = tokens.mean(dim=1)
|
| 170 |
else:
|
| 171 |
tokens = model(melspec)
|
| 172 |
pooled = tokens.mean(dim=1)
|
| 173 |
|
| 174 |
-
|
| 175 |
-
out = pooled.squeeze(0).cpu().numpy() # (embed_dim,)
|
| 176 |
return out
|
| 177 |
|
| 178 |
|
|
@@ -203,19 +160,31 @@ def spectttra_train(audio_tensors):
|
|
| 203 |
model = _MODEL
|
| 204 |
device = _DEVICE
|
| 205 |
|
| 206 |
-
batch
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
tokens = model(melspec) # (B, num_tokens, embed_dim)
|
| 214 |
-
pooled = tokens.mean(dim=1) # (B, embed_dim)
|
| 215 |
-
else:
|
| 216 |
tokens = model(melspec)
|
| 217 |
pooled = tokens.mean(dim=1)
|
| 218 |
-
|
| 219 |
-
|
|
|
|
| 220 |
|
| 221 |
-
return
|
|
|
|
| 1 |
import threading
|
| 2 |
import torch
|
| 3 |
import numpy as np
|
|
|
|
| 4 |
from types import SimpleNamespace
|
| 5 |
|
| 6 |
from src.spectttra.feature import FeatureExtractor
|
| 7 |
+
from src.spectttra.spectttra import SpecTTTra, build_spectttra_from_cfg, load_frozen_spectttra
|
| 8 |
|
| 9 |
# Shared variables for the model and setup, loaded only once and reused (cache)
|
| 10 |
_PREDICTOR_LOCK = threading.Lock()
|
|
|
|
| 16 |
|
| 17 |
 def build_spectttra(cfg, device):
     """
+    Wrapper that builds SpecTTTra + FeatureExtractor and loads frozen checkpoint.
     """
+    feat_ext, model = build_spectttra_from_cfg(cfg, device)
+    model = load_frozen_spectttra(model, "models/spectttra/spectttra_frozen.pth", device)
     return feat_ext, model

     )

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     feat_ext, model = build_spectttra(cfg, device)
     feat_ext.to(device)

     # Move model to device (GPU if available) and allow faster inference with mixed precision
+    model.to(device).eval()

     # Cache
+    _FEAT_EXT, _MODEL, _CFG, _DEVICE = feat_ext, model, cfg, device


 def spectttra_predict(audio_tensor):
         1D embedding vector of shape (embed_dim,). The embedding is obtained
         by mean-pooling the transformer token outputs.
     """
+
     global _FEAT_EXT, _MODEL, _CFG, _DEVICE

     _init_predictor_once()

     with torch.no_grad():
         # Extract mel-spectrogram
+        melspec = feat_ext(waveform)
+
+        # Ensure melspec shape matches model's expectation
+        expected_frames = model.input_temp_dim  # expected_frames is 3744
+        if melspec.shape[2] > expected_frames:
+            melspec = melspec[:, :, :expected_frames]
+        elif melspec.shape[2] < expected_frames:
+            padding = expected_frames - melspec.shape[2]
+            melspec = torch.nn.functional.pad(melspec, (0, padding))

         if device.type == "cuda":
             with torch.cuda.amp.autocast(enabled=True):
+                tokens = model(melspec)
+                pooled = tokens.mean(dim=1)
         else:
             tokens = model(melspec)
             pooled = tokens.mean(dim=1)

+    out = pooled.squeeze(0).cpu().numpy()  # (embed_dim,)
     return out

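For reference, a minimal call sketch of the new spectttra_predict path, assuming the frozen checkpoint and model config are already in place (the waveform length and 16 kHz rate below are illustrative; the pad-or-truncate step above normalizes the spectrogram to input_temp_dim frames either way):

import torch
from src.spectttra.spectttra_trainer import spectttra_predict

waveform = torch.randn(1, 16000 * 30)    # (1, num_samples): ~30 s of mono audio, rate assumed
embedding = spectttra_predict(waveform)  # numpy array of shape (embed_dim,)
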
     model = _MODEL
     device = _DEVICE

+    # Refactors the loop to be a much faster single-batch operation
+    try:
+        waveforms_batch = torch.cat(audio_tensors, dim=0).to(device).float()
+    except Exception as e:
+        print(f"[INFO] Error during tensor concatenation, falling back to loop. Fix preprocessing for speed. Error: {e}")
+        batch_list = [spectttra_predict(w) for w in audio_tensors]
+        return np.array(batch_list)
+
+    with torch.no_grad():
+        melspec = feat_ext(waveforms_batch)
+
+        # Ensure melspec shape matches model's expectation
+        expected_frames = model.input_temp_dim  # expected_frames is 3744
+        if melspec.shape[2] > expected_frames:
+            melspec = melspec[:, :, :expected_frames]
+        elif melspec.shape[2] < expected_frames:
+            padding = expected_frames - melspec.shape[2]
+            melspec = torch.nn.functional.pad(melspec, (0, padding))

+        if device.type == "cuda":
+            with torch.cuda.amp.autocast(enabled=True):
                 tokens = model(melspec)
                 pooled = tokens.mean(dim=1)
+        else:
+            tokens = model(melspec)
+            pooled = tokens.mean(dim=1)

+    return pooled.cpu().numpy()
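The pad-or-truncate block is now duplicated in spectttra_predict and spectttra_train; if one copy drifts, the two paths will disagree on input shape. A minimal sketch of how it could be factored out (the helper name fit_time_dim is illustrative, not part of this commit):

import torch
import torch.nn.functional as F

def fit_time_dim(melspec: torch.Tensor, expected_frames: int) -> torch.Tensor:
    # Truncate or right-pad the time axis (dim 2) to exactly expected_frames,
    # e.g. model.input_temp_dim (3744 in this configuration).
    if melspec.shape[2] > expected_frames:
        return melspec[:, :, :expected_frames]
    if melspec.shape[2] < expected_frames:
        return F.pad(melspec, (0, expected_frames - melspec.shape[2]))
    return melspec
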
src/utils/config_loader.py
CHANGED
@@ -9,7 +9,11 @@ BASE_DIR = Path(config["base_dir"]).resolve()

 # Resolve paths
 DATASET_NPZ = BASE_DIR / config["paths"]["dataset_npz"]
 DATASET_CSV = BASE_DIR / config["paths"]["dataset_csv"]
 RAW_DIR = BASE_DIR / config["paths"]["raw_dir"]
 PROCESSED_DIR = BASE_DIR / config["paths"]["processed_dir"]
-PCA_MODEL = BASE_DIR / config["paths"]["pca_path"]

 # Resolve paths
 DATASET_NPZ = BASE_DIR / config["paths"]["dataset_npz"]
+RAW_DATASET_NPZ = BASE_DIR / config["paths"]["raw_dataset_npz"]
 DATASET_CSV = BASE_DIR / config["paths"]["dataset_csv"]
 RAW_DIR = BASE_DIR / config["paths"]["raw_dir"]
 PROCESSED_DIR = BASE_DIR / config["paths"]["processed_dir"]
+PCA_MODEL = BASE_DIR / config["paths"]["pca_path"]
+AUDIO_SCALER = BASE_DIR / config["paths"]["audio_scaler"]
+LYRICS_SCALER = BASE_DIR / config["paths"]["lyrics_scaler"]
+PCA_SCALER = BASE_DIR / config["paths"]["pca_scaler"]
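With these constants in place, downstream code loads the fitted artifacts by configured path instead of hard-coding locations. A minimal usage sketch (assuming the scalers were serialized with joblib, as src/utils/dataset.py below does):

import joblib
from src.utils.config_loader import AUDIO_SCALER, LYRICS_SCALER

# Paths resolve relative to BASE_DIR from the data config.
audio_scaler = joblib.load(AUDIO_SCALER)
lyric_scaler = joblib.load(LYRICS_SCALER)
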
src/utils/dataset.py
CHANGED
@@ -1,45 +1,132 @@
-from sklearn.preprocessing import StandardScaler
 from sklearn.model_selection import train_test_split

 import joblib
 import numpy as np
 import logging

 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)


-def dataset_splitter(X: np.ndarray, Y: np.ndarray):
     """
-

     Parameters
     ----------
     X : np.array
-
     Y : np.array
-

     Returns
     -------
-    data : dict
-        A dictionary of np.arrays,
     """

     logger.info(f"Dataset shape: {X.shape}, Labels: {len(Y)}")
     logger.info(f"Class distribution: {np.bincount(Y)}")

-    #
     X_train, X_test, y_train, y_test = train_test_split(
-        X, Y, test_size=0.1, random_state=42, stratify=Y
     )
-
     X_train, X_val, y_train, y_val = train_test_split(
         X_train, y_train, test_size=0.2222, random_state=42, stratify=y_train
     )
-
     logger.info(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

     data = {
         "train": (X_train, y_train),
         "val": (X_val, y_val),

@@ -49,6 +136,92 @@ def dataset_splitter(X: np.ndarray, Y: np.ndarray):
     return data


 def dataset_scaler(audio: np.ndarray, lyrics: np.ndarray):
     """
     Method to scale both audio and lyric vectors using Z-Score.

@@ -68,7 +241,7 @@ def dataset_scaler(audio: np.ndarray, lyrics: np.ndarray):
     -------
     scaled_audio : np.array
         Array of scaled audio features
-
         Array of scaled lyric features
     """

@@ -76,14 +249,11 @@ def dataset_scaler(audio: np.ndarray, lyrics: np.ndarray):
     audio_scaler = StandardScaler().fit(audio)
     lyric_scaler = StandardScaler().fit(lyrics)

-    scaled_audio = audio_scaler.transform(audio)
-    scaled_lyrics = lyric_scaler.transform(lyrics)
-
     # Save the trained scalers for prediction
-    joblib.dump(audio_scaler,
-    joblib.dump(lyric_scaler,

-    return


 def instance_scaler(audio: np.ndarray, lyrics: np.ndarray):

@@ -101,15 +271,15 @@ def instance_scaler(audio: np.ndarray, lyrics: np.ndarray):
     -------
     scaled_audio : np.array
         Array of scaled audio feature
-
         Array of scaled lyric feature
     """

     # Apply scalers to the single inputs
-    audio_scaler = joblib.load(
-    lyric_scaler = joblib.load(

     scaled_audio = audio_scaler.transform([audio])
-

-    return scaled_audio,
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.model_selection import train_test_split
+from src.utils.config_loader import AUDIO_SCALER, LYRICS_SCALER, PCA_SCALER
+from sklearn.decomposition import IncrementalPCA
+from src.utils.config_loader import PCA_MODEL

 import joblib
 import numpy as np
 import logging
+import pandas as pd

 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)


+def dataset_splitter(X: np.ndarray, Y: np.ndarray, ids: np.ndarray = None):
     """
+    Splits X, Y (and optional ids) into stratified train/val/test sets.

     Parameters
     ----------
     X : np.array
+        Feature vectors
     Y : np.array
+        Labels
+    ids : np.array, optional
+        Identifiers (filenames or row indices)

     Returns
     -------
+    data : dict
+        A dictionary of np.arrays: {train, val, test}
+        Each value is a tuple (X_split, y_split)
     """

     logger.info(f"Dataset shape: {X.shape}, Labels: {len(Y)}")
     logger.info(f"Class distribution: {np.bincount(Y)}")

+    # First split: train vs test (ids, when provided, are split alongside X and Y)
+    if ids is not None:
+        X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(
+            X, Y, ids, test_size=0.1, random_state=42, stratify=Y
+        )
+    else:
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, Y, test_size=0.1, random_state=42, stratify=Y
+        )
+
+    # Second split: train vs val
     X_train, X_val, y_train, y_val = train_test_split(
         X_train, y_train, test_size=0.2222, random_state=42, stratify=y_train
     )
+
     logger.info(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

+    data = {
+        "train": (X_train, y_train),
+        "val": (X_val, y_val),
+        "test": (X_test, y_test),
+    }
+
+    return data

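The two-stage split gives roughly 70/20/10 train/val/test: the second split takes 0.2222 of the remaining 90%, i.e. about 20% of the full set. A quick sanity-check sketch (random data, illustrative shapes only):

import numpy as np
from src.utils.dataset import dataset_splitter

X = np.random.rand(1000, 8)
Y = np.random.randint(0, 4, size=1000)

splits = dataset_splitter(X, Y)
for name, (X_s, y_s) in splits.items():
    print(name, X_s.shape)  # train (700, 8), val (200, 8), test (100, 8)
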
+def scale_pca(data : dict):
+    """
+    Script that scales the splits and applies PCA to the lyrics vector.
+
+    Parameters
+    ----------
+    data : dictionary
+        Dictionary containing the splits
+
+    Returns
+    -------
+    data : dict{np.array}
+        A dictionary of np.arrays, containing the train/test/val split.
+    """
+
+    # Destructure the dictionary to get the data splits
+    X_train, y_train = data["train"]
+    X_val, y_val = data["val"]
+    X_test, y_test = data["test"]
+
+    # Segment the concatenated embedding into audio and lyrics
+    X_train_audio, X_train_lyrics = X_train[:, :384], X_train[:, 384:]
+    X_test_audio, X_test_lyrics = X_test[:, :384], X_test[:, 384:]
+    X_val_audio, X_val_lyrics = X_val[:, :384], X_val[:, 384:]
+
+    # Fit the scalers on the train data; reuse them to transform test and validation
+    audio_scaler, lyric_scaler = dataset_scaler(X_train_audio, X_train_lyrics)
+
+    # Transform the rest of the splits using the scalers
+    X_train_audio = audio_scaler.transform(X_train_audio)
+    X_test_audio = audio_scaler.transform(X_test_audio)
+    X_val_audio = audio_scaler.transform(X_val_audio)
+
+    X_train_lyrics = lyric_scaler.transform(X_train_lyrics)
+    X_test_lyrics = lyric_scaler.transform(X_test_lyrics)
+    X_val_lyrics = lyric_scaler.transform(X_val_lyrics)
+
+    # Fit PCA on TRAINING lyrics only
+    ipca = IncrementalPCA(n_components=512)
+    batch_size = 1000
+
+    for i in range(0, X_train_lyrics.shape[0], batch_size):
+        ipca.partial_fit(X_train_lyrics[i:i + batch_size])
+
+    # Transform in batches
+    X_train_lyrics = ipca.transform(X_train_lyrics)
+    X_test_lyrics = ipca.transform(X_test_lyrics)
+    X_val_lyrics = ipca.transform(X_val_lyrics)
+
+    # Apply a scaler to the PCA output
+    pca_lyric_scaler = StandardScaler().fit(X_train_lyrics)
+
+    X_train_lyrics = pca_lyric_scaler.transform(X_train_lyrics)
+    X_test_lyrics = pca_lyric_scaler.transform(X_test_lyrics)
+    X_val_lyrics = pca_lyric_scaler.transform(X_val_lyrics)
+
+    # Concatenate them back to their original form, but scaled
+    X_train = np.concatenate([X_train_audio, X_train_lyrics], axis=1)
+    X_test = np.concatenate([X_test_audio, X_test_lyrics], axis=1)
+    X_val = np.concatenate([X_val_audio, X_val_lyrics], axis=1)
+
+    joblib.dump(ipca, PCA_MODEL)
+    # Save the trained scalers for prediction
+    joblib.dump(pca_lyric_scaler, PCA_SCALER)
+
     data = {
         "train": (X_train, y_train),
         "val": (X_val, y_val),
     return data


+def scale_pca_lyrics(data : dict):
+    """
+    Script that scales the splits and applies PCA to the lyrics vector.
+
+    Parameters
+    ----------
+    data : dictionary
+        Dictionary containing the splits
+
+    Returns
+    -------
+    data : dict{np.array}
+        A dictionary of np.arrays, containing the train/test/val split.
+    """
+
+    # Destructure the dictionary to get the data splits
+    X_train, y_train = data["train"]
+    X_val, y_val = data["val"]
+    X_test, y_test = data["test"]
+
+    lyric_scaler = StandardScaler().fit(X_train)
+    joblib.dump(lyric_scaler, LYRICS_SCALER)
+
+    X_train = lyric_scaler.transform(X_train)
+    X_test = lyric_scaler.transform(X_test)
+    X_val = lyric_scaler.transform(X_val)
+
+    # Fit PCA on TRAINING lyrics only
+    ipca = IncrementalPCA(n_components=512)
+    batch_size = 1000
+
+    for i in range(0, X_train.shape[0], batch_size):
+        ipca.partial_fit(X_train[i:i + batch_size])
+
+    # Transform in batches
+    X_train = ipca.transform(X_train)
+    X_test = ipca.transform(X_test)
+    X_val = ipca.transform(X_val)
+
+    joblib.dump(ipca, PCA_MODEL)
+
+    data = {
+        "train": (X_train, y_train),
+        "val": (X_val, y_val),
+        "test": (X_test, y_test),
+    }
+
+    return data


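At prediction time the lyrics branch has to replay the same chain in the same order: lyric scaler, then the saved IncrementalPCA, then (for the scale_pca path) the PCA-output scaler. A minimal sketch of that inverse path (the 1024-dim input is illustrative):

import joblib
import numpy as np
from src.utils.config_loader import LYRICS_SCALER, PCA_MODEL, PCA_SCALER

lyrics_vec = np.random.rand(1, 1024)                       # placeholder lyrics embedding
lyrics_vec = joblib.load(LYRICS_SCALER).transform(lyrics_vec)
lyrics_vec = joblib.load(PCA_MODEL).transform(lyrics_vec)  # -> (1, 512)
lyrics_vec = joblib.load(PCA_SCALER).transform(lyrics_vec)
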
+def scale(data : dict):
+    """
+    Script that scales the splits using Z-Score; no PCA is applied here.
+
+    Parameters
+    ----------
+    data : dictionary
+        Dictionary containing the splits
+
+    Returns
+    -------
+    data : dict{np.array}
+        A dictionary of np.arrays, containing the train/test/val split.
+    """
+
+    # Destructure the dictionary to get the data splits
+    X_train, y_train = data["train"]
+    X_val, y_val = data["val"]
+    X_test, y_test = data["test"]
+
+    audio_scaler = StandardScaler(with_mean=False).fit(X_train)
+    joblib.dump(audio_scaler, AUDIO_SCALER)
+
+    # Transform the rest of the splits using the scaler
+    X_train = audio_scaler.transform(X_train)
+    X_test = audio_scaler.transform(X_test)
+    X_val = audio_scaler.transform(X_val)
+
+    data = {
+        "train": (X_train, y_train),
+        "val": (X_val, y_val),
+        "test": (X_test, y_test),
+    }
+
+    return data
+
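A note on scale: StandardScaler(with_mean=False) divides each feature by its standard deviation without centering it, so entries that are exactly zero stay zero; the commit does not state the motivation, but that property is what the flag buys. A quick check:

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[0.0, 2.0], [0.0, 4.0], [3.0, 6.0]])
print(StandardScaler(with_mean=False).fit_transform(X))  # zeros in column 0 remain zero
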
 def dataset_scaler(audio: np.ndarray, lyrics: np.ndarray):
     """
     Method to scale both audio and lyric vectors using Z-Score.

     -------
     scaled_audio : np.array
         Array of scaled audio features
+    scaled_lyrics : np.array
         Array of scaled lyric features
     """

     audio_scaler = StandardScaler().fit(audio)
     lyric_scaler = StandardScaler().fit(lyrics)

     # Save the trained scalers for prediction
+    joblib.dump(audio_scaler, AUDIO_SCALER)
+    joblib.dump(lyric_scaler, LYRICS_SCALER)

+    return audio_scaler, lyric_scaler


 def instance_scaler(audio: np.ndarray, lyrics: np.ndarray):

     -------
     scaled_audio : np.array
         Array of scaled audio feature
+    scaled_lyric : np.array
         Array of scaled lyric feature
     """

     # Apply scalers to the single inputs
+    audio_scaler = joblib.load(AUDIO_SCALER)
+    lyric_scaler = joblib.load(LYRICS_SCALER)

     scaled_audio = audio_scaler.transform([audio])
+    scaled_lyric = lyric_scaler.transform(lyrics)

+    return scaled_audio, scaled_lyric
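Taken together, the intended training-time order is: split first, then fit every scaler and the IncrementalPCA on the training split only, so that no test or validation statistics leak into the preprocessing. A condensed end-to-end sketch (dimensions and sample counts illustrative; the 384/1024 split mirrors the audio/lyrics segmentation in scale_pca, and the fitted artifacts are written to the paths configured in data_config.yml):

import numpy as np
from src.utils.dataset import dataset_splitter, scale_pca

X = np.random.rand(10000, 384 + 1024)   # concatenated audio + lyrics embeddings
Y = np.random.randint(0, 4, size=10000)

splits = dataset_splitter(X, Y)          # stratified ~70/20/10
splits = scale_pca(splits)               # scalers + PCA fit on the train split only
X_train, y_train = splits["train"]       # X_train is now (7000, 896): 384 audio + 512 PCA lyrics dims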