krislette committed
Commit 75d43d2 · 1 Parent(s): 5fe7fd6

Auto-deploy from GitHub: 5ac21603a8274a2350875ec7db1bd58cbf2ee539

config/data_config.yml CHANGED
@@ -1,8 +1,12 @@
 base_dir: "."

 paths:
-  dataset_npz: "data/processed/training_data.npz"
-  dataset_csv: "data/external/songs_dataset.csv"
+  dataset_npz: "data/processed/training_data_40k.npz"
+  dataset_csv: "data/external/dataset_40000.csv"
   raw_dir: "data/raw"
   processed_dir: "data/processed"
-  pca_path: "data/processed/pca_model.pkl"
+  pca_path: "models/fusion/pca.pkl"
+  lyrics_scaler: "models/fusion/lyrics_scaler.pkl"
+  pca_scaler: "models/fusion/pca_scaler.pkl"
+  audio_scaler: "models/fusion/audio_scaler.pkl"
+  raw_dataset_npz: "data/processed/raw_training_data_40k.npz"
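For context, src/utils/config_loader.py (updated at the bottom of this commit) resolves each of these entries against base_dir. A minimal sketch of that pattern, assuming the config file is read with PyYAML:

import yaml
from pathlib import Path

with open("config/data_config.yml") as f:
    config = yaml.safe_load(f)

BASE_DIR = Path(config["base_dir"]).resolve()
# Every entry in config["paths"] resolves relative to BASE_DIR, e.g.:
PCA_MODEL = BASE_DIR / config["paths"]["pca_path"]  # -> ./models/fusion/pca.pkl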
config/model_config.yml CHANGED
@@ -1,11 +1,11 @@
 mlp:
-  hidden_layers: [1024, 512, 256, 128, 64, 32] # 6 hidden layers
-  dropout: [0.4, 0.3, 0.5, 0.5, 0.5] # Dropout rates for each layer
+  hidden_layers: [512, 256, 128] # 3 hidden layers
+  dropout: [0.5, 0.4, 0.3] # Dropout rates for each layer
   learning_rate: 0.0001 # Adam optimizer
   batch_size: 128 # Number of samples processed together
   epochs: 200 # Maximum training iterations
-  patience: 5 # Early stopping patience
+  patience: 15 # Early stopping patience

-  weight_decay: 0.1 # L2 regularization
+  weight_decay: 0.01 # L2 regularization
   gradient_clipping: 0.5 # Prevent exploding gradients
-  mixup_alpha: 0.2 # For data augmentation during trainign, 0 disables MixUp
+  mixup_alpha: 0.1 # For data augmentation during training, 0 disables MixUp
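The new mixup_alpha: 0.1 feeds MixUp's Beta(α, α) interpolation draw. A minimal sketch of how MixUp is conventionally applied to a training batch (illustrative only; the repo's trainer is not shown in this diff):

import numpy as np

def mixup_batch(x, y, alpha=0.1):
    # lam ~ Beta(alpha, alpha); alpha = 0 disables MixUp, matching the comment above
    lam = np.random.beta(alpha, alpha)
    perm = np.random.permutation(len(x))
    return lam * x + (1 - lam) * x[perm], lam * y + (1 - lam) * y[perm]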
scripts/explain_test.py CHANGED
@@ -34,7 +34,7 @@ def explain():
         audio=y,
         lyrics=lyrics_text,
         predict_fn=predictor,
-        num_samples=1000,
+        num_samples=5,
         labels=(1,),
     )

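Dropping num_samples from 1000 to 5 makes the LIME run dramatically cheaper, since every perturbed sample costs a full audio-plus-lyrics forward pass, but five perturbations give a very noisy local surrogate; this reads as a smoke-test setting rather than a value for producing real explanations.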
scripts/predict.py CHANGED
@@ -3,16 +3,13 @@ from src.spectttra.spectttra_trainer import spectttra_predict
 from src.llm2vectrain.model import load_llm2vec_model
 from src.llm2vectrain.llm2vec_trainer import l2vec_single_train, load_pca_model
 from src.models.mlp import build_mlp, load_config
-from pathlib import Path
-from src.utils.config_loader import DATASET_NPZ
 from src.utils.dataset import instance_scaler

-from pathlib import Path
 import numpy as np
-import torch
+import pandas as pd


-def predict_pipeline(audio, lyrics: str):
+def predict_pipeline(audio_file, lyrics):
     """
     Predict script which includes preprocessing, feature extraction, and
     training the MLP model for a single data sample.
@@ -34,49 +31,63 @@ def predict_pipeline(audio, lyrics: str):
         A numerical representation of the prediction
     """

-    # Instantiate X and Y vectors
-    X, Y = None, None
-
-    # Instantiate LLM2Vec Model
+    # 1.) Instantiate LLM2Vec Model
     llm2vec_model = load_llm2vec_model()

-    # Preprocess both audio and lyrics
-    audio, lyrics = single_preprocessing(audio, lyrics)
+    # 2.) Preprocess both audio and lyrics
+    audio, lyrics = single_preprocessing(audio_file, lyrics)

-    # Call the train method for both models
+    # 3.) Call the train method for both models
     audio_features = spectttra_predict(audio)
     lyrics_features = l2vec_single_train(llm2vec_model, lyrics)

-    # Reduce the lyrics using saved PCA model
+    # 4.) Scale the vectors using Z-Score
+    audio_features, lyrics_features = instance_scaler(audio_features, lyrics_features)
+
+    # 5.) Reduce the lyrics using saved PCA model
     reduced_lyrics = load_pca_model(lyrics_features)

-    # Scale the vectors using Z-Score
+    # Scale the vectors using Z-Score again
     audio_features, reduced_lyrics = instance_scaler(audio_features, reduced_lyrics)

-    # Concatenate the vectors of audio_features + lyrics_features
+    # 6.) Concatenate the vectors of audio_features + lyrics_features
     results = np.concatenate([audio_features, reduced_lyrics], axis=1)

     # ---- Load MLP Classifier ----
     config = load_config("config/model_config.yml")
     classifier = build_mlp(input_dim=results.shape[1], config=config)

-    # Load trained weights (make sure this path matches where you saved your model)
-    model_path = "models/mlp/mlp_multimodal.pth"
+    # 7.) Load trained weights (make sure this path matches where you saved your model)
+    model_path = "models/mlp/mlp_best.pth"
     classifier.load_model(model_path)
     classifier.model.eval()

-    # Run prediction
-    probability, prediction, label = classifier.predict_single(results)
+    # 8.) Run prediction
+    probability, prediction, label = classifier.predict_single(results.flatten())

-    return {
-        "probability": probability,
-        "label": label,
-        "prediction": "AI-Generated" if prediction == 0 else "Human-Composed",
-    }
+    return {"probability": probability, "prediction": prediction, "label": label}


 if __name__ == "__main__":
     # Example usage (replace with real inputs, place song inside data/raw.)
-    audio = "sample"
-    lyrics = "Some lyrics text here"
-    print(predict_pipeline(audio, lyrics))
+    data = pd.read_csv("data/raw/predict_data_final.csv")
+
+    result = []
+    label = []
+    for row in data.itertuples():
+        prediction = predict_pipeline(row.song, row.lyrics)
+        result.append(
+            {
+                "song": row.song,
+                "label": row.label,
+                "predicted_label": prediction["label"],
+                "probability": prediction["probability"],
+            }
+        )
+
+    for r in result:
+        print(f"Song: {r['song']}")
+        print(f"Actual Label: {r['label']}")
+        print(f"Predicted: {r['predicted_label']}")
+        print(f"Confidence: {r['probability']: .8f}%")
+        print("-" * 50)
scripts/train.py CHANGED
@@ -4,14 +4,14 @@ from src.llm2vectrain.model import load_llm2vec_model
 from src.llm2vectrain.llm2vec_trainer import l2vec_train
 from src.models.mlp import build_mlp, load_config

-from src.utils.config_loader import DATASET_NPZ, PCA_MODEL
-from src.utils.dataset import dataset_scaler, dataset_splitter
-from sklearn.decomposition import PCA
+from src.utils.config_loader import DATASET_NPZ

 from pathlib import Path
+from src.utils.config_loader import DATASET_NPZ, RAW_DATASET_NPZ
+from src.utils.dataset import scale_pca
+
 import numpy as np
 import logging
-import joblib

 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
@@ -21,6 +21,10 @@ def train_mlp_model(data : dict):
     """
     Train the MLP model with extracted features.

+    Parameters
+    ----------
+    data : dict{np.array}
+        A dictionary of np.arrays, containing the train/test/val split.
     Parameters
     ----------
     data : dict{np.array}
@@ -31,6 +35,11 @@ def train_mlp_model(data : dict):
     # Load MLP configuration
     config = load_config("config/model_config.yml")

+    # Destructure the dictionary to get data split
+    X_train, y_train = data["train"]
+    X_val, y_val = data["val"]
+    X_test, y_test = data["test"]
+
     # Destructure the dictionary to get data split
     X_train, y_train = data["train"]
     X_val, y_val = data["val"]
@@ -47,6 +56,7 @@

     # Load best model and evaluate on test set
     try:
+        mlp_classifier.load_model("models/mlp/mlp_best.pth")
         mlp_classifier.load_model("models/mlp/mlp_best.pth")
         logger.info("Loaded best model for final evaluation")
     except FileNotFoundError:
@@ -55,8 +65,10 @@
     # Final evaluation
     test_results = mlp_classifier.evaluate(X_test, y_test)

+
     # Save final model
     mlp_classifier.save_model("models/mlp/mlp_multimodal.pth")
+    mlp_classifier.save_model("models/mlp/mlp_multimodal.pth")

     logger.info("MLP training completed successfully!")
     logger.info(f"Final test accuracy: {test_results['test_accuracy']:.2f}%")
@@ -64,6 +76,7 @@
     return mlp_classifier


+
 def train_pipeline():
     """
     Training script which includes preprocessing, feature extraction, and training the MLP model.
@@ -79,82 +92,114 @@ def train_pipeline():
     None
     """

-    # Instantiate X and Y vectors
-    X, Y = None, None
+    # Set constant sizes
+    BATCH_SIZE = 200
+    AUDIO_SIZE = 384
+    LYRIC_SIZE = 2048

-    dataset_path = Path(DATASET_NPZ)
+    dataset_path = Path(RAW_DATASET_NPZ)

     if dataset_path.exists():
         logger.info("Training dataset already exists. Loading file...")

-        loaded_data = np.load(DATASET_NPZ)
-        X = loaded_data["X"]
-        Y = loaded_data["Y"]
+        loaded_data = np.load(RAW_DATASET_NPZ)
+        data = {
+            "train": (loaded_data["X_train"], loaded_data["y_train"]),
+            "test": (loaded_data["X_test"], loaded_data["y_test"]),
+            "val": (loaded_data["X_val"], loaded_data["y_val"]),
+        }
     else:
+        logger.info("Training dataset does not exist. Processing data...")
         logger.info("Training dataset does not exist. Processing data...")
         # Get batches from dataset and return full Y labels
-        batches, Y = dataset_read(batch_size=500)
+        splits, split_lengths = dataset_read(batch_size=BATCH_SIZE)
         batch_count = 1

-        # Instantiate LLM2Vec and PCA model
-        llm2vec_model = load_llm2vec_model()
-
-        # Preallocate spaces for both audio and lyric vectors to reduce memory overhead
-        audio_vectors = np.zeros((len(Y), 384), dtype=np.float32)
-        lyric_vectors = np.zeros((len(Y), 4096), dtype=np.float32)
-
-        start_idx = 0
-        for batch in batches:
-
-            logger.info(f"Bulk Preprocessing - Batch {batch_count}.")
-            audio, lyrics = bulk_preprocessing(batch, batch_count)
-            batch_count += 1
-
-            # Call the train methods for both SpecTTTra and LLM2Vec
-            logger.info("Starting SpecTTTra feature extraction...")
-            audio_features = spectttra_train(audio)
-
-            logger.info("Starting LLM2Vec feature extraction...")
-            lyrics_features = l2vec_train(llm2vec_model, lyrics)
-
-            batch_size = audio_features.shape[0]
-
-            # Store the results on preallocated spaces
-            audio_vectors[start_idx:start_idx + batch_size, :] = audio_features
-            lyric_vectors[start_idx:start_idx + batch_size, :] = lyrics_features
-
-            # Delete stored instance for next batch to remove overhead
-            del audio, lyrics, audio_features, lyrics_features
-
-        # Run standard scaling on audio and lyrics separately
-        logger.info("Running standard scaling for audio and lyrics...")
-        audio_vectors, lyric_vectors = dataset_scaler(audio_vectors, lyric_vectors)
-
-        # Start training the PCA to the collected lyrics features
-        logger.info("PCA Training on lyric vectors...")
-        pca = PCA(n_components=256, svd_solver="randomized", random_state=42)
-        lyric_vectors = pca.fit_transform(lyric_vectors)
-
-        # Save the trained PCA model
-        joblib.dump(pca, "models/fusion/pca.pkl")
-
-        # Concatenate audio features and reduced lyrics features
-        X = np.concatenate([audio_vectors, lyric_vectors], axis=1)
-        logger.info(f"Audio and Lyrics Concatenated. Final features shape: {X.shape}")
-
-        # Convert label list into np.array
-        Y = np.array(Y)
-
-        # Save both X and Y to an .npz file for easier loading
-        logger.info("Saving dataset for future testing...")
-        np.savez(DATASET_NPZ, X=X, Y=Y)
+        # Instantiate LLM2Vec Model
+        l2v = load_llm2vec_model()
+
+        # Preallocate arrays
+        X_train = np.zeros((split_lengths[0], AUDIO_SIZE + LYRIC_SIZE), dtype=np.float32)
+        X_test = np.zeros((split_lengths[1], AUDIO_SIZE + LYRIC_SIZE), dtype=np.float32)
+        X_val = np.zeros((split_lengths[2], AUDIO_SIZE + LYRIC_SIZE), dtype=np.float32)
+
+        y_train = np.zeros(split_lengths[0], dtype=np.int32)
+        y_test = np.zeros(split_lengths[1], dtype=np.int32)
+        y_val = np.zeros(split_lengths[2], dtype=np.int32)
+
+        X_splits = [X_train, X_test, X_val]
+        y_splits = [y_train, y_test, y_val]
+
+        # Loop through the three splits
+        for split_idx, split in enumerate(splits):
+            start_idx = 0
+
+            # Loop through batches for each split
+            for batch in split:
+                if len(batch) == 0:
+                    continue  # skip empty batch safely
+
+                logger.info(f"Bulk Preprocessing batch {batch_count}...")
+                audio, lyrics = bulk_preprocessing(batch, batch_count)
+                batch_labels = batch['target'].values
+
+                # Extract audio features
+                logger.info("Starting SpecTTTra feature extraction...")
+                audio_features = spectttra_train(audio)
+
+                # Call the train method for LLM2Vec
+                logger.info(f"\nStarting LLM2Vec feature extraction...")
+                lyric_features = l2vec_train(l2v, lyrics)
+
+                # Concatenate the two features
+                batch_feature = np.concatenate([audio_features, lyric_features], axis=1)
+
+                # Allocate them to the preallocated blocks
+                bsz = batch_feature.shape[0]
+                X_splits[split_idx][start_idx:start_idx + bsz, :] = batch_feature
+                y_splits[split_idx][start_idx:start_idx + bsz] = batch_labels
+
+                logger.info(f"Batch {batch_count}: {bsz} samples, start_idx={start_idx}")
+
+                batch_count += 1
+                start_idx += bsz
+
+        # Save raw (unscaled) dataset
+        logger.info("Saving raw dataset...")
+        np.savez(
+            RAW_DATASET_NPZ,
+            X_train=X_train, y_train=y_train,
+            X_val=X_val, y_val=y_val,
+            X_test=X_test, y_test=y_test,
+        )
+
+        # Run scaling
+        logger.info("Running standard scaling...")
+        data = {
+            "train": (X_train, y_train),
+            "val": (X_val, y_val),
+            "test": (X_test, y_test),
+        }
+
+        # Scale and use PCA fitting for all raw data
+        logger.info("Scaling and applying PCA...")
+        data = scale_pca(data)
+
+        # Save scaled dataset
+        X_train, y_train = data["train"]
+        X_val, y_val = data["val"]
+        X_test, y_test = data["test"]

-    # Do data splitting
-    data = dataset_splitter(X, Y)
+        logger.info("Saving scaled dataset...")
+        np.savez(
+            DATASET_NPZ,
+            X_train=X_train, y_train=y_train,
+            X_val=X_val, y_val=y_val,
+            X_test=X_test, y_test=y_test,
+        )

     logger.info("Starting MLP training...")
     train_mlp_model(data)

-
 if __name__ == "__main__":
     train_pipeline()
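Note the two-stage caching the rewrite introduces: raw concatenated features are saved to RAW_DATASET_NPZ before any scaling, so scalers and PCA can be re-fit later without re-running SpecTTTra/LLM2Vec extraction. A small sketch of reusing that cache (keys taken from the np.savez calls above):

import numpy as np

raw = np.load("data/processed/raw_training_data_40k.npz")
data = {
    "train": (raw["X_train"], raw["y_train"]),
    "val": (raw["X_val"], raw["y_val"]),
    "test": (raw["X_test"], raw["y_test"]),
}
# data can now go straight into scale_pca(data) to try new scaling/PCA settings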
src/llm2vectrain/llm2vec_trainer.py CHANGED
@@ -1,115 +1,11 @@
-from sklearn.decomposition import IncrementalPCA
-from sklearn.preprocessing import StandardScaler
 from pathlib import Path
+from src.utils.config_loader import PCA_MODEL

-import numpy as np
-import pickle
-import torch
-import os
 import joblib
-
-# Initialize PCA and StandardScaler globally for training
-_pca_trainer = None
-
-class SimplePCATrainer:
-    """
-    A simple PCA trainer that uses IncrementalPCA to fit data in batches.
-    It saves checkpoints every 5 batches and can save the final model.
-
-    Args:
-        None
-
-    Returns:
-        None
-
-    Attributes:
-        pca: The IncrementalPCA model.
-        scaler: StandardScaler for normalizing data.
-        fitted: Boolean indicating if the model has been initialized.
-        batch_count_pca: Counter for the number of batches processed.
-
-    Methods:
-        process_batch(vectors): Processes a batch of vectors, fits the PCA model incrementally.
-        save_final(model_path): Saves the final PCA model to the specified path.
-    """
-
-    # Initialize the trainer
-    def __init__(self):
-        self.pca = None
-        self.scaler = StandardScaler()
-        self.fitted = False
-        self.batch_count_pca = 0
-
-    def _determine_optimal_components(self, vectors):
-        """
-        Determine the optimal number of PCA components to retain 95% variance.
-
-        Args:
-            vectors: The input data to analyze.
-        Returns:
-            n_components: The optimal number of components.
-        """
-        temp_pca = IncrementalPCA()
-        temp_pca.fit(vectors)
-        cumsum_var = np.cumsum(temp_pca.explained_variance_ratio_)
-        n_comp_95 = np.argmax(cumsum_var >= 0.95) + 1
-        return min(n_comp_95, vectors.shape[1] // 2)
-
-    def process_batch(self, vectors):
-        """
-        Process a batch of vectors, fitting the PCA model incrementally.
-
-        Args:
-            vectors: The input data batch to process.
-        Returns:
-            reduced_vectors: The PCA-transformed data.
-
-        Note: This method saves a checkpoint every 5 batches.
-        """
-        if not self.fitted:
-            # First batch - initialize everything
-            n_components = self._determine_optimal_components(vectors)
-            self.pca = IncrementalPCA(n_components=n_components, batch_size=1000)
-            self.scaler.fit(vectors)
-            self.fitted = True
-            print(f"Initialized PCA with {n_components} components")
-
-        # Process batch
-        vectors_scaled = self.scaler.transform(vectors)
-        self.pca.partial_fit(vectors_scaled)
-        reduced_vectors = self.pca.transform(vectors_scaled)
-
-        self.batch_count_pca += 1
-
-        # Save checkpoint every 5 batches
-        if self.batch_count_pca % 5 == 0:
-            os.makedirs("pca_checkpoints", exist_ok=True)
-            with open(f"pca_checkpoints/checkpoint_batch_{self.batch_count_pca}.pkl", 'wb') as f:
-                pickle.dump({'pca': self.pca, 'scaler': self.scaler}, f)
-            print(f"Saved checkpoint at batch {self.batch_count_pca}")
-
-        print(f"Processed batch {self.batch_count_pca}, shape: {vectors.shape} -> {reduced_vectors.shape}")
-        return reduced_vectors
-
-    def save_final(self, model_path):
-        """
-        Save the final PCA model to the specified path.
-
-        Args:
-            model_path: The file path to save the PCA model.
-
-        Returns:
-            None
-
-        Note: Change the model path as needed in the data_config.yml file.
-        """
-        os.makedirs(os.path.dirname(model_path), exist_ok=True)
-        with open(model_path, 'wb') as f:
-            pickle.dump({'pca': self.pca, 'scaler': self.scaler}, f)
-        print(f"Final model saved to {model_path}. Total variance explained: {np.sum(self.pca.explained_variance_ratio_):.4f}")
+import torch

 ## For Single Input
-def load_pca_model(vectors, model_path="models/fusion/pca.pkl"):
+def load_pca_model(vectors, model_path=PCA_MODEL):
     """
     Load a pre-trained PCA model and transform the input vectors.

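The module now only loads the PCA artifact trained by scripts/train.py. The rest of load_pca_model falls outside the hunk, but given its docstring and the joblib import it presumably amounts to something like this hypothetical sketch:

import joblib

def load_pca_model(vectors, model_path=PCA_MODEL):
    # Load the fitted sklearn PCA and project the lyric vectors (assumed behavior)
    pca = joblib.load(model_path)
    return pca.transform(vectors)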
src/llm2vectrain/model.py CHANGED
@@ -16,6 +16,7 @@ def load_llm2vec_model():
     tokenizer = AutoTokenizer.from_pretrained(
         model_id, padding=True, truncation=True, max_length=512, cache_dir=cache_dir
    )
+
     config = AutoConfig.from_pretrained(
         model_id, trust_remote_code=True, cache_dir=cache_dir
     )
src/models/mlp.py CHANGED
@@ -52,6 +52,7 @@ import torch.nn as nn
 import torch.optim as optim
 import numpy as np
 import yaml
+import torch.nn.functional as F

 logger = logging.getLogger(__name__)

@@ -441,7 +442,7 @@ class MLPClassifier:

         return probabilities, predictions

-    def predict_single(self, features: np.ndarray) -> Tuple[float, int, str]:
+    def predict_single(self, features: np.ndarray, temperature: float = 2.5) -> Tuple[float, int, str]:
         """
         Predict whether a single song is AI-generated or human-composed.

@@ -482,14 +483,19 @@
                 f"Expected features for 1 song, got {features.shape[0]} songs. Use predict_batch() instead."
             )

-        # Use the existing predict method
-        probabilities, predictions = self.predict(features)
+        self.model.eval()
+        with torch.no_grad():
+            features_tensor = torch.FloatTensor(features).to(self.device)
+            outputs = self.model(features_tensor)
+            logit = torch.logit(outputs.clamp(1e-6, 1 - 1e-6))
+            probabilities = torch.sigmoid(logit / temperature).item()
+            probabilities = np.clip(probabilities, 0.01, 0.99)

         # Extract single results
-        probability = float(probabilities[0])
-        prediction = int(predictions[0])
+        prediction = int(probabilities >= 0.5)
         label = "Human-Composed" if prediction == 1 else "AI-Generated"
-
+        probability = probabilities*100 if prediction == 1 else (1 - probabilities)*100
+
         return probability, prediction, label

     def predict_batch(self, features: np.ndarray, return_details: bool = False) -> Dict:
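The rewritten predict_single applies temperature scaling: the sigmoid output is mapped back to a logit, divided by T = 2.5, and squashed again, which softens over-confident scores. A worked example of the effect (illustrative, not repo code):

import torch

p_raw = torch.tensor(0.999)          # over-confident raw output
logit = torch.logit(p_raw)           # ln(0.999 / 0.001) ≈ 6.91
p_cal = torch.sigmoid(logit / 2.5)   # sigmoid(2.76) ≈ 0.94
print(p_raw.item(), "->", round(p_cal.item(), 3))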
src/musiclime/factorization.py CHANGED
@@ -61,7 +61,20 @@ class OpenUnmixFactorization:

     def _separate_sources(self):
         waveform = np.expand_dims(self.audio, axis=1)
-        prediction = predict.separate(torch.as_tensor(waveform).float(), rate=44100)
+
+        # Load openunmix .pth files from local dir
+        model_path = "models/musiclime"
+
+        # Specify targets
+        targets = ["vocals", "bass", "drums", "other"]
+
+        # Then load openunmix files to openunmix' method
+        prediction = predict.separate(
+            torch.as_tensor(waveform).float(),
+            rate=44100,
+            model_str_or_path=model_path,
+            targets=targets,
+        )

         components = [prediction[key][0].mean(dim=0).numpy() for key in prediction]
         names = list(prediction.keys())
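Pointing model_str_or_path at a local directory keeps Open-Unmix from fetching weights at runtime; as far as the library's loader goes, a directory path is expected to hold one checkpoint per requested target, so the assumed layout is:

# models/musiclime/
#     vocals.pth
#     bass.pth
#     drums.pth
#     other.pth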
src/musiclime/wrapper.py CHANGED
@@ -38,8 +38,6 @@ class MusicLIMEPredictor:
         processed_lyrics = []

         for i, (text, audio) in enumerate(zip(texts, audios)):
-            # if i % 100 == 0:
-            #     print(f"  Preprocessing {i+1}/{len(texts)}")
             processed_audio, processed_lyric = single_preprocessing(audio, text)
             processed_audios.append(processed_audio)
             processed_lyrics.append(processed_lyric)
@@ -74,44 +72,49 @@
             )
         )

-        # Step 3: Apply PCA to lyrics batch first
+        # Step 3: Scale and reduce in batch
         start_time = time.time()
-        print("[MusicLIME] Applying PCA to lyrics (batch)")
-        pca_model = joblib.load("models/fusion/pca.pkl")
-        reduced_lyrics_batch = pca_model.transform(
-            lyrics_features_batch
-        )  # (batch, 256)
-        pca_time = time.time() - start_time
-        print(green_bold(f"[MusicLIME] PCA completed in {pca_time:.2f}s"))
+        print("[MusicLIME] Scaling and reducing features (batch)...")

-        # Step 4: Scale the reduced features
-        start_time = time.time()
-        print("[MusicLIME] Scaling features (batch)...")
+        # Load the trained scalers
         audio_scaler = joblib.load("models/fusion/audio_scaler.pkl")
-        lyric_scaler = joblib.load("models/fusion/lyric_scaler.pkl")
+        lyric_scaler = joblib.load("models/fusion/lyrics_scaler.pkl")

+        # Then apply scaling to the batch
         scaled_audio_batch = audio_scaler.transform(
             audio_features_batch
         )  # (batch, 384)
         scaled_lyrics_batch = lyric_scaler.transform(
+            lyrics_features_batch
+        )  # (batch, 2048)
+
+        # Step 4: Apply PCA to lyrics batch
+        print("[MusicLIME] Applying PCA to lyrics (batch)")
+        pca_model = joblib.load("models/fusion/pca.pkl")
+        reduced_lyrics_batch = pca_model.transform(scaled_lyrics_batch)  # (batch, 512)
+
+        # Step 5: Apply scaler to PCA-scaled lyrics batch
+        print("[MusicLIME] Reapplying scaler to PCA-scaled batch")
+        pca_scaler = joblib.load("models/fusion/pca_scaler.pkl")
+        reduced_lyrics_batch = pca_scaler.transform(
             reduced_lyrics_batch
-        )  # (batch, 256)
+        )  # (batch, 512)

-        # Step 5: Concatenate features
+        # Step 6: Concatenate features
         combined_features_batch = np.concatenate(
-            [scaled_audio_batch, scaled_lyrics_batch], axis=1
-        )
+            [scaled_audio_batch, reduced_lyrics_batch], axis=1
+        )  # (batch, sum of lyrics & audio vector dims)
         scaling_time = time.time() - start_time
         print(green_bold(f"[MusicLIME] Scaling completed in {scaling_time:.2f}s"))

-        # Step 6: Batch MLP prediction
+        # Step 7: Batch MLP prediction
         start_time = time.time()
         print("[MusicLIME] Running MLP predictions (batch)...")
         if self.classifier is None:
             self.classifier = build_mlp(
                 input_dim=combined_features_batch.shape[1], config=self.config
             )
-            self.classifier.load_model("models/mlp/mlp_multimodal.pth")
+            self.classifier.load_model("models/mlp/mlp_best.pth")

         probabilities, predictions = self.classifier.predict(combined_features_batch)
@@ -122,17 +125,12 @@

         # Total time summary
         total_time = (
-            preprocessing_time
-            + audio_time
-            + lyrics_time
-            + pca_time
-            + scaling_time
-            + mlp_time
+            preprocessing_time + audio_time + lyrics_time + scaling_time + mlp_time
         )
         print(f"[MusicLIME] Batch processing complete!")
         print(
             green_bold(
-                f"[MusicLIME] Total time: {total_time:.2f}s (Preprocessing: {preprocessing_time:.2f}s, Audio: {audio_time:.2f}s, Lyrics: {lyrics_time:.2f}s, PCA: {pca_time:.2f}s, Scaling: {scaling_time:.2f}s, MLP: {mlp_time:.2f}s)"
+                f"[MusicLIME] Total time: {total_time:.2f}s (Preprocessing: {preprocessing_time:.2f}s, Audio: {audio_time:.2f}s, Lyrics: {lyrics_time:.2f}s, Scaling: {scaling_time:.2f}s, MLP: {mlp_time:.2f}s)"
             )
         )

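For orientation, the reordered steps give this shape flow (dimensions from the in-line comments above; the final width follows from them):

# audio:  (batch, 384)  --audio_scaler-->   (batch, 384)
# lyrics: (batch, 2048) --lyrics_scaler-->  (batch, 2048)
#                       --pca-->            (batch, 512)
#                       --pca_scaler-->     (batch, 512)
# concat: (batch, 384 + 512) = (batch, 896) fed to the MLP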
src/preprocessing/audio_preprocessor.py CHANGED
@@ -39,7 +39,7 @@ class AudioPreprocessor:

     """

-    def __init__(self, script="train", waveform_norm="std"):
+    def __init__(self, script="train", waveform_norm="peak"):
         self.SCRIPT = script
         self.INPUT_SAMPLING = 48000
         self.TARGET_SAMPLING = 16000
@@ -71,7 +71,27 @@ class AudioPreprocessor:
                 audiofile = f"{audiofile}.mp3"
                 file = self.INPUT_PATH / audiofile

-                y, sr = librosa.load(str(file), sr=None, mono=False)
+                # FIXED: Force librosa to load properly
+                # Load at native sample rate first, then we will resample later
+                y, sr = librosa.load(str(file), sr=None, mono=False, dtype=np.float32)
+
+                # If loading fails (all zeros), try with explicit sample rate
+                if np.abs(y).max() < 0.0001:
+                    print(f"Warning: First load failed, trying with sr=48000")
+                    y, sr = librosa.load(
+                        str(file), sr=48000, mono=False, dtype=np.float32
+                    )
+
+                # Last resort: use soundfile instead
+                if np.abs(y).max() < 0.0001:
+                    print(f"Warning: Librosa failed, trying soundfile")
+                    import soundfile as sf
+
+                    y, sr = sf.read(str(file), dtype="float32")
+                    if y.ndim == 2:
+                        y = y.T  # soundfile returns (samples, channels)
+                    else:
+                        y = y[None, :]  # make it (1, samples)

             elif isinstance(audiofile, (bytes, io.BytesIO)):
                 file = (
@@ -90,13 +110,20 @@ class AudioPreprocessor:
             else:
                 raise ValueError(f"Unsupported audiofile type: {type(audiofile)}")

-            # Ensure consistent shape (channels, num_samples)
-            if y.ndim == 1:  # mono
-                y = y[None, :]  # (1, num_samples)
+            # Verify we actually loaded audio
+            if np.abs(y).max() < 0.0001:
+                raise RuntimeError(
+                    f"Audio file appears to be silent or corrupted: {audiofile}"
+                )
+
+            # Ensure consistent shape
+            if y.ndim == 1:
+                y = y[None, :]
             else:
-                y = y.T  # librosa returns (num_samples, channels)
+                y = y.T if y.shape[0] > y.shape[1] else y

             waveform = torch.from_numpy(y).float()
+
             return waveform, sr

         except Exception as e:
@@ -182,7 +209,11 @@ class AudioPreprocessor:
         waveform : tensor
             Normalized audio waveform.
         """
-        if method == "std":
+        if method == "peak":
+            # Normalize to [-1, 1] based on max absolute value to preserve relative dynamics
+            peak = waveform.abs().max()
+            return waveform / max(peak, 1e-6)
+        elif method == "std":
             std = waveform.std()
             return waveform / max(std, 1e-6)
         elif method == "minmax":
@@ -202,7 +233,7 @@ class AudioPreprocessor:
             Base filename to use.
         """
         self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
-        print(f"Saving {filename} to {self.OUTPUT_PATH}.")
+        # print(f"Saving {filename} to {self.OUTPUT_PATH}.")

         output_path = self.OUTPUT_PATH / f"{filename}"

@@ -233,7 +264,7 @@ class AudioPreprocessor:

         # Convert the audio into mono
         if waveform.shape[0] > 1:
-            print("Current audio is stereo. Converting to mono.")
+            # print("Current audio is stereo. Converting to mono.")
             waveform = waveform.mean(dim=0, keepdim=True)

         # If there is a skip value provided, trim it
@@ -245,7 +276,7 @@ class AudioPreprocessor:
         # Trim if more than 120 seconds, pad if less than
         waveform = self.pad_trim(waveform=waveform, random_crop=train)

-        # Normalize waveform (aligned with SONICS)
+        # Normalize waveform (used PEAK)
         waveform = self.normalize_waveform(waveform, method=self.WAVEFORM_NORM)

         # Add some gaussian noise to the waveform during training
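The default normalization switches from z-scoring to peak normalization, which bounds samples to [-1, 1] rather than forcing unit variance. A toy comparison (illustrative, not repo code):

import torch

w = torch.tensor([0.5, -2.0, 1.0])
peak = w / max(w.abs().max(), 1e-6)  # tensor([ 0.2500, -1.0000,  0.5000]), bounded
std = w / max(w.std(), 1e-6)         # ≈ tensor([ 0.3111, -1.2443,  0.6222]), can exceed |1|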
src/preprocessing/preprocessor.py CHANGED
@@ -1,5 +1,6 @@
 import pandas as pd
 import numpy as np
+import math

 from src.preprocessing.audio_preprocessor import AudioPreprocessor
 from src.preprocessing.lyrics_preprocessor import LyricsPreprocessor
@@ -51,6 +52,43 @@ def bulk_preprocessing(batch: pd.DataFrame, batch_count: int):
     return audio_list, lyric_list


+def bulk_preprocessing_lyrics(batch: pd.DataFrame, batch_count: int):
+    """
+    Applies lyrics preprocessing to a training batch
+
+    Parameters
+    ----------
+    batch : pd.dataframe
+        Dataframe containing the batch data.
+
+    batch_count : int
+        Batch count value.
+
+    Returns
+    -------
+    lyric_list : list
+        List of loaded lyrics in string form.
+    """
+
+    lyric_preprocessor = LyricsPreprocessor()
+
+    lyric_list = []
+    count, batch_length = 1, len(batch)
+
+    print(f"Preprocessing training data with length {batch_length}\n")
+
+    for row in batch.itertuples():
+        print(f"Batch {batch_count} - {count}/{batch_length}")
+
+        # Preprocess lyric and append to lyric list
+        processed_lyric = lyric_preprocessor(lyrics=row.lyrics)
+        lyric_list.append(processed_lyric)
+
+        count += 1
+
+    return lyric_list
+
+
 def single_preprocessing(audio, lyric: str):
     """
     Preprocesses a single record of audio and lyric data
@@ -82,26 +120,46 @@ def single_preprocessing(audio, lyric: str):
     return processed_song, processed_lyric


-def dataset_read(batch_size = 20):
+def dataset_read(batch_size=20):
     """
-    Reads the csv file and returns batches of data
+    Reads the main dataset, splits it into the train/test/valid split, and computes
+    optimal number of samples per batch.

     Parameters
     ----------
-    None
+    batch_size : int
+        Number of data per batch

     Returns
     -------
-    data_splits : list
-        List of dataframes acting as batches
-
-    label : list
-        List of real/fake labels (in the formm of 0 and 1)
+    split: list[splits]
+        A collection of the three splits
+
+    split_lengths : list[int]
+        List of the split lengths
     """
     dataset = pd.read_csv(DATASET_CSV)
-    label = dataset['target'].tolist()

-    # Split into x batches (50,000 / x)
-    data_splits = np.array_split(dataset, batch_size)
+    train = dataset[dataset["split"] == "train"]
+    test = dataset[dataset["split"] == "test"]
+    val = dataset[dataset["split"] == "valid"]
+
+    # Find the minimum split size (ignoring empty splits)
+    min_split_size = min([len(train), len(test), len(val)])
+    # Clamp batch_size so it never exceeds the smallest split
+    effective_batch_size = min(batch_size, min_split_size if min_split_size > 0 else batch_size)
+
+    def make_splits(df, batch_size):
+        if len(df) == 0:
+            return []
+        n_splits = math.ceil(len(df) / batch_size)
+        return np.array_split(df, n_splits)
+
+    train_splits = make_splits(train, effective_batch_size)
+    test_splits = make_splits(test, effective_batch_size)
+    val_splits = make_splits(val, effective_batch_size)
+
+    splits = [train_splits, test_splits, val_splits]
+    split_lengths = [len(train), len(test), len(val)]

-    return data_splits, label
+    return splits, split_lengths
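A quick check of the new batching arithmetic (illustrative numbers). Note that the old np.array_split(dataset, batch_size) produced batch_size chunks rather than chunks of batch_size rows; the rewrite computes the chunk count explicitly:

import math
import numpy as np
import pandas as pd

split = pd.DataFrame({"x": range(1010)})    # pretend one split holds 1,010 rows
n_batches = math.ceil(len(split) / 200)     # batch_size = 200 -> 6 batches
batches = np.array_split(split, n_batches)  # sizes [169, 169, 168, 168, 168, 168]
assert sum(len(b) for b in batches) == 1010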
src/spectttra/spectttra.py CHANGED
@@ -1,6 +1,9 @@
+import torch
 import torch.nn as nn
+from pathlib import Path
 from .transformer import Transformer
 from .tokenizer import STTokenizer
+from src.spectttra.feature import FeatureExtractor


 class SpecTTTra(nn.Module):
@@ -112,4 +115,99 @@ class SpecTTTra(nn.Module):
         # Transformer
         output = self.transformer(spectro_temporal_tokens)  # shape: (B, T/t + F/f, dim)

         return output
+
+
+def build_spectttra_from_cfg(cfg, device):
+    """
+    Constructs the SpecTTTra model and its associated FeatureExtractor from a given configuration.
+
+    Args:
+        cfg (SimpleNamespace): Configuration object containing model and feature extraction parameters. Expected attributes include:
+            - cfg.melspec.n_mels: Number of mel frequency bins.
+            - cfg.model: Model-specific parameters (e.g., embed_dim, t_clip, f_clip, etc.).
+        device (torch.device): The device on which the model and feature extractor will be allocated (e.g., 'cpu' or 'cuda').
+
+    Returns:
+        tuple:
+            FeatureExtractor: Initialized feature extraction module moved to the specified device.
+            SpecTTTra: Constructed SpecTTTra model moved to the specified device.
+    """
+
+    feat_ext = FeatureExtractor(cfg).to(device)
+
+    # The pre-trained model expects specific, fixed input dimensions.
+    # Hardcoded to ensure the model architecture matches the checkpoint weights exactly.
+    # The expected number of frames (n_frames) is taken directly from the RuntimeError message.
+    n_mels = cfg.melspec.n_mels  # n_mels should be 128
+    n_frames = 3744  # n_frames match the checkpoint's expectation
+
+    print(f"[INFO] Initializing SpecTTTra with fixed dimensions: n_mels={n_mels}, n_frames={n_frames}")
+
+    model_cfg = cfg.model
+    model = SpecTTTra(
+        input_spec_dim=n_mels,
+        input_temp_dim=n_frames,
+        embed_dim=model_cfg.embed_dim,
+        t_clip=model_cfg.t_clip,
+        f_clip=model_cfg.f_clip,
+        num_heads=model_cfg.num_heads,
+        num_layers=model_cfg.num_layers,
+        pre_norm=model_cfg.pre_norm,
+        pe_learnable=model_cfg.pe_learnable,
+        pos_drop_rate=model_cfg.pos_drop_rate,
+        attn_drop_rate=model_cfg.attn_drop_rate,
+        proj_drop_rate=model_cfg.proj_drop_rate,
+        mlp_ratio=model_cfg.mlp_ratio,
+    ).to(device)
+
+    return feat_ext, model
+
+
+def load_frozen_spectttra(model, ckpt_path, device):
+    """
+    Loads pretrained SpecTTTra weights from a frozen checkpoint file.
+
+    Args:
+        model (torch.nn.Module): An initialized SpecTTTra model instance to load weights into.
+        ckpt_path (str or Path): Path to the pretrained model checkpoint file (e.g., 'spectttra_frozen.pth').
+        device (torch.device): The device to map the loaded weights to (e.g., 'cpu' or 'cuda').
+
+    Returns:
+        model (torch.nn.Module): The SpecTTTra model with loaded pretrained weights, set to evaluation mode.
+
+    Raises:
+        FileNotFoundError: If the specified checkpoint file does not exist at `ckpt_path`.
+    """
+    ckpt_path = Path(ckpt_path)
+    if not ckpt_path.exists():
+        raise FileNotFoundError(
+            f"Pre-trained model not found at {ckpt_path}. "
+            "Please download 'pytorch_model.bin', rename to 'spectttra_frozen.pth', "
+            "and place it in the correct directory."
+        )
+
+    print(f"[INFO] Found SpecTTTra checkpoint at {ckpt_path}. Loading weights...")
+    state = torch.load(ckpt_path, map_location=device)
+
+    new_state_dict = {}
+    for k, v in state.items():
+        if k.startswith("encoder."):
+            new_key = k[len("encoder."):]
+            new_state_dict[new_key] = v
+        else:
+            new_state_dict[k] = v
+
+    # Now that the shapes match, this should load without a size mismatch error.
+    missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
+    if missing_keys:
+        # Might see a few missing keys if your SpecTTTra class is slightly different, but the core should load.
+        print(f"[WARNING] Missing keys in model: {missing_keys}")
+    if unexpected_keys:
+        # Seeing 'classifier' or 'ft_extractor' keys here is NORMAL and SAFE.
+        print(f"[INFO] Unused keys in checkpoint: {unexpected_keys}")
+
+    print("[INFO] Successfully loaded pre-trained SpecTTTra weights.")
+
+    model.eval()
+    return model
src/spectttra/spectttra_trainer.py CHANGED
@@ -1,11 +1,10 @@
 import threading
 import torch
 import numpy as np
-from pathlib import Path
 from types import SimpleNamespace

 from src.spectttra.feature import FeatureExtractor
-from src.spectttra.spectttra import SpecTTTra
+from src.spectttra.spectttra import SpecTTTra, build_spectttra_from_cfg, load_frozen_spectttra

 # Shared variables for the model and setup, loaded only once and reused (cache)
 _PREDICTOR_LOCK = threading.Lock()
@@ -17,54 +16,10 @@ _DEVICE = None

 def build_spectttra(cfg, device):
     """
-    Initialize SpecTTTra and FeatureExtractor modules, and load a frozen checkpoint.
-
-    Args:
-        cfg (SimpleNamespace): Configuration containing audio, mel-spectrogram, and model parameters.
-        device (torch.device): Target device for model and feature extractor.
-
-    Returns:
-        tuple:
-            FeatureExtractor: Module for converting raw audio into mel-spectrogram features.
-            SpecTTTra: Spectro-temporal transformer model initialized with checkpoint weights.
+    Wrapper that builds SpecTTTra + FeatureExtractor and loads frozen checkpoint.
     """
-    feat_ext = FeatureExtractor(cfg).to(device)
-
-    # Build model once using placeholder input to infer mel and frame dimensions
-    with torch.no_grad():
-        dummy_wave = torch.zeros(1, cfg.audio.max_len, device=device)
-        dummy_mel = feat_ext(dummy_wave.float())
-        _, n_mels, n_frames = dummy_mel.shape
-
-    model_cfg = cfg.model
-    model = SpecTTTra(
-        input_spec_dim=n_mels,
-        input_temp_dim=n_frames,
-        embed_dim=model_cfg.embed_dim,
-        t_clip=model_cfg.t_clip,
-        f_clip=model_cfg.f_clip,
-        num_heads=model_cfg.num_heads,
-        num_layers=model_cfg.num_layers,
-        pre_norm=model_cfg.pre_norm,
-        pe_learnable=model_cfg.pe_learnable,
-        pos_drop_rate=model_cfg.pos_drop_rate,
-        attn_drop_rate=model_cfg.attn_drop_rate,
-        proj_drop_rate=model_cfg.proj_drop_rate,
-        mlp_ratio=model_cfg.mlp_ratio,
-    ).to(device)
-
-    # Load frozen checkpoint if it exists; otherwise, save initial state
-    ckpt_path = Path("models/spectttra/spectttra_frozen.pth")
-    if ckpt_path.exists():
-        state = torch.load(ckpt_path, map_location=device)
-        model.load_state_dict(state)
-        print(f"[INFO] Loaded frozen SpecTTTra checkpoint from {ckpt_path}")
-    else:
-        ckpt_path.parent.mkdir(parents=True, exist_ok=True)
-        torch.save(model.state_dict(), ckpt_path)
-        print(f"[INFO] Saved frozen SpecTTTra checkpoint to {ckpt_path}")
-
-    model.eval()
+    feat_ext, model = build_spectttra_from_cfg(cfg, device)
+    model = load_frozen_spectttra(model, "models/spectttra/spectttra_frozen.pth", device)
     return feat_ext, model


@@ -118,20 +73,14 @@ def _init_predictor_once():
     )

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-
     feat_ext, model = build_spectttra(cfg, device)
-
     feat_ext.to(device)

     # Move model to device (GPU if available) and allow faster inference with mixed precision
-    model.to(device)
-    model.eval()
+    model.to(device).eval()

     # Cache
-    _FEAT_EXT = feat_ext
-    _MODEL = model
-    _CFG = cfg
-    _DEVICE = device
+    _FEAT_EXT, _MODEL, _CFG, _DEVICE = feat_ext, model, cfg, device


 def spectttra_predict(audio_tensor):
@@ -147,6 +96,7 @@ def spectttra_predict(audio_tensor):
         1D embedding vector of shape (embed_dim,). The embedding is obtained
         by mean-pooling the transformer token outputs.
     """
+
     global _FEAT_EXT, _MODEL, _CFG, _DEVICE

     _init_predictor_once()
@@ -161,18 +111,25 @@

     with torch.no_grad():
         # Extract mel-spectrogram
-        melspec = feat_ext(waveform)  # (B, n_mels, n_frames)
+        melspec = feat_ext(waveform)
+
+        # Ensure melspec shape matches model's expectation ---
+        expected_frames = model.input_temp_dim  # expected_frames is 3744
+        if melspec.shape[2] > expected_frames:
+            melspec = melspec[:, :, :expected_frames]
+        elif melspec.shape[2] < expected_frames:
+            padding = expected_frames - melspec.shape[2]
+            melspec = torch.nn.functional.pad(melspec, (0, padding))

         if device.type == "cuda":
             with torch.cuda.amp.autocast(enabled=True):
-                tokens = model(melspec)  # (B, num_tokens, embed_dim)
-                pooled = tokens.mean(dim=1)  # (B, embed_dim)
+                tokens = model(melspec)
+                pooled = tokens.mean(dim=1)
         else:
             tokens = model(melspec)
             pooled = tokens.mean(dim=1)

-    # Return numpy vector
-    out = pooled.squeeze(0).cpu().numpy()  # (embed_dim,)
+    out = pooled.squeeze(0).cpu().numpy()
     return out


@@ -203,19 +160,31 @@ def spectttra_train(audio_tensors):
     model = _MODEL
     device = _DEVICE

-    batch = []
-    for waveform in audio_tensors:
-        with torch.no_grad():
-            melspec = feat_ext(waveform.float())  # (B, n_mels, n_frames)
+    # Refactors the loop to be a much faster single-batch operation
+    try:
+        waveforms_batch = torch.cat(audio_tensors, dim=0).to(device).float()
+    except Exception as e:
+        print(f"[INFO] Error during tensor concatenation, falling back to loop. Fix preprocessing for speed. Error: {e}")
+        batch_list = [spectttra_predict(w) for w in audio_tensors]
+        return np.array(batch_list)
+
+    with torch.no_grad():
+        melspec = feat_ext(waveforms_batch)
+
+        # Ensure melspec shape matches model's expectation
+        expected_frames = model.input_temp_dim  # expected_frames is 3744
+        if melspec.shape[2] > expected_frames:
+            melspec = melspec[:, :, :expected_frames]
+        elif melspec.shape[2] < expected_frames:
+            padding = expected_frames - melspec.shape[2]
+            melspec = torch.nn.functional.pad(melspec, (0, padding))

-            if device.type == "cuda":
-                with torch.cuda.amp.autocast(enabled=True):
-                    tokens = model(melspec)  # (B, num_tokens, embed_dim)
-                    pooled = tokens.mean(dim=1)  # (B, embed_dim)
-            else:
+        if device.type == "cuda":
+            with torch.cuda.amp.autocast(enabled=True):
                 tokens = model(melspec)
                 pooled = tokens.mean(dim=1)
-
-            batch.append(pooled.cpu().numpy())
+        else:
+            tokens = model(melspec)
+            pooled = tokens.mean(dim=1)

-    return np.vstack(batch)
+        return pooled.cpu().numpy()
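Both inference paths now pad or trim the mel-spectrogram to the checkpoint's fixed 3744 frames before the transformer runs. A standalone illustration of that pad/trim step:

import torch
import torch.nn.functional as F

long = torch.randn(1, 128, 3800)[:, :, :3744]     # too long  -> trimmed
short = torch.randn(1, 128, 3700)
short = F.pad(short, (0, 3744 - short.shape[2]))  # too short -> zero-padded on the right
print(long.shape, short.shape)                    # both torch.Size([1, 128, 3744])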
src/utils/config_loader.py CHANGED
@@ -9,7 +9,11 @@ BASE_DIR = Path(config["base_dir"]).resolve()

 # Resolve paths
 DATASET_NPZ = BASE_DIR / config["paths"]["dataset_npz"]
+RAW_DATASET_NPZ = BASE_DIR / config["paths"]["raw_dataset_npz"]
 DATASET_CSV = BASE_DIR / config["paths"]["dataset_csv"]
 RAW_DIR = BASE_DIR / config["paths"]["raw_dir"]
 PROCESSED_DIR = BASE_DIR / config["paths"]["processed_dir"]
 PCA_MODEL = BASE_DIR / config["paths"]["pca_path"]
+AUDIO_SCALER = BASE_DIR / config["paths"]["audio_scaler"]
+LYRICS_SCALER = BASE_DIR / config["paths"]["lyrics_scaler"]
+PCA_SCALER = BASE_DIR / config["paths"]["pca_scaler"]
src/utils/dataset.py CHANGED
@@ -1,45 +1,132 @@
1
- from sklearn.preprocessing import StandardScaler
2
  from sklearn.model_selection import train_test_split
 
 
 
3
 
4
  import joblib
5
  import numpy as np
6
  import logging
 
7
 
8
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
9
  logger = logging.getLogger(__name__)
10
 
11
 
12
- def dataset_splitter(X: np.ndarray, Y: np.ndarray):
13
  """
14
- Script that splits the X and Y values to train, test, and valid splits.
 
15
 
16
  Parameters
17
  ----------
18
  X : np.array
19
- Array of feature vectors
20
  Y : np.array
21
- Array of labels (real or fake)
 
 
 
 
 
 
22
 
23
  Returns
24
  -------
25
- data : dict{np.array}
26
- A dictionary of np.arrays, containing the train/test/val split.
 
27
  """
28
 
29
  logger.info(f"Dataset shape: {X.shape}, Labels: {len(Y)}")
30
  logger.info(f"Class distribution: {np.bincount(Y)}")
31
 
32
- # Split the data into train/val/test
33
  X_train, X_test, y_train, y_test = train_test_split(
34
- X, Y, test_size=0.1, random_state=42, stratify=Y
35
  )
36
-
 
37
  X_train, X_val, y_train, y_val = train_test_split(
38
  X_train, y_train, test_size=0.2222, random_state=42, stratify=y_train
39
  )
40
-
41
  logger.info(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")
42
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  data = {
44
  "train": (X_train, y_train),
45
  "val": (X_val, y_val),
@@ -49,6 +136,92 @@ def dataset_splitter(X: np.ndarray, Y: np.ndarray):
49
  return data
50
 
51
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
52
  def dataset_scaler(audio: np.ndarray, lyrics: np.ndarray):
53
  """
54
  Method to scale both audio and lyric vectors using Z-Score.
@@ -68,7 +241,7 @@ def dataset_scaler(audio: np.ndarray, lyrics: np.ndarray):
68
  -------
69
  scaled_audio : np.array
70
  Array of scaled audio features
71
- scaled_lyrics : np.array
72
  Array of scaled lyric features
73
  """
74
 
@@ -76,14 +249,11 @@ def dataset_scaler(audio: np.ndarray, lyrics: np.ndarray):
76
  audio_scaler = StandardScaler().fit(audio)
77
  lyric_scaler = StandardScaler().fit(lyrics)
78
 
79
- scaled_audio = audio_scaler.transform(audio)
80
- scaled_lyrics = lyric_scaler.transform(lyrics)
81
-
82
  # Save the trained scalers for prediction
83
- joblib.dump(audio_scaler, "models/fusion/audio_scaler.pkl")
84
- joblib.dump(lyric_scaler, "models/fusion/lyric_scaler.pkl")
85
 
86
- return scaled_audio, scaled_lyrics
87
 
88
 
89
  def instance_scaler(audio: np.ndarray, lyrics: np.ndarray):
@@ -101,15 +271,15 @@ def instance_scaler(audio: np.ndarray, lyrics: np.ndarray):
101
  -------
102
  scaled_audio : np.array
103
  Array of scaled audio feature
104
- scaled_lyrics : np.array
105
  Array of scaled lyric feature
106
  """
107
 
108
  # Apply scalers to the single inputs
109
- audio_scaler = joblib.load("models/fusion/audio_scaler.pkl")
110
- lyric_scaler = joblib.load("models/fusion/lyric_scaler.pkl")
111
 
112
  scaled_audio = audio_scaler.transform([audio])
113
- scaled_lyrics = lyric_scaler.transform(lyrics)
114
 
115
- return scaled_audio, scaled_lyrics
 
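The rewritten module below replaces the one-shot StandardScaler flow with split-aware scaling and an IncrementalPCA fitted on the training lyrics only, in batches. The batched fit in isolation looks like this (a sketch with dummy data; the 4096-dim input is illustrative only):

    import numpy as np
    from sklearn.decomposition import IncrementalPCA

    rng = np.random.default_rng(42)
    X_train_lyrics = rng.normal(size=(4000, 4096)).astype(np.float32)  # dummy stand-in

    # partial_fit in chunks keeps memory bounded; each batch must contain
    # at least n_components samples
    ipca = IncrementalPCA(n_components=512)
    batch_size = 1000
    for i in range(0, X_train_lyrics.shape[0], batch_size):
        ipca.partial_fit(X_train_lyrics[i:i + batch_size])

    X_train_reduced = ipca.transform(X_train_lyrics)  # shape (4000, 512)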
1
+ from sklearn.preprocessing import StandardScaler
2
  from sklearn.model_selection import train_test_split
3
+ from src.utils.config_loader import AUDIO_SCALER, LYRICS_SCALER, PCA_SCALER, PCA_MODEL
4
+ from sklearn.decomposition import IncrementalPCA
6
 
7
  import joblib
8
  import numpy as np
9
  import logging
11
 
12
  logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
13
  logger = logging.getLogger(__name__)
14
 
15
 
16
+ def dataset_splitter(X: np.ndarray, Y: np.ndarray, ids: np.ndarray = None):
17
  """
18
+ Splits X, Y (and optional ids) into stratified train/val/test sets.
19
+ The ids are split in lockstep and returned under a separate "ids" key.
20
 
21
  Parameters
22
  ----------
23
  X : np.array
24
+ Feature vectors
25
  Y : np.array
26
+ Labels
27
+ ids : np.array, optional
28
+ Identifiers (filenames or row indices); defaults to row indices
33
 
34
  Returns
35
  -------
36
+ data : dict
37
+ A dictionary of np.arrays: {train, val, test}
38
+ Each split maps to a tuple (X_split, y_split); data["ids"] holds (ids_train, ids_val, ids_test)
39
  """
40
 
41
  logger.info(f"Dataset shape: {X.shape}, Labels: {len(Y)}")
42
  logger.info(f"Class distribution: {np.bincount(Y)}")
43
 
44
+ # First split: train vs test (ids fall back to row indices so they can be split too)
45
+ if ids is None:
46
+ ids = np.arange(len(Y))
47
+ X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(
48
+ X, Y, ids, test_size=0.1, random_state=42, stratify=Y
49
+ )
50
+
51
+ # Second split: train vs val (0.2222 of the remaining 90% ≈ 20% of the full set)
52
+ X_train, X_val, y_train, y_val, ids_train, ids_val = train_test_split(
53
+ X_train, y_train, ids_train, test_size=0.2222, random_state=42, stratify=y_train
54
+ )
53
+
54
  logger.info(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")
55
 
56
+ data = {
57
+ "train": (X_train, y_train),
58
+ "val": (X_val, y_val),
59
+ "test": (X_test, y_test),
60
+ "ids": (ids_train, ids_val, ids_test),
61
+ }
62
+
63
+ return data
63
+
64
+
65
+ def scale_pca(data: dict):
66
+ """
67
+ Scales the audio and lyric splits, then applies PCA to the lyric vectors.
68
+
69
+ Parameters
70
+ ----------
71
+ data : dictionary
72
+ Dictionary containing the splits
73
+
74
+ Returns
75
+ -------
76
+ data : dict{np.array}
77
+ A dictionary of np.arrays containing the scaled train/val/test splits.
78
+ """
79
+
80
+ # Destructure the dictionary to get data split
81
+ X_train, y_train = data["train"]
82
+ X_val, y_val = data["val"]
83
+ X_test, y_test = data["test"]
84
+
85
+ # Split the concatenated embedding into audio (first 384 dims) and lyrics (rest)
86
+ X_train_audio, X_train_lyrics = X_train[:, :384], X_train[:, 384:]
87
+ X_test_audio, X_test_lyrics = X_test[:, :384], X_test[:, 384:]
88
+ X_val_audio, X_val_lyrics = X_val[:, :384], X_val[:, 384:]
89
+
90
+ # Fit the scalers on the train data only; reuse them to transform val and test
91
+ audio_scaler, lyric_scaler = dataset_scaler(X_train_audio, X_train_lyrics)
92
+
93
+ # Transform the rest of the splits using the scalers
94
+ X_train_audio = audio_scaler.transform(X_train_audio)
95
+ X_test_audio = audio_scaler.transform(X_test_audio)
96
+ X_val_audio = audio_scaler.transform(X_val_audio)
97
+
98
+ X_train_lyrics = lyric_scaler.transform(X_train_lyrics)
99
+ X_test_lyrics = lyric_scaler.transform(X_test_lyrics)
100
+ X_val_lyrics = lyric_scaler.transform(X_val_lyrics)
101
+
102
+ # Fit PCA on TRAINING lyrics only
103
+ ipca = IncrementalPCA(n_components=512)
104
+ batch_size = 1000
105
+
106
+ for i in range(0, X_train_lyrics.shape[0], batch_size):
107
+ ipca.partial_fit(X_train_lyrics[i:i + batch_size])
108
+
109
+ # Reduce each split with the fitted PCA
110
+ X_train_lyrics = ipca.transform(X_train_lyrics)
111
+ X_test_lyrics = ipca.transform(X_test_lyrics)
112
+ X_val_lyrics = ipca.transform(X_val_lyrics)
113
+
114
+ # Apply scaler to the PCA output
115
+ pca_lyric_scaler = StandardScaler().fit(X_train_lyrics)
116
+
117
+ X_train_lyrics = pca_lyric_scaler.transform(X_train_lyrics)
118
+ X_test_lyrics = pca_lyric_scaler.transform(X_test_lyrics)
119
+ X_val_lyrics = pca_lyric_scaler.transform(X_val_lyrics)
120
+
121
+ # Concatenate them back to their original form, but scaled
122
+ X_train = np.concatenate([X_train_audio, X_train_lyrics], axis=1)
123
+ X_test = np.concatenate([X_test_audio, X_test_lyrics], axis=1)
124
+ X_val = np.concatenate([X_val_audio, X_val_lyrics], axis=1)
125
+
126
+ # Persist the fitted PCA and post-PCA scaler for prediction
127
+ joblib.dump(ipca, PCA_MODEL)
128
+ joblib.dump(pca_lyric_scaler, PCA_SCALER)
129
+
130
  data = {
131
  "train": (X_train, y_train),
132
  "val": (X_val, y_val),
 
136
  return data
137
 
138
 
139
+ def scale_pca_lyrics(data: dict):
140
+ """
141
+ Scales lyric-only splits, then reduces them with PCA.
142
+
143
+ Parameters
144
+ ----------
145
+ data : dictionary
146
+ Dictionary containing the splits
147
+
148
+ Returns
149
+ -------
150
+ data : dict{np.array}
151
+ A dictionary of np.arrays containing the scaled, PCA-reduced splits.
152
+ """
153
+
154
+ # Destructure the dictionary to get data split
155
+ X_train, y_train = data["train"]
156
+ X_val, y_val = data["val"]
157
+ X_test, y_test = data["test"]
158
+
159
+ lyric_scaler = StandardScaler().fit(X_train)
160
+ joblib.dump(lyric_scaler, LYRICS_SCALER)
161
+
162
+ X_train = lyric_scaler.transform(X_train)
163
+ X_test = lyric_scaler.transform(X_test)
164
+ X_val = lyric_scaler.transform(X_val)
165
+
166
+ # Fit PCA on TRAINING lyrics only
167
+ ipca = IncrementalPCA(n_components=512)
168
+ batch_size = 1000
169
+
170
+ for i in range(0, X_train.shape[0], batch_size):
171
+ ipca.partial_fit(X_train[i:i + batch_size])
172
+
173
+ # Transform in batches
174
+ X_train = ipca.transform(X_train)
175
+ X_test = ipca.transform(X_test)
176
+ X_val = ipca.transform(X_val)
177
+
178
+ joblib.dump(ipca, PCA_MODEL)
179
+
180
+ data = {
181
+ "train": (X_train, y_train),
182
+ "val": (X_val, y_val),
183
+ "test": (X_test, y_test),
184
+ }
185
+
186
+ return data
187
+
188
+
189
+ def scale(data: dict):
190
+ """
191
+ Scales the splits with a single StandardScaler; no PCA is applied.
192
+
193
+ Parameters
194
+ ----------
195
+ data : dictionary
196
+ Dictionary containing the splits
197
+
198
+ Returns
199
+ -------
200
+ data : dict{np.array}
201
+ A dictionary of np.arrays containing the scaled train/val/test splits.
202
+ """
203
+
204
+ # Destructure the dictionary to get data split
205
+ X_train, y_train = data["train"]
206
+ X_val, y_val = data["val"]
207
+ X_test, y_test = data["test"]
208
+
209
+ audio_scaler = StandardScaler(with_mean=False).fit(X_train)
210
+ joblib.dump(audio_scaler, AUDIO_SCALER)
211
+
212
+ # Transform all splits with the scaler fitted on train
213
+ X_train = audio_scaler.transform(X_train)
214
+ X_test = audio_scaler.transform(X_test)
215
+ X_val = audio_scaler.transform(X_val)
216
+
217
+ data = {
218
+ "train": (X_train, y_train),
219
+ "val": (X_val, y_val),
220
+ "test": (X_test, y_test),
221
+ }
222
+
223
+ return data
224
+
225
  def dataset_scaler(audio: np.ndarray, lyrics: np.ndarray):
226
  """
227
  Method to scale both audio and lyric vectors using Z-Score.
 
241
  -------
242
+ audio_scaler : StandardScaler
243
+ Scaler fitted on the audio features
244
+ lyric_scaler : StandardScaler
245
+ Scaler fitted on the lyric features
246
  """
247
 
 
249
  audio_scaler = StandardScaler().fit(audio)
250
  lyric_scaler = StandardScaler().fit(lyrics)
251
 
252
  # Save the trained scalers for prediction
253
+ joblib.dump(audio_scaler, AUDIO_SCALER)
254
+ joblib.dump(lyric_scaler, LYRICS_SCALER)
255
 
256
+ return audio_scaler, lyric_scaler
257
 
258
 
259
  def instance_scaler(audio: np.ndarray, lyrics: np.ndarray):
 
271
  -------
272
  scaled_audio : np.array
273
  Array of scaled audio feature
274
+ scaled_lyric : np.array
275
  Array of scaled lyric feature
276
  """
277
 
278
  # Apply scalers to the single inputs
279
+ audio_scaler = joblib.load(AUDIO_SCALER)
280
+ lyric_scaler = joblib.load(LYRICS_SCALER)
281
 
282
  scaled_audio = audio_scaler.transform([audio])
283
+ scaled_lyric = lyric_scaler.transform(lyrics)
284
 
285
+ return scaled_audio, scaled_lyric
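Taken together, a prediction-time input must pass through the same artifacts in the same order as training: scale audio, scale lyrics, reduce lyrics with PCA, rescale the PCA output, then concatenate. A hedged sketch of that mirror path follows; instance_transform is a hypothetical helper, and the 384-dim audio / 512-dim PCA split mirrors scale_pca above.

    import joblib
    import numpy as np
    from src.utils.config_loader import AUDIO_SCALER, LYRICS_SCALER, PCA_MODEL, PCA_SCALER

    def instance_transform(audio_vec: np.ndarray, lyrics_vec: np.ndarray) -> np.ndarray:
        # Load the artifacts persisted during training
        audio_scaler = joblib.load(AUDIO_SCALER)
        lyric_scaler = joblib.load(LYRICS_SCALER)
        ipca = joblib.load(PCA_MODEL)
        pca_scaler = joblib.load(PCA_SCALER)

        # Scale -> PCA -> scale, in the same order as scale_pca
        scaled_audio = audio_scaler.transform(audio_vec.reshape(1, -1))
        scaled_lyrics = lyric_scaler.transform(lyrics_vec.reshape(1, -1))
        reduced = pca_scaler.transform(ipca.transform(scaled_lyrics))

        # Concatenate back into the fused (1, 384 + 512) feature vector
        return np.concatenate([scaled_audio, reduced], axis=1)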