Auto-deploy from GitHub: 5ac21603a8274a2350875ec7db1bd58cbf2ee539
- config/data_config.yml +7 -3
- config/model_config.yml +5 -5
- scripts/explain_test.py +1 -1
- scripts/predict.py +38 -27
- scripts/train.py +111 -66
- src/llm2vectrain/llm2vec_trainer.py +3 -107
- src/llm2vectrain/model.py +1 -0
- src/models/mlp.py +12 -6
- src/musiclime/factorization.py +14 -1
- src/musiclime/wrapper.py +25 -27
- src/preprocessing/audio_preprocessor.py +41 -10
- src/preprocessing/preprocessor.py +70 -12
- src/spectttra/spectttra.py +99 -1
- src/spectttra/spectttra_trainer.py +43 -74
- src/utils/config_loader.py +5 -1
- src/utils/dataset.py +193 -23
config/data_config.yml
CHANGED
@@ -1,8 +1,12 @@
 base_dir: "."
 
 paths:
-  dataset_npz: "data/processed/…
-  dataset_csv: "data/external/…
+  dataset_npz: "data/processed/training_data_40k.npz"
+  dataset_csv: "data/external/dataset_40000.csv"
   raw_dir: "data/raw"
   processed_dir: "data/processed"
-  pca_path: "…
+  pca_path: "models/fusion/pca.pkl"
+  lyrics_scaler: "models/fusion/lyrics_scaler.pkl"
+  pca_scaler: "models/fusion/pca_scaler.pkl"
+  audio_scaler: "models/fusion/audio_scaler.pkl"
+  raw_dataset_npz: "data/processed/raw_training_data_40k.npz"
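These keys are consumed through src/utils/config_loader.py, whose diff is truncated at the bottom of this page. A minimal sketch of how that module likely resolves them, assuming it mirrors the constants imported elsewhere in this commit (DATASET_NPZ, RAW_DATASET_NPZ, DATASET_CSV, PCA_MODEL are assumptions beyond the one visible line):

# Hypothetical sketch of src/utils/config_loader.py; only BASE_DIR is visible in the diff below.
import yaml
from pathlib import Path

with open("config/data_config.yml") as f:
    config = yaml.safe_load(f)

BASE_DIR = Path(config["base_dir"]).resolve()  # matches the visible context line

# Assumed: each path key is resolved against BASE_DIR
DATASET_NPZ = str(BASE_DIR / config["paths"]["dataset_npz"])
RAW_DATASET_NPZ = str(BASE_DIR / config["paths"]["raw_dataset_npz"])
DATASET_CSV = str(BASE_DIR / config["paths"]["dataset_csv"])
PCA_MODEL = str(BASE_DIR / config["paths"]["pca_path"])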
config/model_config.yml
CHANGED
@@ -1,11 +1,11 @@
 mlp:
-  hidden_layers: […
-  dropout: [0.…
+  hidden_layers: [512, 256, 128] # 3 hidden layers
+  dropout: [0.5, 0.4, 0.3] # Dropout rates for each layer
   learning_rate: 0.0001 # Adam optimizer
   batch_size: 128 # Number of samples processed together
   epochs: 200 # Maximum training iterations
-  patience: …
+  patience: 15 # Early stopping patience
 
-  weight_decay: 0.…
+  weight_decay: 0.01 # L2 regularization
   gradient_clipping: 0.5 # Prevent exploding gradients
-  mixup_alpha: 0.…
+  mixup_alpha: 0.1 # For data augmentation during training, 0 disables MixUp
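mixup_alpha parameterizes a Beta distribution that blends pairs of training samples. The repo's trainer is not shown in this commit, so the following is only an illustrative sketch of how such a value is typically consumed:

# Illustrative MixUp step (not the repo's code): blend each sample with a random partner.
import numpy as np

def mixup_batch(x, y, alpha=0.1):
    """Return a convex combination of the batch with a shuffled copy of itself."""
    if alpha <= 0:
        return x, y  # alpha of 0 disables MixUp, as the config comment says
    rng = np.random.default_rng()
    lam = rng.beta(alpha, alpha)
    perm = rng.permutation(len(x))
    return lam * x + (1 - lam) * x[perm], lam * y + (1 - lam) * y[perm]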
scripts/explain_test.py
CHANGED
@@ -34,7 +34,7 @@ def explain():
         audio=y,
         lyrics=lyrics_text,
         predict_fn=predictor,
-        num_samples=…
+        num_samples=5,
         labels=(1,),
     )
 
scripts/predict.py
CHANGED
@@ -3,16 +3,13 @@ from src.spectttra.spectttra_trainer import spectttra_predict
 from src.llm2vectrain.model import load_llm2vec_model
 from src.llm2vectrain.llm2vec_trainer import l2vec_single_train, load_pca_model
 from src.models.mlp import build_mlp, load_config
-from pathlib import Path
-from src.utils.config_loader import DATASET_NPZ
 from src.utils.dataset import instance_scaler
 
-from pathlib import Path
 import numpy as np
-import …
+import pandas as pd
 
 
-def predict_pipeline(…
+def predict_pipeline(audio_file, lyrics):
     """
     Predict script which includes preprocessing, feature extraction, and
     training the MLP model for a single data sample.
@@ -34,49 +31,63 @@ def predict_pipeline(audio, lyrics: str):
     A numerical representation of the prediction
     """
 
-    # Instantiate…
-    X, Y = None, None
-
-    # Instantiate LLM2Vec Model
+    # 1.) Instantiate LLM2Vec Model
    llm2vec_model = load_llm2vec_model()
 
-    # Preprocess both audio and lyrics
-    audio, lyrics = single_preprocessing(…
+    # 2.) Preprocess both audio and lyrics
+    audio, lyrics = single_preprocessing(audio_file, lyrics)
 
-    # Call the train method for both models
+    # 3.) Call the train method for both models
    audio_features = spectttra_predict(audio)
    lyrics_features = l2vec_single_train(llm2vec_model, lyrics)
 
-    # …
+    # 4.) Scale the vectors using Z-Score
+    audio_features, lyrics_features = instance_scaler(audio_features, lyrics_features)
+
+    # 5.) Reduce the lyrics using saved PCA model
    reduced_lyrics = load_pca_model(lyrics_features)
 
-    # Scale the vectors using Z-Score
+    # Scale the vectors using Z-Score again
    audio_features, reduced_lyrics = instance_scaler(audio_features, reduced_lyrics)
 
-    # Concatenate the vectors of audio_features + lyrics_features
+    # 6.) Concatenate the vectors of audio_features + lyrics_features
    results = np.concatenate([audio_features, reduced_lyrics], axis=1)
 
    # ---- Load MLP Classifier ----
    config = load_config("config/model_config.yml")
    classifier = build_mlp(input_dim=results.shape[1], config=config)
 
-    # Load trained weights (make sure this path matches where you saved your model)
-    model_path = "models/mlp/…
+    # 7.) Load trained weights (make sure this path matches where you saved your model)
+    model_path = "models/mlp/mlp_best.pth"
    classifier.load_model(model_path)
    classifier.model.eval()
 
-    # Run prediction
-    probability, prediction, label = classifier.predict_single(results)
+    # 8.) Run prediction
+    probability, prediction, label = classifier.predict_single(results.flatten())
 
-    return {
-        "probability": probability,
-        "label": label,
-        "prediction": "AI-Generated" if prediction == 0 else "Human-Composed",
-    }
+    return {"probability": probability, "prediction": prediction, "label": label}
 
 
 if __name__ == "__main__":
     # Example usage (replace with real inputs, place song inside data/raw.)
-    …
-    …
-    …
+    data = pd.read_csv("data/raw/predict_data_final.csv")
+
+    result = []
+    label = []
+    for row in data.itertuples():
+        prediction = predict_pipeline(row.song, row.lyrics)
+        result.append(
+            {
+                "song": row.song,
+                "label": row.label,
+                "predicted_label": prediction["label"],
+                "probability": prediction["probability"],
+            }
+        )
+
+    for r in result:
+        print(f"Song: {r['song']}")
+        print(f"Actual Label: {r['label']}")
+        print(f"Predicted: {r['predicted_label']}")
+        print(f"Confidence: {r['probability']: .8f}%")
+        print("-" * 50)
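The pipeline calls instance_scaler twice: once on the raw audio/lyrics embeddings and again after PCA reduction. The function lives in src/utils/dataset.py, whose diff is not shown on this page; a minimal sketch of per-instance z-scoring, assuming it standardizes each vector by its own statistics:

# Hypothetical sketch of instance_scaler (assumed behavior, not the repo's code):
# z-score each feature vector independently so no fitted scaler is needed per sample.
import numpy as np

def instance_scaler(audio_features, lyrics_features, eps=1e-8):
    def zscore(v):
        v = np.asarray(v, dtype=np.float32).reshape(1, -1)
        return (v - v.mean()) / (v.std() + eps)
    return zscore(audio_features), zscore(lyrics_features)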
scripts/train.py
CHANGED
@@ -4,14 +4,14 @@ from src.llm2vectrain.model import load_llm2vec_model
 from src.llm2vectrain.llm2vec_trainer import l2vec_train
 from src.models.mlp import build_mlp, load_config
 
-from src.utils.config_loader import DATASET_NPZ
-from src.utils.dataset import dataset_scaler, dataset_splitter
-from sklearn.decomposition import PCA
+from src.utils.config_loader import DATASET_NPZ
 
 from pathlib import Path
+from src.utils.config_loader import DATASET_NPZ, RAW_DATASET_NPZ
+from src.utils.dataset import scale_pca
+
 import numpy as np
 import logging
-import joblib
 
 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)
@@ -21,6 +21,10 @@ def train_mlp_model(data : dict):
     """
     Train the MLP model with extracted features.
 
+    Parameters
+    ----------
+    data : dict{np.array}
+        A dictionary of np.arrays, containing the train/test/val split.
     Parameters
     ----------
     data : dict{np.array}
@@ -31,6 +35,11 @@ def train_mlp_model(data : dict):
     # Load MLP configuration
     config = load_config("config/model_config.yml")
 
+    # Destructure the dictionary to get data split
+    X_train, y_train = data["train"]
+    X_val, y_val = data["val"]
+    X_test, y_test = data["test"]
+
     # Destructure the dictionary to get data split
     X_train, y_train = data["train"]
     X_val, y_val = data["val"]
@@ -47,6 +56,7 @@ def train_mlp_model(data : dict):
 
     # Load best model and evaluate on test set
     try:
+        mlp_classifier.load_model("models/mlp/mlp_best.pth")
         mlp_classifier.load_model("models/mlp/mlp_best.pth")
         logger.info("Loaded best model for final evaluation")
     except FileNotFoundError:
@@ -55,8 +65,10 @@ def train_mlp_model(data : dict):
     # Final evaluation
     test_results = mlp_classifier.evaluate(X_test, y_test)
 
+
     # Save final model
     mlp_classifier.save_model("models/mlp/mlp_multimodal.pth")
+    mlp_classifier.save_model("models/mlp/mlp_multimodal.pth")
 
     logger.info("MLP training completed successfully!")
     logger.info(f"Final test accuracy: {test_results['test_accuracy']:.2f}%")
@@ -64,6 +76,7 @@ def train_mlp_model(data : dict):
     return mlp_classifier
 
 
+
 def train_pipeline():
     """
     Training script which includes preprocessing, feature extraction, and training the MLP model.
@@ -79,82 +92,114 @@ def train_pipeline():
     None
     """
 
-    # …
-    …
+    # Set constant sizes
+    BATCH_SIZE = 200
+    AUDIO_SIZE = 384
+    LYRIC_SIZE = 2048
 
-    dataset_path = Path(…
+    dataset_path = Path(RAW_DATASET_NPZ)
 
     if dataset_path.exists():
         logger.info("Training dataset already exists. Loading file...")
 
-        loaded_data = np.load(…
-        …
-        …
+        loaded_data = np.load(RAW_DATASET_NPZ)
+        data = {
+            "train": (loaded_data["X_train"], loaded_data["y_train"]),
+            "test": (loaded_data["X_test"], loaded_data["y_test"]),
+            "val": (loaded_data["X_val"], loaded_data["y_val"]),
+        }
     else:
+        logger.info("Training dataset does not exist. Processing data...")
         logger.info("Training dataset does not exist. Processing data...")
         # Get batches from dataset and return full Y labels
-        …
+        splits, split_lengths = dataset_read(batch_size=BATCH_SIZE)
         batch_count = 1
 
-        # Instantiate LLM2Vec…
-        …
-
-        # Preallocate…
-        …
-
-        # Save…
-        logger.info("Saving dataset…
-        np.savez(…
-
-    …
-    …
+        # Instantiate LLM2Vec Model
+        l2v = load_llm2vec_model()
+
+        # Preallocate arrays
+        X_train = np.zeros((split_lengths[0], AUDIO_SIZE + LYRIC_SIZE), dtype=np.float32)
+        X_test = np.zeros((split_lengths[1], AUDIO_SIZE + LYRIC_SIZE), dtype=np.float32)
+        X_val = np.zeros((split_lengths[2], AUDIO_SIZE + LYRIC_SIZE), dtype=np.float32)
+
+        y_train = np.zeros(split_lengths[0], dtype=np.int32)
+        y_test = np.zeros(split_lengths[1], dtype=np.int32)
+        y_val = np.zeros(split_lengths[2], dtype=np.int32)
+
+        X_splits = [X_train, X_test, X_val]
+        y_splits = [y_train, y_test, y_val]
+
+        # Loop through the three splits
+        for split_idx, split in enumerate(splits):
+            start_idx = 0
+
+            # Loop through batches for each split
+            for batch in split:
+                if len(batch) == 0:
+                    continue  # skip empty batch safely
+
+                logger.info(f"Bulk Preprocessing batch {batch_count}...")
+                audio, lyrics = bulk_preprocessing(batch, batch_count)
+                batch_labels = batch['target'].values
+
+                # Extract audio features
+                logger.info("Starting SpecTTTra feature extraction...")
+                audio_features = spectttra_train(audio)
+
+                # Call the train method for LLM2Vec
+                logger.info(f"\nStarting LLM2Vec feature extraction...")
+                lyric_features = l2vec_train(l2v, lyrics)
+
+                # Concatenate the two features
+                batch_feature = np.concatenate([audio_features, lyric_features], axis=1)
+
+                # Allocate them to the preallocated blocks
+                bsz = batch_feature.shape[0]
+                X_splits[split_idx][start_idx:start_idx + bsz, :] = batch_feature
+                y_splits[split_idx][start_idx:start_idx + bsz] = batch_labels
+
+                logger.info(f"Batch {batch_count}: {bsz} samples, start_idx={start_idx}")
+
+                batch_count += 1
+                start_idx += bsz
+
+        # Save raw (unscaled) dataset
+        logger.info("Saving raw dataset...")
+        np.savez(
+            RAW_DATASET_NPZ,
+            X_train=X_train, y_train=y_train,
+            X_val=X_val, y_val=y_val,
+            X_test=X_test, y_test=y_test,
+        )
+
+        # Run scaling
+        logger.info("Running standard scaling...")
+        data = {
+            "train": (X_train, y_train),
+            "val": (X_val, y_val),
+            "test": (X_test, y_test),
+        }
+
+    # Scale and use PCA fitting for all raw data
+    logger.info("Scaling and applying PCA...")
+    data = scale_pca(data)
+
+    # Save scaled dataset
+    X_train, y_train = data["train"]
+    X_val, y_val = data["val"]
+    X_test, y_test = data["test"]
 
+    logger.info("Saving scaled dataset...")
+    np.savez(
+        DATASET_NPZ,
+        X_train=X_train, y_train=y_train,
+        X_val=X_val, y_val=y_val,
+        X_test=X_test, y_test=y_test,
+    )
 
     logger.info("Starting MLP training...")
     train_mlp_model(data)
 
-
 if __name__ == "__main__":
     train_pipeline()
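scale_pca is imported from src.utils.dataset, whose full diff is not on this page. A hedged sketch of what it plausibly does, inferred from the artifacts this commit loads elsewhere (audio_scaler.pkl, lyrics_scaler.pkl, pca.pkl, pca_scaler.pkl) and the 384/2048-dimension constants above; the 512-component PCA matches the shape comments in wrapper.py but is still an assumption:

# Hypothetical sketch of scale_pca (assumed behavior): fit scalers and PCA on the
# train split only, apply them to every split, and persist them to models/fusion/.
import joblib
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

AUDIO_SIZE = 384  # matches the constant in train.py

def scale_pca(data, n_components=512):
    (X_tr, y_tr), (X_va, y_va), (X_te, y_te) = data["train"], data["val"], data["test"]

    def split_cols(X):
        # audio columns come first, per the concatenation order in train.py
        return X[:, :AUDIO_SIZE], X[:, AUDIO_SIZE:]

    a_tr, l_tr = split_cols(X_tr)
    audio_scaler = StandardScaler().fit(a_tr)
    lyrics_scaler = StandardScaler().fit(l_tr)
    pca = PCA(n_components=n_components).fit(lyrics_scaler.transform(l_tr))
    pca_scaler = StandardScaler().fit(pca.transform(lyrics_scaler.transform(l_tr)))

    def transform(X):
        a, l = split_cols(X)
        reduced = pca_scaler.transform(pca.transform(lyrics_scaler.transform(l)))
        return np.concatenate([audio_scaler.transform(a), reduced], axis=1)

    for name, obj in [("audio_scaler", audio_scaler), ("lyrics_scaler", lyrics_scaler),
                      ("pca", pca), ("pca_scaler", pca_scaler)]:
        joblib.dump(obj, f"models/fusion/{name}.pkl")

    return {
        "train": (transform(X_tr), y_tr),
        "val": (transform(X_va), y_va),
        "test": (transform(X_te), y_te),
    }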
src/llm2vectrain/llm2vec_trainer.py
CHANGED
@@ -1,115 +1,11 @@
-from sklearn.decomposition import IncrementalPCA
-from sklearn.preprocessing import StandardScaler
 from pathlib import Path
+from src.utils.config_loader import PCA_MODEL
 
-import numpy as np
-import pickle
-import torch
-import os
 import joblib
+import torch
 
-# Initialize PCA and StandardScaler globally for training
-_pca_trainer = None
-
-class SimplePCATrainer:
-    """
-    A simple PCA trainer that uses IncrementalPCA to fit data in batches.
-    It saves checkpoints every 5 batches and can save the final model.
-
-    Args:
-        None
-
-    Returns:
-        None
-
-    Attributes:
-        pca: The IncrementalPCA model.
-        scaler: StandardScaler for normalizing data.
-        fitted: Boolean indicating if the model has been initialized.
-        batch_count_pca: Counter for the number of batches processed.
-
-    Methods:
-        process_batch(vectors): Processes a batch of vectors, fits the PCA model incrementally.
-        save_final(model_path): Saves the final PCA model to the specified path.
-    """
-
-    # Initialize the trainer
-    def __init__(self):
-        self.pca = None
-        self.scaler = StandardScaler()
-        self.fitted = False
-        self.batch_count_pca = 0
-
-    def _determine_optimal_components(self, vectors):
-        """
-        Determine the optimal number of PCA components to retain 95% variance.
-
-        Args:
-            vectors: The input data to analyze.
-        Returns:
-            n_components: The optimal number of components.
-        """
-        temp_pca = IncrementalPCA()
-        temp_pca.fit(vectors)
-        cumsum_var = np.cumsum(temp_pca.explained_variance_ratio_)
-        n_comp_95 = np.argmax(cumsum_var >= 0.95) + 1
-        return min(n_comp_95, vectors.shape[1] // 2)
-
-    def process_batch(self, vectors):
-        """
-        Process a batch of vectors, fitting the PCA model incrementally.
-
-        Args:
-            vectors: The input data batch to process.
-        Returns:
-            reduced_vectors: The PCA-transformed data.
-
-        Note: This method saves a checkpoint every 5 batches.
-        """
-        if not self.fitted:
-            # First batch - initialize everything
-            n_components = self._determine_optimal_components(vectors)
-            self.pca = IncrementalPCA(n_components=n_components, batch_size=1000)
-            self.scaler.fit(vectors)
-            self.fitted = True
-            print(f"Initialized PCA with {n_components} components")
-
-        # Process batch
-        vectors_scaled = self.scaler.transform(vectors)
-        self.pca.partial_fit(vectors_scaled)
-        reduced_vectors = self.pca.transform(vectors_scaled)
-
-        self.batch_count_pca += 1
-
-        # Save checkpoint every 5 batches
-        if self.batch_count_pca % 5 == 0:
-            os.makedirs("pca_checkpoints", exist_ok=True)
-            with open(f"pca_checkpoints/checkpoint_batch_{self.batch_count_pca}.pkl", 'wb') as f:
-                pickle.dump({'pca': self.pca, 'scaler': self.scaler}, f)
-            print(f"Saved checkpoint at batch {self.batch_count_pca}")
-
-        print(f"Processed batch {self.batch_count_pca}, shape: {vectors.shape} -> {reduced_vectors.shape}")
-        return reduced_vectors
-
-    def save_final(self, model_path):
-        """
-        Save the final PCA model to the specified path.
-
-        Args:
-            model_path: The file path to save the PCA model.
-
-        Returns:
-            None
-
-        Note: Change the model path as needed in the data_config.yml file.
-        """
-        os.makedirs(os.path.dirname(model_path), exist_ok=True)
-        with open(model_path, 'wb') as f:
-            pickle.dump({'pca': self.pca, 'scaler': self.scaler}, f)
-        print(f"Final model saved to {model_path}. Total variance explained: {np.sum(self.pca.explained_variance_ratio_):.4f}")
 
 ## For Single Input
-def load_pca_model(vectors, model_path=…
+def load_pca_model(vectors, model_path=PCA_MODEL):
     """
     Load a pre-trained PCA model and transform the input vectors.
 
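The rest of load_pca_model is cut off above. A plausible body for the slimmed-down version, assuming the pickle at PCA_MODEL is now the fitted PCA object persisted by the training pipeline (rather than the old {'pca', 'scaler'} dict that SimplePCATrainer saved):

# Hypothetical sketch of the new load_pca_model body (assumed, not confirmed by the diff):
import joblib
import numpy as np

def load_pca_model(vectors, model_path="models/fusion/pca.pkl"):  # PCA_MODEL in the repo
    """Load a pre-trained PCA model and transform the input vectors."""
    pca = joblib.load(model_path)
    vectors = np.atleast_2d(np.asarray(vectors, dtype=np.float32))
    return pca.transform(vectors)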
src/llm2vectrain/model.py
CHANGED
@@ -16,6 +16,7 @@ def load_llm2vec_model():
     tokenizer = AutoTokenizer.from_pretrained(
         model_id, padding=True, truncation=True, max_length=512, cache_dir=cache_dir
     )
+
     config = AutoConfig.from_pretrained(
         model_id, trust_remote_code=True, cache_dir=cache_dir
     )
src/models/mlp.py
CHANGED
@@ -52,6 +52,7 @@ import torch.nn as nn
 import torch.optim as optim
 import numpy as np
 import yaml
+import torch.nn.functional as F
 
 logger = logging.getLogger(__name__)
@@ -441,7 +442,7 @@ class MLPClassifier:
 
         return probabilities, predictions
 
-    def predict_single(self, features: np.ndarray) -> Tuple[float, int, str]:
+    def predict_single(self, features: np.ndarray, temperature: float = 2.5) -> Tuple[float, int, str]:
         """
         Predict whether a single song is AI-generated or human-composed.
@@ -482,14 +483,19 @@ class MLPClassifier:
                 f"Expected features for 1 song, got {features.shape[0]} songs. Use predict_batch() instead."
             )
 
-        …
-        …
+        self.model.eval()
+        with torch.no_grad():
+            features_tensor = torch.FloatTensor(features).to(self.device)
+            outputs = self.model(features_tensor)
+            logit = torch.logit(outputs.clamp(1e-6, 1 - 1e-6))
+            probabilities = torch.sigmoid(logit / temperature).item()
+            probabilities = np.clip(probabilities, 0.01, 0.99)
 
         # Extract single results
-        …
-        prediction = int(predictions[0])
+        prediction = int(probabilities >= 0.5)
         label = "Human-Composed" if prediction == 1 else "AI-Generated"
-        …
+        probability = probabilities*100 if prediction == 1 else (1 - probabilities)*100
+
         return probability, prediction, label
 
     def predict_batch(self, features: np.ndarray, return_details: bool = False) -> Dict:
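The new predict_single applies temperature scaling: it recovers the logit from the sigmoid output, divides it by a temperature greater than 1, and re-applies the sigmoid, which pulls overconfident probabilities toward 0.5. A standalone illustration of the same arithmetic:

# Standalone illustration of the temperature scaling used in predict_single above.
import torch

raw = torch.tensor([0.98])                      # overconfident model output
logit = torch.logit(raw.clamp(1e-6, 1 - 1e-6))  # ln(0.98/0.02) ≈ 3.89
softened = torch.sigmoid(logit / 2.5)           # temperature 2.5 -> ≈ 0.83
print(float(raw[0]), float(softened[0]))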
src/musiclime/factorization.py
CHANGED
@@ -61,7 +61,20 @@ class OpenUnmixFactorization:
 
     def _separate_sources(self):
         waveform = np.expand_dims(self.audio, axis=1)
-        …
+
+        # Load openunmix .pth files from local dir
+        model_path = "models/musiclime"
+
+        # Specify targets
+        targets = ["vocals", "bass", "drums", "other"]
+
+        # Then load openunmix files to openunmix' method
+        prediction = predict.separate(
+            torch.as_tensor(waveform).float(),
+            rate=44100,
+            model_str_or_path=model_path,
+            targets=targets,
+        )
 
         components = [prediction[key][0].mean(dim=0).numpy() for key in prediction]
         names = list(prediction.keys())
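The comprehension on the last lines assumes openunmix's predict.separate returns a dict mapping each target name to a tensor of shape (nb_samples, nb_channels, nb_timesteps); indexing [0] takes the first item and mean(dim=0) folds the channels to mono. A self-contained shape check under that assumption, using dummy tensors in place of real estimates:

# Illustrative check of the shapes this code assumes from predict.separate.
import torch

prediction = {t: torch.randn(1, 2, 44100) for t in ["vocals", "bass", "drums", "other"]}
components = [prediction[key][0].mean(dim=0).numpy() for key in prediction]  # mono per target
print([c.shape for c in components])  # [(44100,), (44100,), (44100,), (44100,)]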
src/musiclime/wrapper.py
CHANGED
@@ -38,8 +38,6 @@ class MusicLIMEPredictor:
         processed_lyrics = []
 
         for i, (text, audio) in enumerate(zip(texts, audios)):
-            # if i % 100 == 0:
-            #     print(f"  Preprocessing {i+1}/{len(texts)}")
             processed_audio, processed_lyric = single_preprocessing(audio, text)
             processed_audios.append(processed_audio)
             processed_lyrics.append(processed_lyric)
@@ -74,44 +72,49 @@ class MusicLIMEPredictor:
             )
         )
 
-        # Step 3: …
+        # Step 3: Scale and reduce in batch
         start_time = time.time()
-        print("[MusicLIME] …")
-        pca_model = joblib.load("models/fusion/pca.pkl")
-        reduced_lyrics_batch = pca_model.transform(
-            lyrics_features_batch
-        )  # (batch, 256)
-        pca_time = time.time() - start_time
-        print(green_bold(f"[MusicLIME] PCA completed in {pca_time:.2f}s"))
-
-        # …
-        start_time = time.time()
-        print("[MusicLIME] Scaling features (batch)...")
+        print("[MusicLIME] Scaling and reducing features (batch)...")
+
+        # Load the trained scalers
         audio_scaler = joblib.load("models/fusion/audio_scaler.pkl")
-        lyric_scaler = joblib.load("models/fusion/…
+        lyric_scaler = joblib.load("models/fusion/lyrics_scaler.pkl")
 
+        # Then apply scaling to the batch
         scaled_audio_batch = audio_scaler.transform(
             audio_features_batch
         )  # (batch, 384)
         scaled_lyrics_batch = lyric_scaler.transform(
-            reduced_lyrics_batch
-        )  # (batch, …
+            lyrics_features_batch
+        )  # (batch, 2048)
+
+        # Step 4: Apply PCA to lyrics batch
+        print("[MusicLIME] Applying PCA to lyrics (batch)")
+        pca_model = joblib.load("models/fusion/pca.pkl")
+        reduced_lyrics_batch = pca_model.transform(scaled_lyrics_batch)  # (batch, 512)
+
+        # Step 5: Apply scaler to PCA-scaled lyrics batch
+        print("[MusicLIME] Reapplying scaler to PCA-scaled batch")
+        pca_scaler = joblib.load("models/fusion/pca_scaler.pkl")
+        reduced_lyrics_batch = pca_scaler.transform(
+            reduced_lyrics_batch
+        )  # (batch, 512)
 
-        # Step …
+        # Step 6: Concatenate features
         combined_features_batch = np.concatenate(
-            [scaled_audio_batch, …
-        )
+            [scaled_audio_batch, reduced_lyrics_batch], axis=1
+        )  # (batch, sum of lyrics & audio vector dims)
         scaling_time = time.time() - start_time
         print(green_bold(f"[MusicLIME] Scaling completed in {scaling_time:.2f}s"))
 
-        # Step …
+        # Step 7: Batch MLP prediction
         start_time = time.time()
         print("[MusicLIME] Running MLP predictions (batch)...")
         if self.classifier is None:
             self.classifier = build_mlp(
                 input_dim=combined_features_batch.shape[1], config=self.config
             )
-        self.classifier.load_model("models/mlp/…
+        self.classifier.load_model("models/mlp/mlp_best.pth")
 
         probabilities, predictions = self.classifier.predict(combined_features_batch)
@@ -122,17 +125,12 @@ class MusicLIMEPredictor:
 
         # Total time summary
         total_time = (
-            preprocessing_time
-            + audio_time
-            + lyrics_time
-            + pca_time
-            + scaling_time
-            + mlp_time
+            preprocessing_time + audio_time + lyrics_time + scaling_time + mlp_time
         )
         print(f"[MusicLIME] Batch processing complete!")
         print(
             green_bold(
-                f"[MusicLIME] Total time: {total_time:.2f}s (Preprocessing: {preprocessing_time:.2f}s, Audio: {audio_time:.2f}s, Lyrics: {lyrics_time:.2f}s, …
+                f"[MusicLIME] Total time: {total_time:.2f}s (Preprocessing: {preprocessing_time:.2f}s, Audio: {audio_time:.2f}s, Lyrics: {lyrics_time:.2f}s, Scaling: {scaling_time:.2f}s, MLP: {mlp_time:.2f}s)"
             )
         )
 
src/preprocessing/audio_preprocessor.py
CHANGED
@@ -39,7 +39,7 @@ class AudioPreprocessor:
 
     """
 
-    def __init__(self, script="train", waveform_norm="…
+    def __init__(self, script="train", waveform_norm="peak"):
        self.SCRIPT = script
        self.INPUT_SAMPLING = 48000
        self.TARGET_SAMPLING = 16000
@@ -71,7 +71,27 @@ class AudioPreprocessor:
             audiofile = f"{audiofile}.mp3"
             file = self.INPUT_PATH / audiofile
 
-            …
+            # FIXED: Force librosa to load properly
+            # Load at native sample rate first, then we will resample later
+            y, sr = librosa.load(str(file), sr=None, mono=False, dtype=np.float32)
+
+            # If loading fails (all zeros), try with explicit sample rate
+            if np.abs(y).max() < 0.0001:
+                print(f"Warning: First load failed, trying with sr=48000")
+                y, sr = librosa.load(
+                    str(file), sr=48000, mono=False, dtype=np.float32
+                )
+
+            # Last resort: use soundfile instead
+            if np.abs(y).max() < 0.0001:
+                print(f"Warning: Librosa failed, trying soundfile")
+                import soundfile as sf
+
+                y, sr = sf.read(str(file), dtype="float32")
+                if y.ndim == 2:
+                    y = y.T  # soundfile returns (samples, channels)
+                else:
+                    y = y[None, :]  # make it (1, samples)
 
         elif isinstance(audiofile, (bytes, io.BytesIO)):
             file = (
@@ -90,13 +110,20 @@ class AudioPreprocessor:
             else:
                 raise ValueError(f"Unsupported audiofile type: {type(audiofile)}")
 
-            # …
-            if y.…
-                …
+            # Verify we actually loaded audio
+            if np.abs(y).max() < 0.0001:
+                raise RuntimeError(
+                    f"Audio file appears to be silent or corrupted: {audiofile}"
+                )
+
+            # Ensure consistent shape
+            if y.ndim == 1:
+                y = y[None, :]
             else:
-                y = y.T
+                y = y.T if y.shape[0] > y.shape[1] else y
 
             waveform = torch.from_numpy(y).float()
+
             return waveform, sr
 
         except Exception as e:
@@ -182,7 +209,11 @@ class AudioPreprocessor:
         waveform : tensor
             Normalized audio waveform.
         """
-        if method == "…
+        if method == "peak":
+            # Normalize to [-1, 1] based on max absolute value to preserve relative dynamics
+            peak = waveform.abs().max()
+            return waveform / max(peak, 1e-6)
+        elif method == "std":
             std = waveform.std()
             return waveform / max(std, 1e-6)
         elif method == "minmax":
@@ -202,7 +233,7 @@ class AudioPreprocessor:
             Base filename to use.
         """
         self.OUTPUT_PATH.mkdir(parents=True, exist_ok=True)
-        print(f"Saving {filename} to {self.OUTPUT_PATH}.")
+        # print(f"Saving {filename} to {self.OUTPUT_PATH}.")
 
         output_path = self.OUTPUT_PATH / f"{filename}"
@@ -233,7 +264,7 @@ class AudioPreprocessor:
 
         # Convert the audio into mono
         if waveform.shape[0] > 1:
-            print("Current audio is stereo. Converting to mono.")
+            # print("Current audio is stereo. Converting to mono.")
             waveform = waveform.mean(dim=0, keepdim=True)
 
         # If there is a skip value provided, trim it
@@ -245,7 +276,7 @@ class AudioPreprocessor:
         # Trim if more than 120 seconds, pad if less than
         waveform = self.pad_trim(waveform=waveform, random_crop=train)
 
-        # Normalize waveform (…
+        # Normalize waveform (used PEAK)
         waveform = self.normalize_waveform(waveform, method=self.WAVEFORM_NORM)
 
         # Add some gaussian noise to the waveform during training
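The commit flips the default from std to peak normalization. A standalone comparison of the two modes as implemented in normalize_waveform: peak scaling bounds the signal to [-1, 1], while std scaling gives unit variance but can exceed [-1, 1] on spiky audio.

# Standalone comparison of the two normalization modes above.
import torch

wave = torch.randn(1, 16000) * 0.3
peak_norm = wave / max(wave.abs().max(), 1e-6)  # the new "peak" default
std_norm = wave / max(wave.std(), 1e-6)         # the previous "std" behavior
print(peak_norm.abs().max().item(), std_norm.std().item())  # ~1.0 and ~1.0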
src/preprocessing/preprocessor.py
CHANGED
@@ -1,5 +1,6 @@
 import pandas as pd
 import numpy as np
+import math
 
 from src.preprocessing.audio_preprocessor import AudioPreprocessor
 from src.preprocessing.lyrics_preprocessor import LyricsPreprocessor
@@ -51,6 +52,43 @@ def bulk_preprocessing(batch: pd.DataFrame, batch_count: int):
     return audio_list, lyric_list
 
 
+def bulk_preprocessing_lyrics(batch: pd.DataFrame, batch_count: int):
+    """
+    Applies lyrics preprocessing to a training batch
+
+    Parameters
+    ----------
+    batch : pd.dataframe
+        Dataframe containing the batch data.
+
+    batch_count : int
+        Batch count value.
+
+    Returns
+    -------
+    lyric_list : list
+        List of loaded lyrics in string form.
+    """
+
+    lyric_preprocessor = LyricsPreprocessor()
+
+    lyric_list = []
+    count, batch_length = 1, len(batch)
+
+    print(f"Preprocessing training data with length {batch_length}\n")
+
+    for row in batch.itertuples():
+        print(f"Batch {batch_count} - {count}/{batch_length}")
+
+        # Preprocess lyric and append to lyric list
+        processed_lyric = lyric_preprocessor(lyrics=row.lyrics)
+        lyric_list.append(processed_lyric)
+
+        count += 1
+
+    return lyric_list
+
+
 def single_preprocessing(audio, lyric: str):
     """
     Preprocesses a single record of audio and lyric data
@@ -82,26 +120,46 @@ def single_preprocessing(audio, lyric: str):
     return processed_song, processed_lyric
 
 
-def dataset_read(batch_size…
+def dataset_read(batch_size=20):
     """
-    Reads the …
+    Reads the main dataset, splits it into the train/test/valid split, and computes
+    optimal number of samples per batch.
 
     Parameters
     ----------
-    …
+    batch_size : int
+        Number of data per batch
 
     Returns
     -------
-    …
-        List of …
+    split: list[splits]
+        A collection of the three splits
+
+    split_lengths : list[int]
+        List of the split lengths
     """
     dataset = pd.read_csv(DATASET_CSV)
-    label = dataset['target'].tolist()
 
-    …
-    …
+    train = dataset[dataset["split"] == "train"]
+    test = dataset[dataset["split"] == "test"]
+    val = dataset[dataset["split"] == "valid"]
+
+    # Find the minimum split size (ignoring empty splits)
+    min_split_size = min([len(train), len(test), len(val)])
+    # Clamp batch_size so it never exceeds the smallest split
+    effective_batch_size = min(batch_size, min_split_size if min_split_size > 0 else batch_size)
+
+    def make_splits(df, batch_size):
+        if len(df) == 0:
+            return []
+        n_splits = math.ceil(len(df) / batch_size)
+        return np.array_split(df, n_splits)
+
+    train_splits = make_splits(train, effective_batch_size)
+    test_splits = make_splits(test, effective_batch_size)
+    val_splits = make_splits(val, effective_batch_size)
+
+    splits = [train_splits, test_splits, val_splits]
+    split_lengths = [len(train), len(test), len(val)]
 
-    return …
+    return splits, split_lengths
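Note that np.array_split produces nearly equal chunks rather than chunks of exactly batch_size rows; each batch is at most batch_size. A quick demonstration of the same ceil-divide strategy on a toy frame:

# Demonstration of the batching math used in dataset_read above.
import math
import numpy as np
import pandas as pd

df = pd.DataFrame({"target": range(45)})
batch_size = 20
n_splits = math.ceil(len(df) / batch_size)  # 3 batches
batches = np.array_split(df, n_splits)
print([len(b) for b in batches])            # [15, 15, 15] -- roughly equal, each <= 20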
src/spectttra/spectttra.py
CHANGED
@@ -1,6 +1,9 @@
+import torch
 import torch.nn as nn
+from pathlib import Path
 from .transformer import Transformer
 from .tokenizer import STTokenizer
+from src.spectttra.feature import FeatureExtractor
 
 
 class SpecTTTra(nn.Module):
@@ -112,4 +115,99 @@ class SpecTTTra(nn.Module):
         # Transformer
         output = self.transformer(spectro_temporal_tokens)  # shape: (B, T/t + F/f, dim)
 
-        return output
+        return output
+
+
+def build_spectttra_from_cfg(cfg, device):
+    """
+    Constructs the SpecTTTra model and its associated FeatureExtractor from a given configuration.
+
+    Args:
+        cfg (SimpleNamespace): Configuration object containing model and feature extraction parameters. Expected attributes include:
+            - cfg.melspec.n_mels: Number of mel frequency bins.
+            - cfg.model: Model-specific parameters (e.g., embed_dim, t_clip, f_clip, etc.).
+        device (torch.device): The device on which the model and feature extractor will be allocated (e.g., 'cpu' or 'cuda').
+
+    Returns:
+        tuple:
+            FeatureExtractor: Initialized feature extraction module moved to the specified device.
+            SpecTTTra: Constructed SpecTTTra model moved to the specified device.
+    """
+
+    feat_ext = FeatureExtractor(cfg).to(device)
+
+    # The pre-trained model expects specific, fixed input dimensions.
+    # Hardcoded to ensure the model architecture matches the checkpoint weights exactly.
+    # The expected number of frames (n_frames) is taken directly from the RuntimeError message.
+    n_mels = cfg.melspec.n_mels  # n_mels should be 128
+    n_frames = 3744  # n_frames match the checkpoint's expectation
+
+    print(f"[INFO] Initializing SpecTTTra with fixed dimensions: n_mels={n_mels}, n_frames={n_frames}")
+
+    model_cfg = cfg.model
+    model = SpecTTTra(
+        input_spec_dim=n_mels,
+        input_temp_dim=n_frames,
+        embed_dim=model_cfg.embed_dim,
+        t_clip=model_cfg.t_clip,
+        f_clip=model_cfg.f_clip,
+        num_heads=model_cfg.num_heads,
+        num_layers=model_cfg.num_layers,
+        pre_norm=model_cfg.pre_norm,
+        pe_learnable=model_cfg.pe_learnable,
+        pos_drop_rate=model_cfg.pos_drop_rate,
+        attn_drop_rate=model_cfg.attn_drop_rate,
+        proj_drop_rate=model_cfg.proj_drop_rate,
+        mlp_ratio=model_cfg.mlp_ratio,
+    ).to(device)
+
+    return feat_ext, model
+
+
+def load_frozen_spectttra(model, ckpt_path, device):
+    """
+    Loads pretrained SpecTTTra weights from a frozen checkpoint file.
+
+    Args:
+        model (torch.nn.Module): An initialized SpecTTTra model instance to load weights into.
+        ckpt_path (str or Path): Path to the pretrained model checkpoint file (e.g., 'spectttra_frozen.pth').
+        device (torch.device): The device to map the loaded weights to (e.g., 'cpu' or 'cuda').
+
+    Returns:
+        model (torch.nn.Module): The SpecTTTra model with loaded pretrained weights, set to evaluation mode.
+
+    Raises:
+        FileNotFoundError: If the specified checkpoint file does not exist at `ckpt_path`.
+    """
+    ckpt_path = Path(ckpt_path)
+    if not ckpt_path.exists():
+        raise FileNotFoundError(
+            f"Pre-trained model not found at {ckpt_path}. "
+            "Please download 'pytorch_model.bin', rename to 'spectttra_frozen.pth', "
+            "and place it in the correct directory."
+        )
+
+    print(f"[INFO] Found SpecTTTra checkpoint at {ckpt_path}. Loading weights...")
+    state = torch.load(ckpt_path, map_location=device)
+
+    new_state_dict = {}
+    for k, v in state.items():
+        if k.startswith("encoder."):
+            new_key = k[len("encoder."):]
+            new_state_dict[new_key] = v
+        else:
+            new_state_dict[k] = v
+
+    # Now that the shapes match, this should load without a size mismatch error.
+    missing_keys, unexpected_keys = model.load_state_dict(new_state_dict, strict=False)
+    if missing_keys:
+        # Might see a few missing keys if your SpecTTTra class is slightly different, but the core should load.
+        print(f"[WARNING] Missing keys in model: {missing_keys}")
+    if unexpected_keys:
+        # Seeing 'classifier' or 'ft_extractor' keys here is NORMAL and SAFE.
+        print(f"[INFO] Unused keys in checkpoint: {unexpected_keys}")
+
+    print("[INFO] Successfully loaded pre-trained SpecTTTra weights.")
+
+    model.eval()
+    return model
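The key-renaming loop in load_frozen_spectttra exists because the published checkpoint nests the backbone under an "encoder." prefix while the standalone SpecTTTra module uses bare names. A self-contained illustration of that remapping (the key names here are hypothetical, for demonstration only):

# Standalone illustration of the prefix stripping performed in load_frozen_spectttra.
state = {"encoder.transformer.blocks.0.attn.weight": 1, "classifier.weight": 2}
new_state_dict = {}
for k, v in state.items():
    new_key = k[len("encoder."):] if k.startswith("encoder.") else k
    new_state_dict[new_key] = v
print(new_state_dict)  # {'transformer.blocks.0.attn.weight': 1, 'classifier.weight': 2}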
src/spectttra/spectttra_trainer.py
CHANGED
|
@@ -1,11 +1,10 @@
|
|
| 1 |
import threading
|
| 2 |
import torch
|
| 3 |
import numpy as np
|
| 4 |
-
from pathlib import Path
|
| 5 |
from types import SimpleNamespace
|
| 6 |
|
| 7 |
from src.spectttra.feature import FeatureExtractor
|
| 8 |
-
from src.spectttra.spectttra import SpecTTTra
|
| 9 |
|
| 10 |
# Shared variables for the model and setup, loaded only once and reused (cache)
|
| 11 |
_PREDICTOR_LOCK = threading.Lock()
|
|
@@ -17,54 +16,10 @@ _DEVICE = None
|
|
| 17 |
|
| 18 |
def build_spectttra(cfg, device):
|
| 19 |
"""
|
| 20 |
-
|
| 21 |
-
|
| 22 |
-
Args:
|
| 23 |
-
cfg (SimpleNamespace): Configuration containing audio, mel-spectrogram, and model parameters.
|
| 24 |
-
device (torch.device): Target device for model and feature extractor.
|
| 25 |
-
|
| 26 |
-
Returns:
|
| 27 |
-
tuple:
|
| 28 |
-
FeatureExtractor: Module for converting raw audio into mel-spectrogram features.
|
| 29 |
-
SpecTTTra: Spectro-temporal transformer model initialized with checkpoint weights.
|
| 30 |
"""
|
| 31 |
-
feat_ext =
|
| 32 |
-
|
| 33 |
-
# Build model once using placeholder input to infer mel and frame dimensions
|
| 34 |
-
with torch.no_grad():
|
| 35 |
-
dummy_wave = torch.zeros(1, cfg.audio.max_len, device=device)
|
| 36 |
-
dummy_mel = feat_ext(dummy_wave.float())
|
| 37 |
-
_, n_mels, n_frames = dummy_mel.shape
|
| 38 |
-
|
| 39 |
-
model_cfg = cfg.model
|
| 40 |
-
model = SpecTTTra(
|
| 41 |
-
input_spec_dim=n_mels,
|
| 42 |
-
input_temp_dim=n_frames,
|
| 43 |
-
embed_dim=model_cfg.embed_dim,
|
| 44 |
-
t_clip=model_cfg.t_clip,
|
| 45 |
-
f_clip=model_cfg.f_clip,
|
| 46 |
-
num_heads=model_cfg.num_heads,
|
| 47 |
-
num_layers=model_cfg.num_layers,
|
| 48 |
-
pre_norm=model_cfg.pre_norm,
|
| 49 |
-
pe_learnable=model_cfg.pe_learnable,
|
| 50 |
-
pos_drop_rate=model_cfg.pos_drop_rate,
|
| 51 |
-
attn_drop_rate=model_cfg.attn_drop_rate,
|
| 52 |
-
proj_drop_rate=model_cfg.proj_drop_rate,
|
| 53 |
-
mlp_ratio=model_cfg.mlp_ratio,
|
| 54 |
-
).to(device)
|
| 55 |
-
|
| 56 |
-
# Load frozen checkpoint if it exists; otherwise, save initial state
|
| 57 |
-
ckpt_path = Path("models/spectttra/spectttra_frozen.pth")
|
| 58 |
-
if ckpt_path.exists():
|
| 59 |
-
state = torch.load(ckpt_path, map_location=device)
|
| 60 |
-
model.load_state_dict(state)
|
| 61 |
-
print(f"[INFO] Loaded frozen SpecTTTra checkpoint from {ckpt_path}")
|
| 62 |
-
else:
|
| 63 |
-
ckpt_path.parent.mkdir(parents=True, exist_ok=True)
|
| 64 |
-
torch.save(model.state_dict(), ckpt_path)
|
| 65 |
-
print(f"[INFO] Saved frozen SpecTTTra checkpoint to {ckpt_path}")
|
| 66 |
-
|
| 67 |
-
model.eval()
|
| 68 |
return feat_ext, model
|
| 69 |
|
| 70 |
|
|
@@ -118,20 +73,14 @@ def _init_predictor_once():
|
|
| 118 |
)
|
| 119 |
|
| 120 |
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
|
| 121 |
-
|
| 122 |
feat_ext, model = build_spectttra(cfg, device)
|
| 123 |
-
|
| 124 |
feat_ext.to(device)
|
| 125 |
|
| 126 |
# Move model to device (GPU if available) and allow faster inference with mixed precision
|
| 127 |
-
model.to(device)
|
| 128 |
-
model.eval()
|
| 129 |
|
| 130 |
# Cache
|
| 131 |
-
_FEAT_EXT = feat_ext
|
| 132 |
-
_MODEL = model
|
| 133 |
-
_CFG = cfg
|
| 134 |
-
_DEVICE = device
|
| 135 |
|
| 136 |
|
| 137 |
def spectttra_predict(audio_tensor):
|
|
@@ -147,6 +96,7 @@ def spectttra_predict(audio_tensor):
|
|
| 147 |
1D embedding vector of shape (embed_dim,). The embedding is obtained
|
| 148 |
by mean-pooling the transformer token outputs.
|
| 149 |
"""
|
|
|
|
| 150 |
global _FEAT_EXT, _MODEL, _CFG, _DEVICE
|
| 151 |
|
| 152 |
_init_predictor_once()
|
|
@@ -161,18 +111,25 @@ def spectttra_predict(audio_tensor):
|
|
| 161 |
|
| 162 |
with torch.no_grad():
|
| 163 |
# Extract mel-spectrogram
|
| 164 |
-
melspec = feat_ext(waveform)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 165 |
|
| 166 |
if device.type == "cuda":
|
| 167 |
with torch.cuda.amp.autocast(enabled=True):
|
| 168 |
-
tokens = model(melspec)
|
| 169 |
-
pooled = tokens.mean(dim=1)
|
| 170 |
else:
|
| 171 |
tokens = model(melspec)
|
| 172 |
pooled = tokens.mean(dim=1)
|
| 173 |
|
| 174 |
-
|
| 175 |
-
out = pooled.squeeze(0).cpu().numpy() # (embed_dim,)
|
| 176 |
return out
|
| 177 |
|
| 178 |
|
|
@@ -203,19 +160,31 @@ def spectttra_train(audio_tensors):
|
|
| 203 |
model = _MODEL
|
| 204 |
device = _DEVICE
|
| 205 |
|
| 206 |
-
batch
|
| 207 |
-
|
| 208 |
-
|
| 209 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 210 |
|
| 211 |
-
|
| 212 |
-
|
| 213 |
-
tokens = model(melspec) # (B, num_tokens, embed_dim)
|
| 214 |
-
pooled = tokens.mean(dim=1) # (B, embed_dim)
|
| 215 |
-
else:
|
| 216 |
tokens = model(melspec)
|
| 217 |
pooled = tokens.mean(dim=1)
|
| 218 |
-
|
| 219 |
-
|
|
|
|
| 220 |
|
| 221 |
-
return
|
|
|
|
| 1 |
import threading
|
| 2 |
import torch
|
| 3 |
import numpy as np
|
|
|
|
| 4 |
from types import SimpleNamespace
|
| 5 |
|
| 6 |
from src.spectttra.feature import FeatureExtractor
|
| 7 |
+
from src.spectttra.spectttra import SpecTTTra, build_spectttra_from_cfg, load_frozen_spectttra
|
| 8 |
|
| 9 |
# Shared variables for the model and setup, loaded only once and reused (cache)
|
| 10 |
_PREDICTOR_LOCK = threading.Lock()
|
|
|
|
| 16 |
|
| 17 |
 def build_spectttra(cfg, device):
     """
+    Wrapper that builds SpecTTTra + FeatureExtractor and loads frozen checkpoint.
     """
+    feat_ext, model = build_spectttra_from_cfg(cfg, device)
+    model = load_frozen_spectttra(model, "models/spectttra/spectttra_frozen.pth", device)
     return feat_ext, model

     )

     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     feat_ext, model = build_spectttra(cfg, device)
     feat_ext.to(device)

     # Move model to device (GPU if available) and allow faster inference with mixed precision
+    model.to(device).eval()

     # Cache
+    _FEAT_EXT, _MODEL, _CFG, _DEVICE = feat_ext, model, cfg, device


 def spectttra_predict(audio_tensor):
         1D embedding vector of shape (embed_dim,). The embedding is obtained
         by mean-pooling the transformer token outputs.
     """
+
     global _FEAT_EXT, _MODEL, _CFG, _DEVICE

     _init_predictor_once()

     with torch.no_grad():
         # Extract mel-spectrogram
+        melspec = feat_ext(waveform)
+
+        # Ensure melspec shape matches model's expectation
+        expected_frames = model.input_temp_dim  # expected_frames is 3744
+        if melspec.shape[2] > expected_frames:
+            melspec = melspec[:, :, :expected_frames]
+        elif melspec.shape[2] < expected_frames:
+            padding = expected_frames - melspec.shape[2]
+            melspec = torch.nn.functional.pad(melspec, (0, padding))

         if device.type == "cuda":
             with torch.cuda.amp.autocast(enabled=True):
+                tokens = model(melspec)
+                pooled = tokens.mean(dim=1)
         else:
             tokens = model(melspec)
             pooled = tokens.mean(dim=1)

+    out = pooled.squeeze(0).cpu().numpy()  # (embed_dim,)
     return out

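For reference, a minimal call sketch of the new spectttra_predict path, assuming the frozen checkpoint and model config are already in place (the waveform length and 16 kHz rate below are illustrative; the pad-or-truncate step above normalizes the spectrogram to input_temp_dim frames either way):

import torch
from src.spectttra.spectttra_trainer import spectttra_predict

waveform = torch.randn(1, 16000 * 30)    # (1, num_samples): ~30 s of mono audio, rate assumed
embedding = spectttra_predict(waveform)  # numpy array of shape (embed_dim,)
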
     model = _MODEL
     device = _DEVICE

+    # Refactors the loop to be a much faster single-batch operation
+    try:
+        waveforms_batch = torch.cat(audio_tensors, dim=0).to(device).float()
+    except Exception as e:
+        print(f"[INFO] Error during tensor concatenation, falling back to loop. Fix preprocessing for speed. Error: {e}")
+        batch_list = [spectttra_predict(w) for w in audio_tensors]
+        return np.array(batch_list)
+
+    with torch.no_grad():
+        melspec = feat_ext(waveforms_batch)
+
+        # Ensure melspec shape matches model's expectation
+        expected_frames = model.input_temp_dim  # expected_frames is 3744
+        if melspec.shape[2] > expected_frames:
+            melspec = melspec[:, :, :expected_frames]
+        elif melspec.shape[2] < expected_frames:
+            padding = expected_frames - melspec.shape[2]
+            melspec = torch.nn.functional.pad(melspec, (0, padding))

+        if device.type == "cuda":
+            with torch.cuda.amp.autocast(enabled=True):
                 tokens = model(melspec)
                 pooled = tokens.mean(dim=1)
+        else:
+            tokens = model(melspec)
+            pooled = tokens.mean(dim=1)

+    return pooled.cpu().numpy()
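The pad-or-truncate block is now duplicated in spectttra_predict and spectttra_train; if one copy drifts, the two paths will disagree on input shape. A minimal sketch of how it could be factored out (the helper name fit_time_dim is illustrative, not part of this commit):

import torch
import torch.nn.functional as F

def fit_time_dim(melspec: torch.Tensor, expected_frames: int) -> torch.Tensor:
    # Truncate or right-pad the time axis (dim 2) to exactly expected_frames,
    # e.g. model.input_temp_dim (3744 in this configuration).
    if melspec.shape[2] > expected_frames:
        return melspec[:, :, :expected_frames]
    if melspec.shape[2] < expected_frames:
        return F.pad(melspec, (0, expected_frames - melspec.shape[2]))
    return melspec
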
src/utils/config_loader.py
CHANGED
@@ -9,7 +9,11 @@ BASE_DIR = Path(config["base_dir"]).resolve()

 # Resolve paths
 DATASET_NPZ = BASE_DIR / config["paths"]["dataset_npz"]
 DATASET_CSV = BASE_DIR / config["paths"]["dataset_csv"]
 RAW_DIR = BASE_DIR / config["paths"]["raw_dir"]
 PROCESSED_DIR = BASE_DIR / config["paths"]["processed_dir"]
-PCA_MODEL = BASE_DIR / config["paths"]["pca_path"]

 # Resolve paths
 DATASET_NPZ = BASE_DIR / config["paths"]["dataset_npz"]
+RAW_DATASET_NPZ = BASE_DIR / config["paths"]["raw_dataset_npz"]
 DATASET_CSV = BASE_DIR / config["paths"]["dataset_csv"]
 RAW_DIR = BASE_DIR / config["paths"]["raw_dir"]
 PROCESSED_DIR = BASE_DIR / config["paths"]["processed_dir"]
+PCA_MODEL = BASE_DIR / config["paths"]["pca_path"]
+AUDIO_SCALER = BASE_DIR / config["paths"]["audio_scaler"]
+LYRICS_SCALER = BASE_DIR / config["paths"]["lyrics_scaler"]
+PCA_SCALER = BASE_DIR / config["paths"]["pca_scaler"]
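With these constants in place, downstream code loads the fitted artifacts by configured path instead of hard-coding locations. A minimal usage sketch (assuming the scalers were serialized with joblib, as src/utils/dataset.py below does):

import joblib
from src.utils.config_loader import AUDIO_SCALER, LYRICS_SCALER

# Paths resolve relative to BASE_DIR from the data config.
audio_scaler = joblib.load(AUDIO_SCALER)
lyric_scaler = joblib.load(LYRICS_SCALER)
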
src/utils/dataset.py
CHANGED
@@ -1,45 +1,132 @@
-from sklearn.preprocessing import StandardScaler
 from sklearn.model_selection import train_test_split

 import joblib
 import numpy as np
 import logging

 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)


-def dataset_splitter(X: np.ndarray, Y: np.ndarray):
     """
-

     Parameters
     ----------
     X : np.array
-
     Y : np.array
-

     Returns
     -------
-    data : dict
-        A dictionary of np.arrays,
     """

     logger.info(f"Dataset shape: {X.shape}, Labels: {len(Y)}")
     logger.info(f"Class distribution: {np.bincount(Y)}")

-    #
     X_train, X_test, y_train, y_test = train_test_split(
-        X, Y, test_size=0.1, random_state=42, stratify=Y
     )
-
     X_train, X_val, y_train, y_val = train_test_split(
         X_train, y_train, test_size=0.2222, random_state=42, stratify=y_train
     )
-
     logger.info(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

     data = {
         "train": (X_train, y_train),
         "val": (X_val, y_val),

@@ -49,6 +136,92 @@ def dataset_splitter(X: np.ndarray, Y: np.ndarray):
     return data


 def dataset_scaler(audio: np.ndarray, lyrics: np.ndarray):
     """
     Method to scale both audio and lyric vectors using Z-Score.

@@ -68,7 +241,7 @@ def dataset_scaler(audio: np.ndarray, lyrics: np.ndarray):
     -------
     scaled_audio : np.array
         Array of scaled audio features
-
         Array of scaled lyric features
     """

@@ -76,14 +249,11 @@ def dataset_scaler(audio: np.ndarray, lyrics: np.ndarray):
     audio_scaler = StandardScaler().fit(audio)
     lyric_scaler = StandardScaler().fit(lyrics)

-    scaled_audio = audio_scaler.transform(audio)
-    scaled_lyrics = lyric_scaler.transform(lyrics)
-
     # Save the trained scalers for prediction
-    joblib.dump(audio_scaler,
-    joblib.dump(lyric_scaler,

-    return


 def instance_scaler(audio: np.ndarray, lyrics: np.ndarray):

@@ -101,15 +271,15 @@ def instance_scaler(audio: np.ndarray, lyrics: np.ndarray):
     -------
     scaled_audio : np.array
         Array of scaled audio feature
-
         Array of scaled lyric feature
     """

     # Apply scalers to the single inputs
-    audio_scaler = joblib.load(
-    lyric_scaler = joblib.load(

     scaled_audio = audio_scaler.transform([audio])
-

-    return scaled_audio,
+from sklearn.preprocessing import StandardScaler, MinMaxScaler
 from sklearn.model_selection import train_test_split
+from src.utils.config_loader import AUDIO_SCALER, LYRICS_SCALER, PCA_SCALER
+from sklearn.decomposition import IncrementalPCA
+from src.utils.config_loader import PCA_MODEL

 import joblib
 import numpy as np
 import logging
+import pandas as pd

 logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
 logger = logging.getLogger(__name__)


+def dataset_splitter(X: np.ndarray, Y: np.ndarray, ids: np.ndarray = None):
     """
+    Splits X, Y (and optional ids) into stratified train/val/test sets.

     Parameters
     ----------
     X : np.array
+        Feature vectors
     Y : np.array
+        Labels
+    ids : np.array, optional
+        Identifiers (filenames or row indices)

     Returns
     -------
+    data : dict
+        A dictionary of np.arrays: {train, val, test}
+        Each value is a tuple (X_split, y_split)
     """

     logger.info(f"Dataset shape: {X.shape}, Labels: {len(Y)}")
     logger.info(f"Class distribution: {np.bincount(Y)}")

+    # First split: train vs test (ids, when provided, are split alongside X and Y)
+    if ids is not None:
+        X_train, X_test, y_train, y_test, ids_train, ids_test = train_test_split(
+            X, Y, ids, test_size=0.1, random_state=42, stratify=Y
+        )
+    else:
+        X_train, X_test, y_train, y_test = train_test_split(
+            X, Y, test_size=0.1, random_state=42, stratify=Y
+        )
+
+    # Second split: train vs val
     X_train, X_val, y_train, y_val = train_test_split(
         X_train, y_train, test_size=0.2222, random_state=42, stratify=y_train
     )
+
     logger.info(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

+    data = {
+        "train": (X_train, y_train),
+        "val": (X_val, y_val),
+        "test": (X_test, y_test),
+    }
+
+    return data

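The two-stage split gives roughly 70/20/10 train/val/test: the second split takes 0.2222 of the remaining 90%, i.e. about 20% of the full set. A quick sanity-check sketch (random data, illustrative shapes only):

import numpy as np
from src.utils.dataset import dataset_splitter

X = np.random.rand(1000, 8)
Y = np.random.randint(0, 4, size=1000)

splits = dataset_splitter(X, Y)
for name, (X_s, y_s) in splits.items():
    print(name, X_s.shape)  # train (700, 8), val (200, 8), test (100, 8)
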
+def scale_pca(data : dict):
+    """
+    Script that scales the splits and applies PCA to the lyrics vector.
+
+    Parameters
+    ----------
+    data : dictionary
+        Dictionary containing the splits
+
+    Returns
+    -------
+    data : dict{np.array}
+        A dictionary of np.arrays, containing the train/test/val split.
+    """
+
+    # Destructure the dictionary to get the data splits
+    X_train, y_train = data["train"]
+    X_val, y_val = data["val"]
+    X_test, y_test = data["test"]
+
+    # Segment the concatenated embedding into audio and lyrics
+    X_train_audio, X_train_lyrics = X_train[:, :384], X_train[:, 384:]
+    X_test_audio, X_test_lyrics = X_test[:, :384], X_test[:, 384:]
+    X_val_audio, X_val_lyrics = X_val[:, :384], X_val[:, 384:]
+
+    # Fit the scalers on the train data; reuse them to transform test and validation
+    audio_scaler, lyric_scaler = dataset_scaler(X_train_audio, X_train_lyrics)
+
+    # Transform the rest of the splits using the scalers
+    X_train_audio = audio_scaler.transform(X_train_audio)
+    X_test_audio = audio_scaler.transform(X_test_audio)
+    X_val_audio = audio_scaler.transform(X_val_audio)
+
+    X_train_lyrics = lyric_scaler.transform(X_train_lyrics)
+    X_test_lyrics = lyric_scaler.transform(X_test_lyrics)
+    X_val_lyrics = lyric_scaler.transform(X_val_lyrics)
+
+    # Fit PCA on TRAINING lyrics only
+    ipca = IncrementalPCA(n_components=512)
+    batch_size = 1000
+
+    for i in range(0, X_train_lyrics.shape[0], batch_size):
+        ipca.partial_fit(X_train_lyrics[i:i + batch_size])
+
+    # Transform in batches
+    X_train_lyrics = ipca.transform(X_train_lyrics)
+    X_test_lyrics = ipca.transform(X_test_lyrics)
+    X_val_lyrics = ipca.transform(X_val_lyrics)
+
+    # Apply a scaler to the PCA output
+    pca_lyric_scaler = StandardScaler().fit(X_train_lyrics)
+
+    X_train_lyrics = pca_lyric_scaler.transform(X_train_lyrics)
+    X_test_lyrics = pca_lyric_scaler.transform(X_test_lyrics)
+    X_val_lyrics = pca_lyric_scaler.transform(X_val_lyrics)
+
+    # Concatenate them back to their original form, but scaled
+    X_train = np.concatenate([X_train_audio, X_train_lyrics], axis=1)
+    X_test = np.concatenate([X_test_audio, X_test_lyrics], axis=1)
+    X_val = np.concatenate([X_val_audio, X_val_lyrics], axis=1)
+
+    joblib.dump(ipca, PCA_MODEL)
+    # Save the trained scalers for prediction
+    joblib.dump(pca_lyric_scaler, PCA_SCALER)
+
     data = {
         "train": (X_train, y_train),
         "val": (X_val, y_val),
     return data


+def scale_pca_lyrics(data : dict):
+    """
+    Script that scales the splits and applies PCA to the lyrics vector.
+
+    Parameters
+    ----------
+    data : dictionary
+        Dictionary containing the splits
+
+    Returns
+    -------
+    data : dict{np.array}
+        A dictionary of np.arrays, containing the train/test/val split.
+    """
+
+    # Destructure the dictionary to get the data splits
+    X_train, y_train = data["train"]
+    X_val, y_val = data["val"]
+    X_test, y_test = data["test"]
+
+    lyric_scaler = StandardScaler().fit(X_train)
+    joblib.dump(lyric_scaler, LYRICS_SCALER)
+
+    X_train = lyric_scaler.transform(X_train)
+    X_test = lyric_scaler.transform(X_test)
+    X_val = lyric_scaler.transform(X_val)
+
+    # Fit PCA on TRAINING lyrics only
+    ipca = IncrementalPCA(n_components=512)
+    batch_size = 1000
+
+    for i in range(0, X_train.shape[0], batch_size):
+        ipca.partial_fit(X_train[i:i + batch_size])
+
+    # Transform in batches
+    X_train = ipca.transform(X_train)
+    X_test = ipca.transform(X_test)
+    X_val = ipca.transform(X_val)
+
+    joblib.dump(ipca, PCA_MODEL)
+
+    data = {
+        "train": (X_train, y_train),
+        "val": (X_val, y_val),
+        "test": (X_test, y_test),
+    }
+
+    return data


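At prediction time the lyrics branch has to replay the same chain in the same order: lyric scaler, then the saved IncrementalPCA, then (for the scale_pca path) the PCA-output scaler. A minimal sketch of that inverse path (the 1024-dim input is illustrative):

import joblib
import numpy as np
from src.utils.config_loader import LYRICS_SCALER, PCA_MODEL, PCA_SCALER

lyrics_vec = np.random.rand(1, 1024)                       # placeholder lyrics embedding
lyrics_vec = joblib.load(LYRICS_SCALER).transform(lyrics_vec)
lyrics_vec = joblib.load(PCA_MODEL).transform(lyrics_vec)  # -> (1, 512)
lyrics_vec = joblib.load(PCA_SCALER).transform(lyrics_vec)
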
+def scale(data : dict):
+    """
+    Script that scales the splits using Z-Score; no PCA is applied here.
+
+    Parameters
+    ----------
+    data : dictionary
+        Dictionary containing the splits
+
+    Returns
+    -------
+    data : dict{np.array}
+        A dictionary of np.arrays, containing the train/test/val split.
+    """
+
+    # Destructure the dictionary to get the data splits
+    X_train, y_train = data["train"]
+    X_val, y_val = data["val"]
+    X_test, y_test = data["test"]
+
+    audio_scaler = StandardScaler(with_mean=False).fit(X_train)
+    joblib.dump(audio_scaler, AUDIO_SCALER)
+
+    # Transform the rest of the splits using the scaler
+    X_train = audio_scaler.transform(X_train)
+    X_test = audio_scaler.transform(X_test)
+    X_val = audio_scaler.transform(X_val)
+
+    data = {
+        "train": (X_train, y_train),
+        "val": (X_val, y_val),
+        "test": (X_test, y_test),
+    }
+
+    return data
+
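A note on scale: StandardScaler(with_mean=False) divides each feature by its standard deviation without centering it, so entries that are exactly zero stay zero; the commit does not state the motivation, but that property is what the flag buys. A quick check:

import numpy as np
from sklearn.preprocessing import StandardScaler

X = np.array([[0.0, 2.0], [0.0, 4.0], [3.0, 6.0]])
print(StandardScaler(with_mean=False).fit_transform(X))  # zeros in column 0 remain zero
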
 def dataset_scaler(audio: np.ndarray, lyrics: np.ndarray):
     """
     Method to scale both audio and lyric vectors using Z-Score.

     -------
     scaled_audio : np.array
         Array of scaled audio features
+    scaled_lyrics : np.array
         Array of scaled lyric features
     """

     audio_scaler = StandardScaler().fit(audio)
     lyric_scaler = StandardScaler().fit(lyrics)

     # Save the trained scalers for prediction
+    joblib.dump(audio_scaler, AUDIO_SCALER)
+    joblib.dump(lyric_scaler, LYRICS_SCALER)

+    return audio_scaler, lyric_scaler


 def instance_scaler(audio: np.ndarray, lyrics: np.ndarray):

     -------
     scaled_audio : np.array
         Array of scaled audio feature
+    scaled_lyric : np.array
         Array of scaled lyric feature
     """

     # Apply scalers to the single inputs
+    audio_scaler = joblib.load(AUDIO_SCALER)
+    lyric_scaler = joblib.load(LYRICS_SCALER)

     scaled_audio = audio_scaler.transform([audio])
+    scaled_lyric = lyric_scaler.transform(lyrics)

+    return scaled_audio, scaled_lyric
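Taken together, the intended training-time order is: split first, then fit every scaler and the IncrementalPCA on the training split only, so that no test or validation statistics leak into the preprocessing. A condensed end-to-end sketch (dimensions and sample counts illustrative; the 384/1024 split mirrors the audio/lyrics segmentation in scale_pca, and the fitted artifacts are written to the paths configured in data_config.yml):

import numpy as np
from src.utils.dataset import dataset_splitter, scale_pca

X = np.random.rand(10000, 384 + 1024)   # concatenated audio + lyrics embeddings
Y = np.random.randint(0, 4, size=10000)

splits = dataset_splitter(X, Y)          # stratified ~70/20/10
splits = scale_pca(splits)               # scalers + PCA fit on the train split only
X_train, y_train = splits["train"]       # X_train is now (7000, 896): 384 audio + 512 PCA lyrics dims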