# bach-or-bot / scripts/train.py
# krislette's picture
# Auto-deploy from GitHub: 5ac21603a8274a2350875ec7db1bd58cbf2ee539
# 75d43d2
from src.preprocessing.preprocessor import dataset_read, bulk_preprocessing
from src.spectttra.spectttra_trainer import spectttra_train
from src.llm2vectrain.model import load_llm2vec_model
from src.llm2vectrain.llm2vec_trainer import l2vec_train
from src.models.mlp import build_mlp, load_config
from src.utils.config_loader import DATASET_NPZ
from pathlib import Path
from src.utils.config_loader import DATASET_NPZ, RAW_DATASET_NPZ
from src.utils.dataset import scale_pca
import numpy as np
import logging
# Configure root logging once for the script: INFO and above, with a
# timestamp, the level name and the message on every record.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

# Module-level logger shared by the functions below.
logger = logging.getLogger(__name__)
def train_mlp_model(data: dict):
    """
    Train the MLP model with extracted features.

    The model is built from ``config/model_config.yml``, trained on the
    train/val split, evaluated on the test split and saved to
    ``models/mlp/mlp_multimodal.pth``.

    Parameters
    ----------
    data : dict{np.array}
        A dictionary of np.arrays, containing the train/test/val split.
        The keys ``"train"``, ``"val"`` and ``"test"`` each map to an
        ``(X, y)`` pair; ``X_train.shape[1]`` is used as the input dimension.

    Returns
    -------
    The classifier object returned by ``build_mlp``, after training,
    evaluation and saving.
    """
    logger.info("Starting MLP training...")

    # Load MLP configuration
    config = load_config("config/model_config.yml")

    # Destructure the dictionary to get data split
    X_train, y_train = data["train"]
    X_val, y_val = data["val"]
    X_test, y_test = data["test"]

    # Build the MLP; the input width is the number of feature columns
    mlp_classifier = build_mlp(input_dim=X_train.shape[1], config=config)

    # Show model summary
    mlp_classifier.get_model_summary()

    # Train the model (the returned history is not needed here)
    mlp_classifier.train(X_train, y_train, X_val, y_val)

    # Load the best checkpoint for the final evaluation. Falling back to the
    # in-memory model is deliberate when no checkpoint file exists.
    # NOTE(review): presumably train() writes mlp_best.pth -- confirm.
    try:
        mlp_classifier.load_model("models/mlp/mlp_best.pth")
    except FileNotFoundError:
        logger.warning("Best model not found, using current model")
    else:
        logger.info("Loaded best model for final evaluation")

    # Final evaluation
    test_results = mlp_classifier.evaluate(X_test, y_test)

    # Save final model
    mlp_classifier.save_model("models/mlp/mlp_multimodal.pth")

    logger.info("MLP training completed successfully!")
    logger.info(f"Final test accuracy: {test_results['test_accuracy']:.2f}%")
    return mlp_classifier
def _load_raw_splits(npz_path):
    """Read the cached raw train/test/val arrays from ``npz_path`` into a split dict."""
    # Use the archive as a context manager so its file handle is closed as
    # soon as the arrays have been read.
    with np.load(npz_path) as loaded_data:
        return {
            "train": (loaded_data["X_train"], loaded_data["y_train"]),
            "test": (loaded_data["X_test"], loaded_data["y_test"]),
            "val": (loaded_data["X_val"], loaded_data["y_val"]),
        }


def _extract_raw_splits(batch_size, feature_size):
    """Run preprocessing and feature extraction per batch and return the raw split dict."""
    # Get the batched splits and the number of samples in each split
    splits, split_lengths = dataset_read(batch_size=batch_size)
    batch_count = 1  # running batch number across all splits

    # Instantiate LLM2Vec Model once and reuse it for every batch
    l2v = load_llm2vec_model()

    # Preallocate arrays; split_lengths[0], [1], [2] size train, test, val
    X_train = np.zeros((split_lengths[0], feature_size), dtype=np.float32)
    X_test = np.zeros((split_lengths[1], feature_size), dtype=np.float32)
    X_val = np.zeros((split_lengths[2], feature_size), dtype=np.float32)

    y_train = np.zeros(split_lengths[0], dtype=np.int32)
    y_test = np.zeros(split_lengths[1], dtype=np.int32)
    y_val = np.zeros(split_lengths[2], dtype=np.int32)

    X_splits = [X_train, X_test, X_val]
    y_splits = [y_train, y_test, y_val]

    # Loop through the three splits
    for split_idx, split in enumerate(splits):
        start_idx = 0  # write offset into this split's preallocated arrays

        # Loop through batches for each split
        for batch in split:
            if len(batch) == 0:
                continue  # skip empty batch safely

            logger.info(f"Bulk Preprocessing batch {batch_count}...")
            audio, lyrics = bulk_preprocessing(batch, batch_count)
            batch_labels = batch['target'].values

            # Extract audio features
            logger.info("Starting SpecTTTra feature extraction...")
            audio_features = spectttra_train(audio)

            # Call the train method for LLM2Vec
            logger.info("\nStarting LLM2Vec feature extraction...")
            lyric_features = l2vec_train(l2v, lyrics)

            # Concatenate the two features column-wise (audio first, then lyrics)
            batch_feature = np.concatenate([audio_features, lyric_features], axis=1)

            # Copy the batch into its slot of the preallocated blocks
            bsz = batch_feature.shape[0]
            X_splits[split_idx][start_idx:start_idx + bsz, :] = batch_feature
            y_splits[split_idx][start_idx:start_idx + bsz] = batch_labels

            logger.info(f"Batch {batch_count}: {bsz} samples, start_idx={start_idx}")
            batch_count += 1
            start_idx += bsz

    return {
        "train": (X_train, y_train),
        "val": (X_val, y_val),
        "test": (X_test, y_test),
    }


def train_pipeline():
    """
    Training script which includes preprocessing, feature extraction, and training the MLP model.

    If the raw dataset file (``RAW_DATASET_NPZ``) already exists it is loaded;
    otherwise features are extracted batch by batch and the raw dataset is
    saved there. The data is then passed through ``scale_pca``, saved to
    ``DATASET_NPZ`` in .npz format, and used to train the MLP.

    Parameters
    ----------
    None

    Returns
    -------
    None
    """
    # Set constant sizes
    BATCH_SIZE = 200
    AUDIO_SIZE = 384
    LYRIC_SIZE = 2048

    dataset_path = Path(RAW_DATASET_NPZ)

    if dataset_path.exists():
        logger.info("Training dataset already exists. Loading file...")
        data = _load_raw_splits(RAW_DATASET_NPZ)
    else:
        logger.info("Training dataset does not exist. Processing data...")
        data = _extract_raw_splits(BATCH_SIZE, AUDIO_SIZE + LYRIC_SIZE)

        # Save raw (unscaled) dataset so later runs can skip feature extraction
        X_train, y_train = data["train"]
        X_val, y_val = data["val"]
        X_test, y_test = data["test"]

        logger.info("Saving raw dataset...")
        np.savez(
            RAW_DATASET_NPZ,
            X_train=X_train, y_train=y_train,
            X_val=X_val, y_val=y_val,
            X_test=X_test, y_test=y_test,
        )

        logger.info("Running standard scaling...")

    # Scale and use PCA fitting for all raw data
    # NOTE(review): applied to both the loaded and the freshly extracted
    # data -- confirm this matches the intended flow.
    logger.info("Scaling and applying PCA...")
    data = scale_pca(data)

    # Save scaled dataset
    X_train, y_train = data["train"]
    X_val, y_val = data["val"]
    X_test, y_test = data["test"]

    logger.info("Saving scaled dataset...")
    np.savez(
        DATASET_NPZ,
        X_train=X_train, y_train=y_train,
        X_val=X_val, y_val=y_val,
        X_test=X_test, y_test=y_test,
    )

    logger.info("Starting MLP training...")
    train_mlp_model(data)
# Run the full training pipeline only when this file is executed as a script,
# not when it is imported.
if __name__ == "__main__":
    train_pipeline()