Spaces:

krislette
/

bach-or-bot

Running

File size: 6,737 Bytes

from src.preprocessing.preprocessor import dataset_read, bulk_preprocessing
from src.spectttra.spectttra_trainer import spectttra_train
from src.llm2vectrain.model import load_llm2vec_model
from src.llm2vectrain.llm2vec_trainer import l2vec_train
from src.models.mlp import build_mlp, load_config

from src.utils.config_loader import DATASET_NPZ

from pathlib import Path
from src.utils.config_loader import DATASET_NPZ, RAW_DATASET_NPZ
from src.utils.dataset import scale_pca

import numpy as np
import logging

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)


def train_mlp_model(data : dict):
    """
    Train the MLP model with extracted features.
    
    Parameters
    ----------
        data : dict{np.array}
            A dictionary of np.arrays, containing the train/test/val split.
    Parameters
    ----------
        data : dict{np.array}
            A dictionary of np.arrays, containing the train/test/val split.
    """
    logger.info("Starting MLP training...")
    
    # Load MLP configuration
    config = load_config("config/model_config.yml")

    # Destructure the dictionary to get data split
    X_train, y_train = data["train"]
    X_val, y_val     = data["val"]
    X_test, y_test   = data["test"]

    # Destructure the dictionary to get data split
    X_train, y_train = data["train"]
    X_val, y_val     = data["val"]
    X_test, y_test   = data["test"]
    
    # Build and train MLP
    mlp_classifier = build_mlp(input_dim=X_train.shape[1], config=config)
    
    # Show model summary
    mlp_classifier.get_model_summary()
    
    # Train the model
    history = mlp_classifier.train(X_train, y_train, X_val, y_val)
    
    # Load best model and evaluate on test set
    try:
        mlp_classifier.load_model("models/mlp/mlp_best.pth")
        mlp_classifier.load_model("models/mlp/mlp_best.pth")
        logger.info("Loaded best model for final evaluation")
    except FileNotFoundError:
        logger.warning("Best model not found, using current model")
    
    # Final evaluation
    test_results = mlp_classifier.evaluate(X_test, y_test)


    # Save final model
    mlp_classifier.save_model("models/mlp/mlp_multimodal.pth")
    mlp_classifier.save_model("models/mlp/mlp_multimodal.pth")
    
    logger.info("MLP training completed successfully!")
    logger.info(f"Final test accuracy: {test_results['test_accuracy']:.2f}%")
    
    return mlp_classifier



def train_pipeline():
    """
    Training script which includes preprocessing, feature extraction, and training the MLP model.

    The train pipeline saves the train dataset in an .npz format.

    Parameters
    ----------
    None

    Returns
    -------
    None
    """

    # Set constant sizes
    BATCH_SIZE = 200
    AUDIO_SIZE = 384
    LYRIC_SIZE = 2048

    dataset_path = Path(RAW_DATASET_NPZ)

    if dataset_path.exists():
        logger.info("Training dataset already exists. Loading file...")

        loaded_data = np.load(RAW_DATASET_NPZ)
        data = {
            "train": (loaded_data["X_train"], loaded_data["y_train"]),
            "test":  (loaded_data["X_test"], loaded_data["y_test"]),
            "val":   (loaded_data["X_val"], loaded_data["y_val"]),
        }
    else:
        logger.info("Training dataset does not exist. Processing data...")
        logger.info("Training dataset does not exist. Processing data...")
        # Get batches from dataset and return full Y labels
        splits, split_lengths = dataset_read(batch_size=BATCH_SIZE)
        batch_count = 1

        # Instantiate LLM2Vec Model
        l2v = load_llm2vec_model()

        # Preallocate arrays
        X_train = np.zeros((split_lengths[0], AUDIO_SIZE + LYRIC_SIZE), dtype=np.float32)
        X_test  = np.zeros((split_lengths[1], AUDIO_SIZE + LYRIC_SIZE), dtype=np.float32)
        X_val   = np.zeros((split_lengths[2], AUDIO_SIZE + LYRIC_SIZE), dtype=np.float32)

        y_train = np.zeros(split_lengths[0], dtype=np.int32)
        y_test  = np.zeros(split_lengths[1], dtype=np.int32)
        y_val   = np.zeros(split_lengths[2], dtype=np.int32)

        X_splits = [X_train, X_test, X_val]
        y_splits = [y_train, y_test, y_val]

        # Loop through the three splits
        for split_idx, split in enumerate(splits):
            start_idx = 0

            # Loop through batches for each split
            for batch in split:
                if len(batch) == 0:
                    continue  # skip empty batch safely
            
                logger.info(f"Bulk Preprocessing batch {batch_count}...")
                audio, lyrics = bulk_preprocessing(batch, batch_count)
                batch_labels = batch['target'].values

                # Extract audio features
                logger.info("Starting SpecTTTra feature extraction...")
                audio_features = spectttra_train(audio)

                # Call the train method for LLM2Vec
                logger.info(f"\nStarting LLM2Vec feature extraction...")
                lyric_features = l2vec_train(l2v, lyrics)

                # Concatenate the two features
                batch_feature = np.concatenate([audio_features, lyric_features], axis=1)

                # Allocate them to the preallocated blocks
                bsz = batch_feature.shape[0]
                X_splits[split_idx][start_idx:start_idx + bsz, :] = batch_feature
                y_splits[split_idx][start_idx:start_idx + bsz] = batch_labels

                logger.info(f"Batch {batch_count}: {bsz} samples, start_idx={start_idx}")

                batch_count += 1
                start_idx += bsz

        # Save raw (unscaled) dataset
        logger.info("Saving raw dataset...")
        np.savez(
            RAW_DATASET_NPZ,
            X_train=X_train, y_train=y_train,
            X_val=X_val,     y_val=y_val,
            X_test=X_test,   y_test=y_test,
        )

        # Run scaling
        logger.info("Running standard scaling...")
        data = {
            "train": (X_train, y_train),
            "val":   (X_val, y_val),
            "test":  (X_test, y_test),
        }

    # Scale and use PCA fitting for all raw data
    logger.info("Scaling and applying PCA...")
    data = scale_pca(data)

    # Save scaled dataset
    X_train, y_train = data["train"]
    X_val, y_val     = data["val"]
    X_test, y_test   = data["test"]

    logger.info("Saving scaled dataset...")
    np.savez(
        DATASET_NPZ,
        X_train=X_train, y_train=y_train,
        X_val=X_val,     y_val=y_val,
        X_test=X_test,   y_test=y_test,
    )

    logger.info("Starting MLP training...")
    train_mlp_model(data)

if __name__ == "__main__":
    train_pipeline()