Spaces:
Running
Running
| from src.preprocessing.preprocessor import dataset_read, bulk_preprocessing | |
| from src.spectttra.spectttra_trainer import spectttra_train | |
| from src.llm2vectrain.model import load_llm2vec_model | |
| from src.llm2vectrain.llm2vec_trainer import l2vec_train | |
| from src.models.mlp import build_mlp, load_config | |
| from src.utils.config_loader import DATASET_NPZ, PCA_MODEL | |
| from src.utils.dataset import dataset_scaler, dataset_splitter | |
| from sklearn.decomposition import PCA | |
| from pathlib import Path | |
| import numpy as np | |
| import logging | |
| import joblib | |
# Configure root logging once for the script: INFO level, timestamped records.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)

# Module-level logger shared by the functions in this file.
logger = logging.getLogger(__name__)
def train_mlp_model(data: dict):
    """
    Fit the MLP classifier on pre-extracted features and report test accuracy.

    The function loads the MLP configuration, builds the classifier, trains it
    on the train/val splits, tries to reload the best checkpoint, evaluates on
    the test split, and finally saves the model.

    Parameters
    ----------
    data : dict
        Mapping with the keys "train", "val" and "test". Each value is
        unpacked as a ``(features, labels)`` pair; the training features must
        expose ``shape[1]`` (used as the MLP input dimension).

    Returns
    -------
    The classifier object created by ``build_mlp``, after training,
    evaluation and saving.
    """
    # Checkpoint locations used below.
    best_checkpoint = "models/mlp/mlp_best.pth"
    final_checkpoint = "models/mlp/mlp_multimodal.pth"

    logger.info("Starting MLP training...")

    # Hyperparameters / architecture settings for the MLP.
    mlp_settings = load_config("config/model_config.yml")

    # Unpack the three splits.
    train_features, train_labels = data["train"]
    val_features, val_labels = data["val"]
    test_features, test_labels = data["test"]

    # The input width of the network is the number of feature columns.
    n_features = train_features.shape[1]
    classifier = build_mlp(input_dim=n_features, config=mlp_settings)
    classifier.get_model_summary()

    # Keep a handle on the returned training history (not used further here).
    training_history = classifier.train(
        train_features, train_labels, val_features, val_labels
    )

    # Prefer the best checkpoint (presumably written during training) for the
    # final evaluation; fall back to the in-memory weights if it is missing.
    try:
        classifier.load_model(best_checkpoint)
    except FileNotFoundError:
        logger.warning("Best model not found, using current model")
    else:
        logger.info("Loaded best model for final evaluation")

    # Evaluate on the held-out split, then persist the final model.
    metrics = classifier.evaluate(test_features, test_labels)
    classifier.save_model(final_checkpoint)

    logger.info("MLP training completed successfully!")
    test_accuracy = metrics["test_accuracy"]
    logger.info(f"Final test accuracy: {test_accuracy:.2f}%")

    return classifier
def train_pipeline():
    """
    Run preprocessing, feature extraction, and MLP training end to end.

    If the cached dataset file ``DATASET_NPZ`` exists, its ``X`` and ``Y``
    arrays are loaded from it. Otherwise the raw dataset is read in batches,
    audio features (SpecTTTra) and lyric features (LLM2Vec) are extracted per
    batch, both feature sets are scaled, the lyric features are reduced with
    PCA, and the concatenated result is saved to ``DATASET_NPZ`` together with
    the labels. The fitted PCA model is saved to ``models/fusion/pca.pkl``.

    In both cases the data is then split and passed to ``train_mlp_model``.

    Parameters
    ----------
    None

    Returns
    -------
    None
    """
    dataset_path = Path(DATASET_NPZ)

    if dataset_path.exists():
        logger.info("Training dataset already exists. Loading file...")
        # np.load returns an NpzFile holding an open archive; read both
        # arrays inside the context manager so the file handle is released.
        with np.load(DATASET_NPZ) as loaded_data:
            X = loaded_data["X"]
            Y = loaded_data["Y"]
    else:
        logger.info("Training dataset does not exist. Processing data...")

        # Get batches from dataset and return full Y labels
        batches, Y = dataset_read(batch_size=500)

        # Instantiate the LLM2Vec model once and reuse it for every batch
        llm2vec_model = load_llm2vec_model()

        # Preallocate one row per label for both audio and lyric vectors to
        # reduce memory overhead
        audio_vectors = np.zeros((len(Y), 384), dtype=np.float32)
        lyric_vectors = np.zeros((len(Y), 4096), dtype=np.float32)

        # Row offset of the next free slot in the preallocated arrays
        start_idx = 0

        for batch_count, batch in enumerate(batches, start=1):
            logger.info(f"Bulk Preprocessing - Batch {batch_count}.")
            audio, lyrics = bulk_preprocessing(batch, batch_count)

            # Call the train methods for both SpecTTTra and LLM2Vec
            logger.info("Starting SpecTTTra feature extraction...")
            audio_features = spectttra_train(audio)

            logger.info("Starting LLM2Vec feature extraction...")
            lyrics_features = l2vec_train(llm2vec_model, lyrics)

            batch_size = audio_features.shape[0]
            end_idx = start_idx + batch_size

            # Store the results on preallocated spaces
            audio_vectors[start_idx:end_idx, :] = audio_features
            lyric_vectors[start_idx:end_idx, :] = lyrics_features

            # Advance the write offset; without this every batch would
            # overwrite the rows of the first batch.
            start_idx = end_idx

            # Delete stored instance for next batch to remove overhead
            del audio, lyrics, audio_features, lyrics_features

        # Rows left unfilled stay all-zero and would no longer line up with Y.
        if start_idx != len(Y):
            logger.warning(
                f"Extracted {start_idx} feature rows but dataset has {len(Y)} labels."
            )

        # Run standard scaling on audio and lyrics separately
        logger.info("Running standard scaling for audio and lyrics...")
        audio_vectors, lyric_vectors = dataset_scaler(audio_vectors, lyric_vectors)

        # Start training the PCA to the collected lyrics features
        logger.info("PCA Training on lyric vectors...")
        pca = PCA(n_components=256, svd_solver="randomized", random_state=42)
        lyric_vectors = pca.fit_transform(lyric_vectors)

        # Save the trained PCA model, creating the target directory if needed.
        # NOTE(review): PCA_MODEL is imported from config_loader but this
        # literal path is used instead - confirm whether they should match.
        pca_path = "models/fusion/pca.pkl"
        Path(pca_path).parent.mkdir(parents=True, exist_ok=True)
        joblib.dump(pca, pca_path)

        # Concatenate audio features and reduced lyrics features
        X = np.concatenate([audio_vectors, lyric_vectors], axis=1)
        logger.info(f"Audio and Lyrics Concatenated. Final features shape: {X.shape}")

        # Convert label list into np.array
        Y = np.array(Y)

        # Save both X and Y to an .npz file for easier loading
        logger.info("Saving dataset for future testing...")
        dataset_path.parent.mkdir(parents=True, exist_ok=True)
        np.savez(DATASET_NPZ, X=X, Y=Y)

    # Do data splitting
    data = dataset_splitter(X, Y)

    logger.info("Starting MLP training...")
    train_mlp_model(data)
# Run the full training pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    train_pipeline()