Spaces:

krislette
/

bach-or-bot

Running

Acelle Krislette Rosales

Initial commit: Added application code

fc7b4a9 2 months ago

5.35 kB

	from src.preprocessing.preprocessor import dataset_read, bulk_preprocessing
	from src.spectttra.spectttra_trainer import spectttra_train
	from src.llm2vectrain.model import load_llm2vec_model
	from src.llm2vectrain.llm2vec_trainer import l2vec_train
	from src.models.mlp import build_mlp, load_config

	from src.utils.config_loader import DATASET_NPZ, PCA_MODEL
	from src.utils.dataset import dataset_scaler, dataset_splitter
	from sklearn.decomposition import PCA

	from pathlib import Path
	import numpy as np
	import logging
	import joblib

	# Configure root logging once at import: INFO and above, each record
	# stamped with timestamp and severity.
	logging.basicConfig(
	    format="%(asctime)s - %(levelname)s - %(message)s",
	    level=logging.INFO,
	)
	# Module-scoped logger used by the training functions in this file.
	logger = logging.getLogger(__name__)


	def train_mlp_model(data: dict):
	    """
	    Fit the MLP classifier on already-extracted features and evaluate it.

	    The configuration is read from ``config/model_config.yml``, the model is
	    trained with the train/val splits, the best checkpoint is reloaded when
	    it exists on disk, and the resulting model is evaluated on the test
	    split and saved to ``models/mlp/mlp_multimodal.pth``.

	    Parameters
	    ----------
	    data : dict
	        Mapping with the keys ``"train"``, ``"val"`` and ``"test"``; each
	        value is unpacked into a ``(features, labels)`` pair.

	    Returns
	    -------
	    The classifier object created by ``build_mlp``, after training,
	    evaluation and saving.
	    """
	    logger.info("Starting MLP training...")

	    mlp_config = load_config("config/model_config.yml")

	    # Unpack each split into its feature matrix and label vector.
	    train_features, train_labels = data["train"]
	    val_features, val_labels = data["val"]
	    test_features, test_labels = data["test"]

	    # The input width is taken from the second axis of the training features.
	    classifier = build_mlp(input_dim=train_features.shape[1], config=mlp_config)
	    classifier.get_model_summary()

	    # The value returned by train() is not used by this function.
	    classifier.train(train_features, train_labels, val_features, val_labels)

	    # Prefer the best checkpoint for the final evaluation; if it is missing,
	    # continue with the weights currently held by the classifier.
	    try:
	        classifier.load_model("models/mlp/mlp_best.pth")
	    except FileNotFoundError:
	        logger.warning("Best model not found, using current model")
	    else:
	        logger.info("Loaded best model for final evaluation")

	    evaluation = classifier.evaluate(test_features, test_labels)
	    classifier.save_model("models/mlp/mlp_multimodal.pth")

	    logger.info("MLP training completed successfully!")
	    logger.info(f"Final test accuracy: {evaluation['test_accuracy']:.2f}%")

	    return classifier


	def _build_training_dataset():
	    """
	    Extract, scale and fuse audio and lyric features for the whole dataset.

	    Batches are preprocessed one at a time, passed through the SpecTTTra and
	    LLM2Vec feature extractors, and written into preallocated matrices.
	    The lyric features are then reduced with PCA, concatenated with the
	    audio features, and the result is saved to ``DATASET_NPZ``. The fitted
	    PCA model is saved to ``models/fusion/pca.pkl``.

	    Returns
	    -------
	    tuple
	        ``(X, Y)`` where ``X`` is the concatenated feature matrix and ``Y``
	        is the label array.
	    """
	    # Get batches from dataset and return full Y labels
	    batches, labels = dataset_read(batch_size=500)
	    total_samples = len(labels)

	    # Instantiate the LLM2Vec model once and reuse it for every batch
	    llm2vec_model = load_llm2vec_model()

	    # Preallocate space for both audio and lyric vectors to reduce memory
	    # overhead. The widths 384 and 4096 are hard-coded; presumably they match
	    # the SpecTTTra and LLM2Vec output sizes -- TODO confirm.
	    audio_vectors = np.zeros((total_samples, 384), dtype=np.float32)
	    lyric_vectors = np.zeros((total_samples, 4096), dtype=np.float32)

	    start_idx = 0
	    for batch_count, batch in enumerate(batches, start=1):
	        logger.info(f"Bulk Preprocessing - Batch {batch_count}.")
	        audio, lyrics = bulk_preprocessing(batch, batch_count)

	        # Call the train methods for both SpecTTTra and LLM2Vec
	        logger.info("Starting SpecTTTra feature extraction...")
	        audio_features = spectttra_train(audio)

	        logger.info("Starting LLM2Vec feature extraction...")
	        lyrics_features = l2vec_train(llm2vec_model, lyrics)

	        batch_size = audio_features.shape[0]
	        end_idx = start_idx + batch_size

	        # Store the results on preallocated spaces
	        audio_vectors[start_idx:end_idx, :] = audio_features
	        lyric_vectors[start_idx:end_idx, :] = lyrics_features

	        # Advance the write offset; without this every batch would overwrite
	        # the first rows and the remaining rows would stay zero.
	        start_idx = end_idx

	        # Delete stored instance for next batch to remove overhead
	        del audio, lyrics, audio_features, lyrics_features

	    # Sanity check: rows never written would silently remain all-zero.
	    if start_idx != total_samples:
	        logger.warning(
	            f"Extracted features for {start_idx} samples but found "
	            f"{total_samples} labels; unfilled rows remain zero."
	        )

	    # Run standard scaling on audio and lyrics separately
	    logger.info("Running standard scaling for audio and lyrics...")
	    audio_vectors, lyric_vectors = dataset_scaler(audio_vectors, lyric_vectors)

	    # Fit PCA on the collected lyric features and reduce them to 256 dims
	    logger.info("PCA Training on lyric vectors...")
	    pca = PCA(n_components=256, svd_solver="randomized", random_state=42)
	    lyric_vectors = pca.fit_transform(lyric_vectors)

	    # Save the trained PCA model
	    # NOTE(review): PCA_MODEL is imported by this file but unused; this
	    # hard-coded path probably should be that constant -- confirm they match.
	    joblib.dump(pca, "models/fusion/pca.pkl")

	    # Concatenate audio features and reduced lyrics features
	    X = np.concatenate([audio_vectors, lyric_vectors], axis=1)
	    logger.info(f"Audio and Lyrics Concatenated. Final features shape: {X.shape}")

	    # Convert label list into np.array
	    Y = np.array(labels)

	    # Save both X and Y to an .npz file for easier loading
	    logger.info("Saving dataset for future testing...")
	    np.savez(DATASET_NPZ, X=X, Y=Y)

	    return X, Y


	def train_pipeline():
	    """
	    Training script which includes preprocessing, feature extraction, and
	    training the MLP model.

	    If the file at ``DATASET_NPZ`` already exists, its ``X`` and ``Y`` arrays
	    are loaded; otherwise the dataset is built from scratch and saved in
	    .npz format. The data is then split and the MLP model is trained.

	    Parameters
	    ----------
	    None

	    Returns
	    -------
	    None
	    """
	    dataset_path = Path(DATASET_NPZ)

	    if dataset_path.exists():
	        logger.info("Training dataset already exists. Loading file...")

	        # Arrays are read inside the context manager so the underlying
	        # archive file handle is closed once they are loaded.
	        with np.load(DATASET_NPZ) as loaded_data:
	            X = loaded_data["X"]
	            Y = loaded_data["Y"]
	    else:
	        logger.info("Training dataset does not exist. Processing data...")
	        X, Y = _build_training_dataset()

	    # Do data splitting
	    data = dataset_splitter(X, Y)

	    logger.info("Starting MLP training...")
	    train_mlp_model(data)


	# Run the full training pipeline only when this file is executed as a script.
	if __name__ == "__main__":
	train_pipeline()