Spaces:
Running
Running
| from sklearn.preprocessing import StandardScaler | |
| from sklearn.model_selection import train_test_split | |
| from src.utils.config_loader import AUDIO_SCALER, LYRICS_SCALER #, PCA_SCALER | |
| from sklearn.decomposition import IncrementalPCA | |
| from src.utils.config_loader import PCA_MODEL | |
| import joblib | |
| import numpy as np | |
| import logging | |
# Configure root logging once at import time: timestamped records at INFO and above.
logging.basicConfig(
    format="%(asctime)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
# Module-scoped logger shared by the helpers below.
logger = logging.getLogger(__name__)
def dataset_splitter(X: np.ndarray, Y: np.ndarray, ids: np.ndarray = None):
    """
    Splits X, Y (and optional ids) into stratified train/val/test sets.

    The data is first split 90/10 into (train + val)/test, then the remaining
    90% is split with ``test_size=0.2222`` into train/val, which yields roughly
    a 70/20/10 train/val/test partition. Both splits are stratified on the
    labels and use ``random_state=42`` so the partition is reproducible.

    Parameters
    ----------
    X : np.array
        Feature vectors
    Y : np.array
        Labels. Must be non-negative integers, since the class distribution
        is logged with ``np.bincount``.
    ids : np.array, optional
        Identifiers (filenames or row indices). When given, they are split
        alongside X and Y so each sample keeps its identifier.

    Returns
    -------
    data : dict
        A dictionary with keys {train, val, test}.
        Each value is a tuple ``(X_split, y_split)``, or
        ``(X_split, y_split, ids_split)`` when ``ids`` is provided.
        Consumers that unpack ``(X, y)`` pairs should therefore be fed splits
        produced without ``ids``.
    """
    logger.info(f"Dataset shape: {X.shape}, Labels: {len(Y)}")
    logger.info(f"Class distribution: {np.bincount(Y)}")

    # Only hand ids to train_test_split when they exist: every extra array adds
    # two more return values, which the original 4-name unpacking did not expect.
    arrays = [X, Y] if ids is None else [X, Y, ids]

    # First split: (train + val) vs test.
    # train_test_split returns [a_train, a_test, b_train, b_test, ...], so the
    # even positions are the "train" halves and the odd positions the "test" halves.
    first_split = train_test_split(
        *arrays, test_size=0.1, random_state=42, stratify=Y
    )
    remainder, test = first_split[0::2], first_split[1::2]

    # Second split: train vs val, stratified on the labels of the remainder
    second_split = train_test_split(
        *remainder, test_size=0.2222, random_state=42, stratify=remainder[1]
    )
    train, val = second_split[0::2], second_split[1::2]

    logger.info(
        f"Train: {train[0].shape}, Validation: {val[0].shape}, Test: {test[0].shape}"
    )

    data = {
        "train": tuple(train),
        "val": tuple(val),
        "test": tuple(test),
    }
    return data
def scale_pca(
    data: dict,
    *,
    audio_dim: int = 384,
    n_components: int = 512,
    batch_size: int = 1000,
):
    """
    Scales the audio and lyrics parts of each split and applies PCA to the lyrics part.

    Each feature matrix is treated as a concatenation ``[audio | lyrics]``: the
    first ``audio_dim`` columns are audio features and the remaining columns are
    lyrics features. Scalers and PCA are fitted on the training split only and
    then applied to the validation and test splits.

    Side effects: the fitted scalers are saved by ``dataset_scaler`` (to
    ``AUDIO_SCALER`` and ``LYRICS_SCALER``) and the fitted PCA model is saved
    to ``PCA_MODEL`` with joblib.

    Parameters
    ----------
    data : dictionary
        Dictionary containing the splits, with keys {train, val, test} and
        values ``(X, y)``.
    audio_dim : int, optional
        Number of leading columns that hold the audio features (default 384).
    n_components : int, optional
        Number of principal components kept for the lyrics features (default 512).
    batch_size : int, optional
        Number of training rows fed to each incremental PCA update (default 1000).

    Returns
    -------
    data : dict{np.array}
        A dictionary with the train/val/test splits as ``(X, y)`` tuples, where
        X is the scaled audio features concatenated with the PCA-reduced lyrics
        features. Labels are passed through unchanged.
    """
    # Local import keeps this function self-contained; gen_batches ships with scikit-learn.
    from sklearn.utils import gen_batches

    # Destructure the dictionary to get data split
    X_train, y_train = data["train"]
    X_val, y_val = data["val"]
    X_test, y_test = data["test"]

    # Segment the concatenated embedding to audio and lyrics
    X_train_audio, X_train_lyrics = X_train[:, :audio_dim], X_train[:, audio_dim:]
    X_test_audio, X_test_lyrics = X_test[:, :audio_dim], X_test[:, audio_dim:]
    X_val_audio, X_val_lyrics = X_val[:, :audio_dim], X_val[:, audio_dim:]

    # Fit the scalers on the train data only; they are reused for val and test
    audio_scaler, lyric_scaler = dataset_scaler(X_train_audio, X_train_lyrics)

    # Transform every split using the train-fitted scalers
    X_train_audio = audio_scaler.transform(X_train_audio)
    X_test_audio = audio_scaler.transform(X_test_audio)
    X_val_audio = audio_scaler.transform(X_val_audio)
    X_train_lyrics = lyric_scaler.transform(X_train_lyrics)
    X_test_lyrics = lyric_scaler.transform(X_test_lyrics)
    X_val_lyrics = lyric_scaler.transform(X_val_lyrics)

    # Fit PCA on TRAINING lyrics only.
    # partial_fit rejects batches with fewer rows than n_components, so a plain
    # range(0, n, batch_size) loop crashes whenever the trailing batch is short.
    # min_batch_size makes gen_batches fold a short remainder into the previous batch.
    ipca = IncrementalPCA(n_components=n_components)
    for batch in gen_batches(
        X_train_lyrics.shape[0], batch_size, min_batch_size=n_components
    ):
        ipca.partial_fit(X_train_lyrics[batch])

    # Project each split onto the fitted components (whole array at once)
    X_train_lyrics = ipca.transform(X_train_lyrics)
    X_test_lyrics = ipca.transform(X_test_lyrics)
    X_val_lyrics = ipca.transform(X_val_lyrics)

    # NOTE: Scaling after PCA produces underperforming models compared to non-scaling.
    # For experimentation, a StandardScaler can be fitted on the projected training
    # lyrics, applied to all three splits here, and saved for prediction.

    # Concatenate them back to their original layout, but scaled/reduced
    X_train = np.concatenate([X_train_audio, X_train_lyrics], axis=1)
    X_test = np.concatenate([X_test_audio, X_test_lyrics], axis=1)
    X_val = np.concatenate([X_val_audio, X_val_lyrics], axis=1)

    # Save the fitted PCA model for prediction
    joblib.dump(ipca, PCA_MODEL)

    data = {
        "train": (X_train, y_train),
        "val": (X_val, y_val),
        "test": (X_test, y_test),
    }
    return data
def scale_pca_lyrics(data: dict, *, n_components: int = 512, batch_size: int = 1000):
    """
    Scales lyrics-only splits and reduces them with PCA.

    The whole feature matrix of each split is standardized and then projected
    with an incremental PCA. Both the scaler and the PCA are fitted on the
    training split only and then applied to the validation and test splits.

    Side effects: the fitted scaler is saved to ``LYRICS_SCALER`` and the
    fitted PCA model to ``PCA_MODEL`` with joblib.

    Parameters
    ----------
    data : dictionary
        Dictionary containing the splits, with keys {train, val, test} and
        values ``(X, y)``.
    n_components : int, optional
        Number of principal components to keep (default 512).
    batch_size : int, optional
        Number of training rows fed to each incremental PCA update (default 1000).

    Returns
    -------
    data : dict{np.array}
        A dictionary with the train/val/test splits as ``(X, y)`` tuples, where
        X is the scaled, PCA-reduced feature matrix. Labels are passed through
        unchanged.
    """
    # Local import keeps this function self-contained; gen_batches ships with scikit-learn.
    from sklearn.utils import gen_batches

    # Destructure the dictionary to get data split
    X_train, y_train = data["train"]
    X_val, y_val = data["val"]
    X_test, y_test = data["test"]

    # Fit the scaler on the train data only and save it for prediction
    lyric_scaler = StandardScaler().fit(X_train)
    joblib.dump(lyric_scaler, LYRICS_SCALER)

    X_train = lyric_scaler.transform(X_train)
    X_test = lyric_scaler.transform(X_test)
    X_val = lyric_scaler.transform(X_val)

    # Fit PCA on TRAINING lyrics only.
    # partial_fit rejects batches with fewer rows than n_components, so a plain
    # range(0, n, batch_size) loop crashes whenever the trailing batch is short.
    # min_batch_size makes gen_batches fold a short remainder into the previous batch.
    ipca = IncrementalPCA(n_components=n_components)
    for batch in gen_batches(
        X_train.shape[0], batch_size, min_batch_size=n_components
    ):
        ipca.partial_fit(X_train[batch])

    # Project each split onto the fitted components (whole array at once)
    X_train = ipca.transform(X_train)
    X_test = ipca.transform(X_test)
    X_val = ipca.transform(X_val)

    # Save the fitted PCA model for prediction
    joblib.dump(ipca, PCA_MODEL)

    data = {
        "train": (X_train, y_train),
        "val": (X_val, y_val),
        "test": (X_test, y_test),
    }
    return data
def scale(data: dict):
    """
    Standardizes the feature matrix of every split with a single scaler.

    The scaler is a ``StandardScaler(with_mean=False)`` (no mean-centering),
    fitted on the training split only. It is saved to ``AUDIO_SCALER`` with
    joblib and then applied to all three splits. No PCA is applied here.

    Parameters
    ----------
    data : dictionary
        Dictionary containing the splits, with keys {train, val, test} and
        values ``(X, y)``.

    Returns
    -------
    data : dict{np.array}
        A dictionary with the train/val/test splits as ``(X, y)`` tuples, where
        X is the scaled feature matrix. Labels are passed through unchanged.
    """
    split_names = ("train", "val", "test")

    # Unpack every split up front so a malformed dictionary fails before
    # anything is fitted or written to disk
    features, labels = {}, {}
    for split in split_names:
        features[split], labels[split] = data[split]

    # Fit on the training features only, then persist the scaler for prediction
    fitted_scaler = StandardScaler(with_mean=False).fit(features["train"])
    joblib.dump(fitted_scaler, AUDIO_SCALER)

    # Apply the train-fitted scaler to each split
    return {
        split: (fitted_scaler.transform(features[split]), labels[split])
        for split in split_names
    }
def dataset_scaler(audio: np.ndarray, lyrics: np.ndarray):
    """
    Method to fit Z-Score scalers for both audio and lyric vectors.

    Standardizing gives each feature a mean of 0 and unit standard deviation
    on the data it was fitted on, so both vectors end up in similar ranges.
    The fitted scalers are also saved through joblib (to ``AUDIO_SCALER`` and
    ``LYRICS_SCALER``) so they can be loaded again at prediction time.

    Note that this function only fits and saves the scalers; the caller is
    responsible for calling ``transform`` on the data.

    Parameters
    ----------
    audio : np.array
        Array of audio features to fit the audio scaler on
    lyrics : np.array
        Array of lyric features to fit the lyric scaler on

    Returns
    -------
    audio_scaler : StandardScaler
        Scaler fitted on the audio features
    lyric_scaler : StandardScaler
        Scaler fitted on the lyric features
    """
    # Fit both scalers first so nothing is written to disk if either fit fails
    fitted_audio = StandardScaler()
    fitted_audio.fit(audio)
    fitted_lyrics = StandardScaler()
    fitted_lyrics.fit(lyrics)

    # Persist each fitted scaler to its configured location for prediction
    for fitted, destination in (
        (fitted_audio, AUDIO_SCALER),
        (fitted_lyrics, LYRICS_SCALER),
    ):
        joblib.dump(fitted, destination)

    return fitted_audio, fitted_lyrics
def instance_scaler(audio: np.ndarray, lyrics: np.ndarray):
    """
    Method to scale a single input's audio and lyrics features.

    Loads the scalers stored at ``AUDIO_SCALER`` and ``LYRICS_SCALER`` with
    joblib and applies them to the given features.

    Parameters
    ----------
    audio : np.array
        Instance of an audio feature
        (presumably shaped (n_samples, n_features) -- confirm with caller)
    lyrics : np.array
        Instance of a lyric feature
        (presumably shaped (n_samples, n_features) -- confirm with caller)

    Returns
    -------
    scaled_audio : np.array
        Array of scaled audio feature
    scaled_lyric : np.array
        Array of scaled lyric feature
    """
    # Load both persisted scalers before transforming anything.
    # NOTE(review): joblib.load unpickles the files; they must come from a trusted source.
    fitted_audio, fitted_lyrics = joblib.load(AUDIO_SCALER), joblib.load(LYRICS_SCALER)

    # Apply each scaler to its matching input
    return fitted_audio.transform(audio), fitted_lyrics.transform(lyrics)
def audio_instance_scaler(audio: np.ndarray):
    """
    Method to scale a single input's audio features.

    Loads the scaler stored at ``AUDIO_SCALER`` with joblib and applies it to
    the given features.

    Parameters
    ----------
    audio : np.array
        Instance of an audio feature
        (presumably shaped (n_samples, n_features) -- confirm with caller)

    Returns
    -------
    scaled_audio : np.array
        Array of scaled audio feature
    """
    # Load the persisted scaler and apply it in one step.
    # NOTE(review): joblib.load unpickles the file; it must come from a trusted source.
    return joblib.load(AUDIO_SCALER).transform(audio)