import math

import numpy as np
import pandas as pd

from src.preprocessing.audio_preprocessor import AudioPreprocessor
from src.preprocessing.lyrics_preprocessor import LyricsPreprocessor
from src.utils.config_loader import DATASET_CSV


def bulk_preprocessing(batch: pd.DataFrame, batch_count: int):
    """
    Applies audio and lyrics preprocessing to a training batch

    Parameters
    ----------
    batch : pd.DataFrame
        Dataframe containing the batch data. Each row must expose the
        ``directory``, ``skip_time`` and ``lyrics`` columns.
    batch_count : int
        Batch count value; only used in the progress messages.

    Returns
    -------
    audio_list : list
        One ``AudioPreprocessor`` output per row, in row order.
    lyric_list : list
        One ``LyricsPreprocessor`` output per row, in row order.
    """
    audio_preprocessor = AudioPreprocessor(script="train")
    lyric_preprocessor = LyricsPreprocessor()

    audio_list, lyric_list = [], []
    batch_length = len(batch)

    print(f"Preprocessing training data with length {batch_length}\n")

    # enumerate() supplies the 1-based progress counter for the log line.
    for count, row in enumerate(batch.itertuples(), start=1):
        print(f"Batch {batch_count} - {count}/{batch_length}")

        # Preprocess song and append to audio list
        processed_song = audio_preprocessor(
            file=row.directory, skip_time=row.skip_time, train=True
        )
        audio_list.append(processed_song)

        # Preprocess lyric and append to lyric list
        processed_lyric = lyric_preprocessor(lyrics=row.lyrics)
        lyric_list.append(processed_lyric)

    return audio_list, lyric_list


def bulk_preprocessing_lyrics(batch: pd.DataFrame, batch_count: int):
    """
    Applies lyrics preprocessing to a training batch

    Parameters
    ----------
    batch : pd.DataFrame
        Dataframe containing the batch data. Each row must expose the
        ``lyrics`` column.
    batch_count : int
        Batch count value; only used in the progress messages.

    Returns
    -------
    lyric_list : list
        One ``LyricsPreprocessor`` output per row, in row order.
    """
    lyric_preprocessor = LyricsPreprocessor()

    lyric_list = []
    batch_length = len(batch)

    print(f"Preprocessing training data with length {batch_length}\n")

    # enumerate() supplies the 1-based progress counter for the log line.
    for count, row in enumerate(batch.itertuples(), start=1):
        print(f"Batch {batch_count} - {count}/{batch_length}")

        # Preprocess lyric and append to lyric list
        processed_lyric = lyric_preprocessor(lyrics=row.lyrics)
        lyric_list.append(processed_lyric)

    return lyric_list


def single_preprocessing(audio, lyric: str):
    """
    Preprocesses a single record of audio and lyric data

    Parameters
    ----------
    audio : audio_object
        Audio object file, forwarded to ``AudioPreprocessor`` as ``file``.
    lyric : string
        Lyric string

    Returns
    -------
    processed_song
        Output of the audio preprocessor (presumably a tensor -- confirm
        against ``AudioPreprocessor``).
    processed_lyric
        Output of the lyrics preprocessor.
    """
    # Instantiate preprocessor classes
    audio_preprocessor = AudioPreprocessor(script="predict")
    lyric_preprocessor = LyricsPreprocessor()

    # Preprocess both song and lyrics
    processed_song = audio_preprocessor(file=audio)
    processed_lyric = lyric_preprocessor(lyrics=lyric)

    return processed_song, processed_lyric


def single_audio_preprocessing(audio):
    """
    Preprocesses a single record of audio

    Parameters
    ----------
    audio : audio_object
        Audio object file, forwarded to ``AudioPreprocessor`` as ``file``.

    Returns
    -------
    processed_song
        Output of the audio preprocessor (presumably a tensor -- confirm
        against ``AudioPreprocessor``).
    """
    # Instantiate preprocessor class
    audio_preprocessor = AudioPreprocessor(script="predict")

    # Preprocess the song only; lyrics are not handled here
    processed_song = audio_preprocessor(file=audio)

    return processed_song


def _split_into_batches(df: pd.DataFrame, batch_size) -> list:
    """Split ``df`` into ``ceil(len(df) / batch_size)`` near-equal row chunks.

    Chunk sizes follow the ``numpy.array_split`` rule (the first
    ``len(df) % n_batches`` chunks hold one extra row), but the rows are taken
    with ``DataFrame.iloc`` so no DataFrame is handed to NumPy.
    An empty ``df`` yields an empty list.
    """
    n_rows = len(df)
    if n_rows == 0:
        return []

    n_batches = math.ceil(n_rows / batch_size)
    base_size, n_larger = divmod(n_rows, n_batches)

    batches = []
    start = 0
    for index in range(n_batches):
        stop = start + base_size + (1 if index < n_larger else 0)
        batches.append(df.iloc[start:stop])
        start = stop
    return batches


def dataset_read(batch_size=20):
    """
    Reads the main dataset, splits it into the train/test/valid split, and
    divides each split into batches.

    Parameters
    ----------
    batch_size : int
        Maximum number of rows per batch. It is clamped to the size of the
        smallest non-empty split, so every non-empty split yields batches of
        comparable size.

    Returns
    -------
    splits : list
        ``[train_batches, test_batches, val_batches]``; each entry is a list
        of dataframes (an empty list when that split has no rows).
    split_lengths : list[int]
        Row counts of the train, test and valid splits, in that order.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

    dataset = pd.read_csv(DATASET_CSV)

    train = dataset[dataset["split"] == "train"]
    test = dataset[dataset["split"] == "test"]
    val = dataset[dataset["split"] == "valid"]

    split_lengths = [len(train), len(test), len(val)]

    # Clamp batch_size to the smallest split that actually has rows. Empty
    # splits are skipped so that one missing split does not disable the clamp
    # for the others.
    non_empty_lengths = [length for length in split_lengths if length > 0]
    if non_empty_lengths:
        effective_batch_size = min(batch_size, min(non_empty_lengths))
    else:
        effective_batch_size = batch_size

    splits = [
        _split_into_batches(train, effective_batch_size),
        _split_into_batches(test, effective_batch_size),
        _split_into_batches(val, effective_batch_size),
    ]

    return splits, split_lengths