import numpy as np
import pandas as pd

from src.preprocessing.audio_preprocessor import AudioPreprocessor
from src.preprocessing.lyrics_preprocessor import LyricsPreprocessor
from src.utils.config_loader import DATASET_CSV


def bulk_preprocessing(batch: pd.DataFrame, batch_count: int):
    """
    Applies audio and lyrics preprocessing to a training batch

    Parameters
    ----------
    batch : pd.DataFrame
        Dataframe containing the batch data. Every row is accessed through
        its ``directory``, ``skip_time`` and ``lyrics`` columns.
    batch_count : int
        Batch count value. Only used in the progress output.

    Returns
    -------
    audio_list : list
        Preprocessed audio, one entry per row of ``batch``, in row order.
    lyric_list : list
        Preprocessed lyrics, one entry per row of ``batch``, in row order.
    """
    audio_preprocessor = AudioPreprocessor(script="train")
    lyric_preprocessor = LyricsPreprocessor()

    audio_list, lyric_list = [], []
    batch_length = len(batch)

    print(f"Preprocessing training data with length {batch_length}\n")

    # enumerate(start=1) yields the 1-based progress counter shown to the user.
    for count, row in enumerate(batch.itertuples(), start=1):
        print(f"Batch {batch_count} - {count}/{batch_length}")

        # Preprocess song and append to audio list
        audio_list.append(
            audio_preprocessor(
                file=row.directory, skip_time=row.skip_time, train=True
            )
        )

        # Preprocess lyric and append to lyric list
        lyric_list.append(lyric_preprocessor(lyrics=row.lyrics))

    return audio_list, lyric_list


def single_preprocessing(audio, lyric: str):
    """
    Preprocesses a single record of audio and lyric data

    Parameters
    ----------
    audio : audio_object
        Audio object file, passed to the audio preprocessor as ``file``.
    lyric : string
        Lyric string

    Returns
    -------
    processed_song
        Output of the audio preprocessor in "predict" mode (presumably a
        tensor -- confirm against ``AudioPreprocessor``).
    processed_lyric
        Output of the lyrics preprocessor for ``lyric``.
    """
    # Instantiate preprocessor classes
    audio_preprocessor = AudioPreprocessor(script="predict")
    lyric_preprocessor = LyricsPreprocessor()

    # Preprocess both song and lyrics
    processed_song = audio_preprocessor(file=audio)
    processed_lyric = lyric_preprocessor(lyrics=lyric)

    return processed_song, processed_lyric


def dataset_read(batch_size: int = 20):
    """
    Reads the csv file and returns batches of data

    Parameters
    ----------
    batch_size : int, default 20
        Number of batches the dataset is split into. Despite its name this
        is the number of splits, NOT the number of rows per batch: it is
        forwarded to ``np.array_split`` as the section count, so the rows
        are divided as evenly as possible and earlier batches receive one
        extra row when the division is not exact.

    Returns
    -------
    data_splits : list
        List of dataframes acting as batches
    label : list
        Values of the ``target`` column for the whole dataset (not split
        per batch); real/fake labels in the form of 0 and 1.

    Raises
    ------
    ValueError
        Raised by ``np.array_split`` when ``batch_size`` is not positive.
    """
    dataset = pd.read_csv(DATASET_CSV)
    label = dataset["target"].tolist()

    # Split the row positions instead of the dataframe itself: calling
    # np.array_split directly on a DataFrame goes through the deprecated
    # DataFrame.swapaxes. The chunk sizes and row order are identical.
    row_positions = np.arange(len(dataset))
    data_splits = [
        dataset.iloc[chunk]
        for chunk in np.array_split(row_positions, batch_size)
    ]

    return data_splits, label