bach-or-bot / src /preprocessing /preprocessor.py
krislette's picture
Auto-deploy from GitHub: cb4a769f21149a39309d7602af027f4cc33f773b
c84f2c4
import pandas as pd
import numpy as np
import math
from src.preprocessing.audio_preprocessor import AudioPreprocessor
from src.preprocessing.lyrics_preprocessor import LyricsPreprocessor
from src.utils.config_loader import DATASET_CSV
def bulk_preprocessing(batch: pd.DataFrame, batch_count: int):
"""
Applies audio and lyrics preprocessing to a training batch
Parameters
----------
batch : pd.dataframe
Dataframe containing the batch data.
batch_count : int
Batch count value.
Returns
-------
audio_list : list
List of loaded audio in float form.
lyric_list : list
List of loaded lyrics in string form.
"""
audio_preprocessor = AudioPreprocessor(script="train")
lyric_preprocessor = LyricsPreprocessor()
audio_list, lyric_list = [], []
count, batch_length = 1, len(batch)
print(f"Preprocessing training data with length {batch_length}\n")
for row in batch.itertuples():
print(f"Batch {batch_count} - {count}/{batch_length}")
# Preprocess song and append to audio list
processed_song = audio_preprocessor(file=row.directory, skip_time=row.skip_time, train=True)
audio_list.append(processed_song)
# Preprocess lyric and append to lyric list
processed_lyric = lyric_preprocessor(lyrics=row.lyrics)
lyric_list.append(processed_lyric)
count += 1
return audio_list, lyric_list
def bulk_preprocessing_lyrics(batch: pd.DataFrame, batch_count: int):
"""
Applies lyrics preprocessing to a training batch
Parameters
----------
batch : pd.dataframe
Dataframe containing the batch data.
batch_count : int
Batch count value.
Returns
-------
lyric_list : list
List of loaded lyrics in string form.
"""
lyric_preprocessor = LyricsPreprocessor()
lyric_list = []
count, batch_length = 1, len(batch)
print(f"Preprocessing training data with length {batch_length}\n")
for row in batch.itertuples():
print(f"Batch {batch_count} - {count}/{batch_length}")
# Preprocess lyric and append to lyric list
processed_lyric = lyric_preprocessor(lyrics=row.lyrics)
lyric_list.append(processed_lyric)
count += 1
return lyric_list
def single_preprocessing(audio, lyric: str):
"""
Preprocesses a single record of audio and lyric data
Parameters
----------
audio : audio_object
Audio object file
lyric : string
Lyric string
Returns
-------
processed_song : tensor
Tensor version of the audio
processed_lyric : string
Lyric string
"""
# Instantiate preprocessor classes
audio_preprocessor = AudioPreprocessor(script="predict")
lyric_preprocessor = LyricsPreprocessor()
# Preprocess both song and lyrics
processed_song = audio_preprocessor(file=audio)
processed_lyric = lyric_preprocessor(lyrics=lyric)
return processed_song, processed_lyric
def single_audio_preprocessing(audio):
"""
Preprocesses a single record of audio
Parameters
----------
audio : audio_object
Audio object file
Returns
-------
processed_song : tensor
Tensor version of the audio
"""
# Instantiate preprocessor classes
audio_preprocessor = AudioPreprocessor(script="predict")
# Preprocess both song and lyrics
processed_song = audio_preprocessor(file=audio)
return processed_song
def dataset_read(batch_size=20):
"""
Reads the main dataset, splits it into the train/test/valid split, and computes
optimal number of samples per batch.
Parameters
----------
batch_size : int
Number of data per batch
Returns
-------
split: list[splits]
A collection of the three splits
split_lengths : list[int]
List of the split lengths
"""
dataset = pd.read_csv(DATASET_CSV)
train = dataset[dataset["split"] == "train"]
test = dataset[dataset["split"] == "test"]
val = dataset[dataset["split"] == "valid"]
# Find the minimum split size (ignoring empty splits)
min_split_size = min([len(train), len(test), len(val)])
# Clamp batch_size so it never exceeds the smallest split
effective_batch_size = min(batch_size, min_split_size if min_split_size > 0 else batch_size)
def make_splits(df, batch_size):
if len(df) == 0:
return []
n_splits = math.ceil(len(df) / batch_size)
return np.array_split(df, n_splits)
train_splits = make_splits(train, effective_batch_size)
test_splits = make_splits(test, effective_batch_size)
val_splits = make_splits(val, effective_batch_size)
splits = [train_splits, test_splits, val_splits]
split_lengths = [len(train), len(test), len(val)]
return splits, split_lengths