Spaces:

krislette
/

bach-or-bot

Sleeping

App Files Files Community

bach-or-bot / src /preprocessing /preprocessor.py

krislette

Auto-deploy from GitHub: cb4a769f21149a39309d7602af027f4cc33f773b

c84f2c4 about 2 months ago

raw

history blame contribute delete

4.93 kB

	import pandas as pd
	import numpy as np
	import math

	from src.preprocessing.audio_preprocessor import AudioPreprocessor
	from src.preprocessing.lyrics_preprocessor import LyricsPreprocessor
	from src.utils.config_loader import DATASET_CSV


	def bulk_preprocessing(batch: pd.DataFrame, batch_count: int):
	    """
	    Run the audio and lyrics preprocessors over every row of a training batch.

	    Parameters
	    ----------
	    batch : pd.DataFrame
	        Batch rows. Each row is read through its ``directory``,
	        ``skip_time`` and ``lyrics`` fields.

	    batch_count : int
	        Batch number; only used in the progress messages.

	    Returns
	    -------
	    audio_list : list
	        One audio-preprocessor output per row, in row order.

	    lyric_list : list
	        One lyrics-preprocessor output per row, in row order.
	    """
	    to_audio = AudioPreprocessor(script="train")
	    to_lyrics = LyricsPreprocessor()
	    total = len(batch)

	    print(f"Preprocessing training data with length {total}\n")

	    songs = []
	    lyrics = []
	    for position, record in enumerate(batch.itertuples(), start=1):
	        # Progress line is emitted before the (potentially slow) work starts
	        print(f"Batch {batch_count} - {position}/{total}")

	        # Audio first, then lyrics, for the same row
	        songs.append(to_audio(file=record.directory, skip_time=record.skip_time, train=True))
	        lyrics.append(to_lyrics(lyrics=record.lyrics))

	    return songs, lyrics


	def bulk_preprocessing_lyrics(batch: pd.DataFrame, batch_count: int):
	    """
	    Run the lyrics preprocessor over every row of a training batch.

	    Parameters
	    ----------
	    batch : pd.DataFrame
	        Batch rows. Each row is read through its ``lyrics`` field.

	    batch_count : int
	        Batch number; only used in the progress messages.

	    Returns
	    -------
	    lyric_list : list
	        One lyrics-preprocessor output per row, in row order.
	    """
	    to_lyrics = LyricsPreprocessor()
	    total = len(batch)

	    print(f"Preprocessing training data with length {total}\n")

	    lyrics = []
	    for position, record in enumerate(batch.itertuples(), start=1):
	        # Progress line is emitted before the row is processed
	        print(f"Batch {batch_count} - {position}/{total}")
	        lyrics.append(to_lyrics(lyrics=record.lyrics))

	    return lyrics


	def single_preprocessing(audio, lyric: str):
	    """
	    Preprocess one audio/lyric pair with the prediction-mode preprocessors.

	    Parameters
	    ----------
	    audio : audio_object
	        Audio input, forwarded to the audio preprocessor as ``file``.

	    lyric : string
	        Raw lyric text, forwarded to the lyrics preprocessor as ``lyrics``.

	    Returns
	    -------
	    processed_song
	        Whatever the audio preprocessor returns for ``audio``
	        (described as a tensor by the original documentation).

	    processed_lyric
	        Whatever the lyrics preprocessor returns for ``lyric``.
	    """
	    # Build both pipelines up front; the audio one runs in "predict" mode
	    song_pipeline = AudioPreprocessor(script="predict")
	    lyric_pipeline = LyricsPreprocessor()

	    # Audio is processed before lyrics (tuple elements evaluate left to right)
	    return song_pipeline(file=audio), lyric_pipeline(lyrics=lyric)


	def single_audio_preprocessing(audio):
	    """
	    Preprocess one audio input with the prediction-mode audio preprocessor.

	    Parameters
	    ----------
	    audio : audio_object
	        Audio input, forwarded to the audio preprocessor as ``file``.

	    Returns
	    -------
	    processed_song
	        Whatever the audio preprocessor returns for ``audio``
	        (described as a tensor by the original documentation).
	    """
	    # Only audio is handled here; no lyrics preprocessor is involved
	    song_pipeline = AudioPreprocessor(script="predict")
	    return song_pipeline(file=audio)


	def dataset_read(batch_size=20):
	    """
	    Read the main dataset, separate it by its ``split`` column, and cut each
	    split into batches.

	    Parameters
	    ----------
	    batch_size : int
	        Requested number of rows per batch. It is clamped to the size of the
	        smallest split, unless at least one split is empty, in which case it
	        is used as given.

	    Returns
	    -------
	    splits : list[list[pd.DataFrame]]
	        ``[train_splits, test_splits, val_splits]``. Each entry is a list of
	        row-wise chunks of that split, or an empty list when the split has
	        no rows.

	    split_lengths : list[int]
	        Row counts of the train, test and valid splits, in that order.

	    Raises
	    ------
	    ValueError
	        If ``batch_size`` is smaller than 1.
	    """
	    # Fail early with a clear message instead of a ZeroDivisionError (0) or
	    # an opaque error from the chunking step (negative values)
	    if batch_size < 1:
	        raise ValueError(f"batch_size must be at least 1, got {batch_size}")

	    dataset = pd.read_csv(DATASET_CSV)

	    train = dataset[dataset["split"] == "train"]
	    test = dataset[dataset["split"] == "test"]
	    val = dataset[dataset["split"] == "valid"]

	    # Clamp batch_size so it never exceeds the smallest split. If any split
	    # is empty the minimum is 0 and the requested batch_size is kept as is.
	    min_split_size = min(len(train), len(test), len(val))
	    if min_split_size > 0:
	        effective_batch_size = min(batch_size, min_split_size)
	    else:
	        effective_batch_size = batch_size

	    def make_splits(df, batch_size):
	        if len(df) == 0:
	            return []
	        n_splits = math.ceil(len(df) / batch_size)

	        # Same chunk sizes as np.array_split: the first `extra` chunks get one
	        # additional row. Slicing with iloc avoids the DataFrame.swapaxes
	        # call that np.array_split triggers and that pandas has deprecated.
	        base, extra = divmod(len(df), n_splits)
	        chunks, start = [], 0
	        for index in range(n_splits):
	            stop = start + base + (1 if index < extra else 0)
	            chunks.append(df.iloc[start:stop])
	            start = stop
	        return chunks

	    train_splits = make_splits(train, effective_batch_size)
	    test_splits = make_splits(test, effective_batch_size)
	    val_splits = make_splits(val, effective_batch_size)

	    splits = [train_splits, test_splits, val_splits]
	    split_lengths = [len(train), len(test), len(val)]

	    return splits, split_lengths