Spaces:

krislette
/

bach-or-bot

Running

bach-or-bot / src /preprocessing /preprocessor.py

Acelle Krislette Rosales

Initial commit: Added application code

fc7b4a9 2 months ago

2.7 kB

	import pandas as pd
	import numpy as np

	from src.preprocessing.audio_preprocessor import AudioPreprocessor
	from src.preprocessing.lyrics_preprocessor import LyricsPreprocessor
	from src.utils.config_loader import DATASET_CSV


	def bulk_preprocessing(batch: pd.DataFrame, batch_count: int):
	    """
	    Run the audio and lyrics preprocessors over every row of a training batch.

	    Parameters
	    ----------
	    batch : pd.DataFrame
	        Batch rows. Each row is read through its ``directory``,
	        ``skip_time`` and ``lyrics`` attributes.

	    batch_count : int
	        Batch number; only used in the progress messages.

	    Returns
	    -------
	    audio_list : list
	        One ``AudioPreprocessor`` result per row, in row order.

	    lyric_list : list
	        One ``LyricsPreprocessor`` result per row, in row order.
	    """
	    # A fresh pair of preprocessors is built for every call
	    song_pipeline = AudioPreprocessor(script="train")
	    text_pipeline = LyricsPreprocessor()

	    audio_list = []
	    lyric_list = []
	    total_rows = len(batch)

	    print(f"Preprocessing training data with length {total_rows}\n")

	    # Progress positions are 1-based to match the "x/total" message
	    for position, record in enumerate(batch.itertuples(), start=1):
	        print(f"Batch {batch_count} - {position}/{total_rows}")

	        # Audio first, then lyrics, so both lists stay aligned by row
	        audio_list.append(
	            song_pipeline(file=record.directory, skip_time=record.skip_time, train=True)
	        )
	        lyric_list.append(text_pipeline(lyrics=record.lyrics))

	    return audio_list, lyric_list


	def single_preprocessing(audio, lyric: str):
	    """
	    Preprocess one audio/lyric pair for prediction.

	    Parameters
	    ----------
	    audio : audio_object
	        Audio input, passed to ``AudioPreprocessor`` as ``file``.

	    lyric : string
	        Raw lyric text, passed to ``LyricsPreprocessor`` as ``lyrics``.

	    Returns
	    -------
	    processed_song : tensor
	        Output of the audio preprocessor (a tensor, per the original
	        documentation of this function).

	    processed_lyric : string
	        Output of the lyrics preprocessor.
	    """
	    # Build the prediction-mode preprocessors
	    song_pipeline = AudioPreprocessor(script="predict")
	    text_pipeline = LyricsPreprocessor()

	    # Tuple elements are evaluated left to right: audio first, then lyrics
	    return song_pipeline(file=audio), text_pipeline(lyrics=lyric)


	def dataset_read(batch_size=20):
	    """
	    Reads the csv file and returns batches of data

	    Parameters
	    ----------
	    batch_size : int, optional
	        Number of batches to split the dataset into (default 20).
	        Despite its name this is the number of splits, not the number
	        of rows per batch. It is forwarded to ``np.array_split``, so a
	        value that is not positive raises ``ValueError``.

	    Returns
	    -------
	    data_splits : list
	        List of dataframes acting as batches. Sizes differ by at most
	        one row, and the original index labels are kept.

	    label : list
	        List of real/fake labels (in the form of 0 and 1), taken from
	        the ``target`` column of the whole dataset.
	    """
	    dataset = pd.read_csv(DATASET_CSV)
	    label = dataset["target"].tolist()

	    # Split row positions instead of the DataFrame itself: calling
	    # np.array_split directly on a DataFrame relies on the deprecated
	    # DataFrame.swapaxes. Partitioning the positions gives the same chunks.
	    row_positions = np.arange(len(dataset))
	    data_splits = [
	        dataset.iloc[chunk]
	        for chunk in np.array_split(row_positions, batch_size)
	    ]

	    return data_splits, label