Spaces:
Running
Running
File size: 2,703 Bytes
fc7b4a9 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
import pandas as pd
import numpy as np
from src.preprocessing.audio_preprocessor import AudioPreprocessor
from src.preprocessing.lyrics_preprocessor import LyricsPreprocessor
from src.utils.config_loader import DATASET_CSV
def bulk_preprocessing(batch: pd.DataFrame, batch_count: int):
    """
    Runs the audio and lyrics preprocessors over every row of a training batch

    Parameters
    ----------
    batch : pd.DataFrame
        Batch to process. Each row must expose `directory`, `skip_time`
        and `lyrics` fields, since those are read from every row.
    batch_count : int
        Number of the current batch; only used in the progress messages.

    Returns
    -------
    audio_list : list
        Output of the audio preprocessor for each row, in row order.
    lyric_list : list
        Output of the lyrics preprocessor for each row, in row order.
    """
    song_pipeline = AudioPreprocessor(script="train")
    text_pipeline = LyricsPreprocessor()

    audio_list = []
    lyric_list = []
    batch_length = len(batch)

    print(f"Preprocessing training data with length {batch_length}\n")

    # enumerate(start=1) yields the 1-based position shown in the progress line
    for count, record in enumerate(batch.itertuples(), start=1):
        print(f"Batch {batch_count} - {count}/{batch_length}")

        # Audio is processed first, then the lyrics of the same row
        audio_list.append(
            song_pipeline(file=record.directory, skip_time=record.skip_time, train=True)
        )
        lyric_list.append(text_pipeline(lyrics=record.lyrics))

    return audio_list, lyric_list
def single_preprocessing(audio, lyric: str):
    """
    Runs the audio and lyrics preprocessors on one audio/lyric pair

    Parameters
    ----------
    audio : audio_object
        Audio input, forwarded to the audio preprocessor as `file`.
    lyric : string
        Lyric text, forwarded to the lyrics preprocessor as `lyrics`.

    Returns
    -------
    processed_song
        Result of the audio preprocessor (described upstream as a tensor;
        the exact type is decided by AudioPreprocessor).
    processed_lyric
        Result of the lyrics preprocessor (described upstream as a string).
    """
    # Both preprocessors are built before any data is touched
    song_pipeline = AudioPreprocessor(script="predict")
    text_pipeline = LyricsPreprocessor()

    # Tuple elements evaluate left to right: audio first, then lyrics
    return song_pipeline(file=audio), text_pipeline(lyrics=lyric)
def dataset_read(batch_size=20):
    """
    Reads the dataset csv file and returns it split into batches

    Parameters
    ----------
    batch_size : int, optional
        Number of batches to split the dataset into (default 20). Despite
        the name, this is NOT the number of rows per batch: it is passed to
        np.array_split as the section count, so the rows are divided into
        this many near-equal parts.

    Returns
    -------
    data_splits : list
        List of dataframes acting as batches, in original row order.
    label : list
        Values of the 'target' column for the whole dataset (not split per
        batch); described by the dataset authors as real/fake labels in the
        form of 0 and 1.

    Raises
    ------
    ValueError
        Raised by np.array_split when `batch_size` is not larger than 0.
    """
    dataset = pd.read_csv(DATASET_CSV)
    label = dataset['target'].tolist()

    # Split the row positions instead of the dataframe itself: calling
    # np.array_split directly on a DataFrame goes through the deprecated
    # DataFrame.swapaxes and is not guaranteed to keep returning dataframes.
    # Slicing with .iloc yields the same partition and always a DataFrame.
    row_positions = np.array_split(np.arange(len(dataset)), batch_size)
    data_splits = [dataset.iloc[positions] for positions in row_positions]

    return data_splits, label
|