Spaces:
Running
Running
File size: 4,440 Bytes
fc7b4a9 75d43d2 fc7b4a9 75d43d2 fc7b4a9 75d43d2 fc7b4a9 75d43d2 fc7b4a9 75d43d2 fc7b4a9 75d43d2 fc7b4a9 75d43d2 fc7b4a9 75d43d2 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 |
import pandas as pd
import numpy as np
import math
from src.preprocessing.audio_preprocessor import AudioPreprocessor
from src.preprocessing.lyrics_preprocessor import LyricsPreprocessor
from src.utils.config_loader import DATASET_CSV
def bulk_preprocessing(batch: pd.DataFrame, batch_count: int):
"""
Applies audio and lyrics preprocessing to a training batch
Parameters
----------
batch : pd.dataframe
Dataframe containing the batch data.
batch_count : int
Batch count value.
Returns
-------
audio_list : list
List of loaded audio in float form.
lyric_list : list
List of loaded lyrics in string form.
"""
audio_preprocessor = AudioPreprocessor(script="train")
lyric_preprocessor = LyricsPreprocessor()
audio_list, lyric_list = [], []
count, batch_length = 1, len(batch)
print(f"Preprocessing training data with length {batch_length}\n")
for row in batch.itertuples():
print(f"Batch {batch_count} - {count}/{batch_length}")
# Preprocess song and append to audio list
processed_song = audio_preprocessor(file=row.directory, skip_time=row.skip_time, train=True)
audio_list.append(processed_song)
# Preprocess lyric and append to lyric list
processed_lyric = lyric_preprocessor(lyrics=row.lyrics)
lyric_list.append(processed_lyric)
count += 1
return audio_list, lyric_list
def bulk_preprocessing_lyrics(batch: pd.DataFrame, batch_count: int):
"""
Applies lyrics preprocessing to a training batch
Parameters
----------
batch : pd.dataframe
Dataframe containing the batch data.
batch_count : int
Batch count value.
Returns
-------
lyric_list : list
List of loaded lyrics in string form.
"""
lyric_preprocessor = LyricsPreprocessor()
lyric_list = []
count, batch_length = 1, len(batch)
print(f"Preprocessing training data with length {batch_length}\n")
for row in batch.itertuples():
print(f"Batch {batch_count} - {count}/{batch_length}")
# Preprocess lyric and append to lyric list
processed_lyric = lyric_preprocessor(lyrics=row.lyrics)
lyric_list.append(processed_lyric)
count += 1
return lyric_list
def single_preprocessing(audio, lyric: str):
"""
Preprocesses a single record of audio and lyric data
Parameters
----------
audio : audio_object
Audio object file
lyric : string
Lyric string
Returns
-------
processed_song : tensor
Tensor version of the audio
processed_lyric : string
Lyric string
"""
# Instantiate preprocessor classes
audio_preprocessor = AudioPreprocessor(script="predict")
lyric_preprocessor = LyricsPreprocessor()
# Preprocess both song and lyrics
processed_song = audio_preprocessor(file=audio)
processed_lyric = lyric_preprocessor(lyrics=lyric)
return processed_song, processed_lyric
def dataset_read(batch_size=20):
"""
Reads the main dataset, splits it into the train/test/valid split, and computes
optimal number of samples per batch.
Parameters
----------
batch_size : int
Number of data per batch
Returns
-------
split: list[splits]
A collection of the three splits
split_lengths : list[int]
List of the split lengths
"""
dataset = pd.read_csv(DATASET_CSV)
train = dataset[dataset["split"] == "train"]
test = dataset[dataset["split"] == "test"]
val = dataset[dataset["split"] == "valid"]
# Find the minimum split size (ignoring empty splits)
min_split_size = min([len(train), len(test), len(val)])
# Clamp batch_size so it never exceeds the smallest split
effective_batch_size = min(batch_size, min_split_size if min_split_size > 0 else batch_size)
def make_splits(df, batch_size):
if len(df) == 0:
return []
n_splits = math.ceil(len(df) / batch_size)
return np.array_split(df, n_splits)
train_splits = make_splits(train, effective_batch_size)
test_splits = make_splits(test, effective_batch_size)
val_splits = make_splits(val, effective_batch_size)
splits = [train_splits, test_splits, val_splits]
split_lengths = [len(train), len(test), len(val)]
return splits, split_lengths |