File size: 2,703 Bytes
fc7b4a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
import pandas as pd
import numpy as np

from src.preprocessing.audio_preprocessor import AudioPreprocessor
from src.preprocessing.lyrics_preprocessor import LyricsPreprocessor
from src.utils.config_loader import DATASET_CSV


def bulk_preprocessing(batch: pd.DataFrame, batch_count: int):
    """
    Runs the audio and lyrics preprocessors over every row of a training batch

    Parameters
    ----------
    batch : pd.DataFrame
        Batch rows. Each row is read through its `directory`, `skip_time`
        and `lyrics` fields.

    batch_count : int
        Batch number, used only in the progress messages.

    Returns
    -------
    audio_list : list
        One AudioPreprocessor result per row, in row order.

    lyric_list : list
        One LyricsPreprocessor result per row, in row order.
    """

    # Build both preprocessors once and reuse them for every row
    preprocess_audio = AudioPreprocessor(script="train")
    preprocess_lyrics = LyricsPreprocessor()

    songs, lyrics = [], []
    total_rows = len(batch)

    print(f"Preprocessing training data with length {total_rows}\n")

    # Positions are 1-based so the progress line reads "1/N" ... "N/N"
    for position, record in enumerate(batch.itertuples(), start=1):
        print(f"Batch {batch_count}     -    {position}/{total_rows}")

        # Audio first, then lyrics, for the same row
        songs.append(
            preprocess_audio(
                file=record.directory,
                skip_time=record.skip_time,
                train=True,
            )
        )
        lyrics.append(preprocess_lyrics(lyrics=record.lyrics))

    return songs, lyrics


def single_preprocessing(audio, lyric: str):
    """
    Preprocesses one audio/lyric pair for prediction

    Parameters
    ----------
    audio : audio_object
        Audio input, forwarded to AudioPreprocessor as `file`

    lyric : string
        Raw lyric text, forwarded to LyricsPreprocessor as `lyrics`

    Returns
    -------
    processed_song : tensor
        Result of the audio preprocessor (documented upstream as a tensor;
        the exact type is defined by AudioPreprocessor)

    processed_lyric : string
        Result of the lyrics preprocessor
    """
    # Both preprocessors are created up front, before any data is touched
    song_pipeline = AudioPreprocessor(script="predict")
    lyric_pipeline = LyricsPreprocessor()

    # Tuple elements are evaluated left to right: audio first, then lyrics
    return song_pipeline(file=audio), lyric_pipeline(lyrics=lyric)


def dataset_read(batch_size=20):
    """
    Reads the dataset csv file and splits it into batches

    Parameters
    ----------
    batch_size : int, optional
        Number of batches the dataset is split into (NOT the number of rows
        per batch). Defaults to 20. Passed to `np.array_split`, so a
        non-positive value raises ValueError.

    Returns
    -------
    data_splits : list
        List of dataframes acting as batches. Sizes follow `np.array_split`:
        when the row count is not evenly divisible, the first batches hold
        one extra row. Each batch keeps the original index labels.

    label : list
        Values of the 'target' column for the whole dataset (real/fake
        labels, in the form of 0 and 1)
    """
    dataset = pd.read_csv(DATASET_CSV)
    label = dataset['target'].tolist()

    # Split the row positions rather than the DataFrame itself: calling
    # np.array_split directly on a DataFrame goes through the deprecated
    # DataFrame.swapaxes (FutureWarning since pandas 2.1). Splitting an
    # integer range yields exactly the same chunk boundaries.
    row_positions = np.array_split(np.arange(len(dataset)), batch_size)
    data_splits = [dataset.iloc[positions] for positions in row_positions]

    return data_splits, label