File size: 3,302 Bytes
fc7b4a9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import logging
from pathlib import Path

import joblib
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Configure the root logger at import time: INFO level, timestamped records.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Module-scoped logger used by the functions in this module.
logger = logging.getLogger(__name__)


def dataset_splitter(
    X: np.ndarray,
    Y: np.ndarray,
    *,
    test_size: float = 0.1,
    val_size: float = 0.2222,
    random_state: int = 42,
):
    """
    Split the X and Y values into stratified train, validation, and test splits.

    The test split is carved off the full dataset first; the validation split
    is then taken from the samples that remain. With the defaults (0.1 of the
    data for test, then 0.2222 of the remaining 0.9 for validation) this gives
    roughly a 70/20/10 train/val/test partition.

    Parameters
    ----------
    X : np.array
        Array of feature vectors
    Y : np.array
        Array of labels (real or fake), one per row of X
    test_size : float, optional
        Fraction of the full dataset held out as the test split (default 0.1).
    val_size : float, optional
        Fraction of the non-test remainder held out as the validation split
        (default 0.2222).
    random_state : int, optional
        Seed passed to both splits so the partition is reproducible (default 42).

    Returns
    -------
    data : dict{np.array}
        A dictionary with keys "train", "val" and "test", each mapping to an
        (X, y) tuple for that split.
    """

    logger.info(f"Dataset shape: {X.shape}, Labels: {len(Y)}")

    # np.unique works for any label dtype (ints, floats, strings), whereas
    # np.bincount raises on anything but non-negative integers and would make
    # a purely diagnostic log line crash the split.
    classes, counts = np.unique(Y, return_counts=True)
    distribution = dict(zip(classes.tolist(), counts.tolist()))
    logger.info(f"Class distribution: {distribution}")

    # First carve off the test split, stratified on the labels.
    X_train, X_test, y_train, y_test = train_test_split(
        X, Y, test_size=test_size, random_state=random_state, stratify=Y
    )

    # Then take the validation split out of what is left (val_size is relative
    # to the remaining samples, not to the full dataset).
    X_train, X_val, y_train, y_val = train_test_split(
        X_train,
        y_train,
        test_size=val_size,
        random_state=random_state,
        stratify=y_train,
    )

    logger.info(f"Train: {X_train.shape}, Validation: {X_val.shape}, Test: {X_test.shape}")

    data = {
        "train": (X_train, y_train),
        "val": (X_val, y_val),
        "test": (X_test, y_test),
    }

    return data


def dataset_scaler(audio: np.ndarray, lyrics: np.ndarray):
    """
    Method to scale both audio and lyric vectors using Z-Score.
    Each feature column is standardised to zero mean and unit variance, so the
    audio and lyric features end up on comparable ranges.

    The fitted scalers are also saved through joblib under ``models/fusion/``
    (the directory is created if it does not exist) so that they can be loaded
    again at prediction time.

    Parameters
    ----------
    audio : np.array
        Array of audio features, one row per sample
    lyrics : np.array
        Array of lyric features, one row per sample

    Returns
    -------
    scaled_audio : np.array
        Array of scaled audio features
    scaled_lyrics : np.array
        Array of scaled lyric features
    """

    # One independent scaler per modality; fit_transform is equivalent to
    # fit(...) followed by transform(...) on the same data.
    audio_scaler = StandardScaler()
    lyric_scaler = StandardScaler()

    scaled_audio = audio_scaler.fit_transform(audio)
    scaled_lyrics = lyric_scaler.fit_transform(lyrics)

    # joblib.dump does not create missing parent directories, so make sure the
    # target folder exists before persisting the scalers.
    Path("models/fusion").mkdir(parents=True, exist_ok=True)

    # Save the trained scalers for prediction
    joblib.dump(audio_scaler, "models/fusion/audio_scaler.pkl")
    joblib.dump(lyric_scaler, "models/fusion/lyric_scaler.pkl")

    return scaled_audio, scaled_lyrics


def instance_scaler(audio: np.ndarray, lyrics: np.ndarray):
    """
    Method to scale the single input audio and lyrics using the scalers that
    were previously saved under ``models/fusion/``.

    Both inputs are promoted to 2-D before scaling, so each may be given either
    as a flat feature vector of shape (n_features,) or as an already-batched
    array of shape (1, n_features).

    Parameters
    ----------
    audio : np.array
        Instance of an audio feature
    lyrics : np.array
        Instance of a lyric feature

    Returns
    -------
    scaled_audio : np.array
        2-D array of scaled audio feature
    scaled_lyrics : np.array
        2-D array of scaled lyric feature
    """

    # NOTE: joblib.load unpickles the file; only load scaler files that were
    # produced by this project, never files from an untrusted source.
    audio_scaler = joblib.load("models/fusion/audio_scaler.pkl")
    lyric_scaler = joblib.load("models/fusion/lyric_scaler.pkl")

    # StandardScaler.transform requires 2-D input. np.atleast_2d turns a flat
    # vector into a single-row batch and leaves 2-D input untouched, so audio
    # and lyrics are handled the same way regardless of how the caller shaped
    # them.
    scaled_audio = audio_scaler.transform(np.atleast_2d(audio))
    scaled_lyrics = lyric_scaler.transform(np.atleast_2d(lyrics))

    return scaled_audio, scaled_lyrics