File size: 2,753 Bytes
9e2ba5f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.metrics import f1_score
from tensorflow.keras.layers import Embedding
from sklearn.model_selection import StratifiedKFold
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import config
import preprocessing as pp
import features as f
import data_cleaning as data_clean
from lstm_model import my_LSTM

# GPU use: request the PlaidML backend through the KERAS_BACKEND variable.
# NOTE(review): this runs after the tensorflow.keras imports above - confirm
# the setting is actually picked up at this point.
os.environ.update({"KERAS_BACKEND": "plaidml.keras.backend"})

def run_training(model: str) -> None:
    """
    Train the LSTM classifier on pretrained Word2Vec embeddings and
    serialize the fitted tokenizer and model to disk.

    The function reads the train/test CSVs named in ``config``, relabels and
    shuffles the training set, cleans the tweet text of both sets, writes the
    modified CSVs back out, fits a Keras ``Tokenizer`` on the cleaned training
    text, and trains ``my_LSTM`` on the padded sequences with a frozen
    embedding layer.

    Args:
        model: Label used only for naming the output artifacts: the directory
            ``<MODEL_DIR>/PRETRAIN_WORD2VEC_<model>`` and the weights file
            ``<model>_Word2Vec.h5``. It does not select an architecture;
            ``my_LSTM`` is always the network that is built.

    Side effects:
        Writes ``config.MODIFIED_TRAIN`` and ``config.MODIFIED_TEST``, and
        creates the model directory containing ``tokenizer.pkl`` and the
        saved model file.
    """
    # read train and test data
    df_train = pd.read_csv(config.ORIGINAL_TRAIN)
    df_test = pd.read_csv(config.TEST_DATA)

    # relabel mislabeled samples
    df_train = data_clean.relabel_target(df_train)

    # shuffle data; the fixed seed keeps the row order the same across runs
    df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)

    # clean the text
    df_train[config.CLEANED_TEXT] = df_train[config.TEXT].apply(pp.clean_tweet)
    df_test[config.CLEANED_TEXT] = df_test[config.TEXT].apply(pp.clean_tweet)

    # save the modified train and test data
    df_train.to_csv(config.MODIFIED_TRAIN, index=False)
    df_test.to_csv(config.MODIFIED_TEST, index=False)
    # the test frame is not used for training; drop the reference early
    del df_test

    # convert text to numerical representation
    train_texts = df_train[config.CLEANED_TEXT]
    tokenizer = Tokenizer(oov_token="<unk>")
    tokenizer.fit_on_texts(train_texts)

    # directory that receives the tokenizer and the trained model;
    # exist_ok avoids the check-then-create race of a separate exists() test
    model_dir = os.path.join(config.MODEL_DIR, f"PRETRAIN_WORD2VEC_{model}")
    os.makedirs(model_dir, exist_ok=True)

    # saving tokenizer so the same word -> id mapping can be reloaded later
    with open(os.path.join(model_dir, "tokenizer.pkl"), "wb") as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

    # pad the sequences
    X_padded = pad_sequences(
        tokenizer.texts_to_sequences(train_texts.values),
        maxlen=config.MAXLEN,
    )

    # get the pretrained word embeddings and prepare embedding layer
    embedding_matrix = f.get_word2vec_enc(
        tokenizer.word_index.items(), config.PRETRAINED_WORD2VEC
    )
    # NOTE(review): input_dim comes from config.VOCAB_SIZE while the tokenizer
    # is fitted without a num_words cap - confirm embedding_matrix has
    # VOCAB_SIZE rows and that every token id is below VOCAB_SIZE.
    embedding_layer = Embedding(
        input_dim=config.VOCAB_SIZE,
        output_dim=config.EMBED_SIZE,
        weights=[embedding_matrix],
        input_length=config.MAXLEN,
        trainable=False,  # keep the pretrained vectors frozen
    )

    # target values
    y = df_train[config.RELABELED_TARGET].values

    # train a single model
    clf = my_LSTM(embedding_layer)
    clf.fit(X_padded, y, epochs=config.N_EPOCHS, verbose=1)

    # persist the model
    clf.save(os.path.join(model_dir, f"{model}_Word2Vec.h5"))

if __name__ == "__main__":
    # Script entry point: run the training pipeline under the "LSTM" label.
    run_training(model="LSTM")