import os

# GPU use via PlaidML: KERAS_BACKEND is read when Keras is first imported,
# so it must be set before any Keras import to take effect (note that
# tensorflow.keras, as imported below, does not consult this variable; it
# applies to the standalone multi-backend keras package).
os.environ["KERAS_BACKEND"] = "plaidml.keras.backend"

import pickle

import pandas as pd
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

import config
import preprocessing as pp
import features as f
import data_cleaning as data_clean
from lstm_model import my_LSTM


def run_training(model: str) -> None:
"""
Training our Machine Learning model and serializing to disc
"""
    # read the train and test data
    df_train = pd.read_csv(config.ORIGINAL_TRAIN)
    df_test = pd.read_csv(config.TEST_DATA)

    # relabel mislabeled samples
    df_train = data_clean.relabel_target(df_train)

    # shuffle the training data
    df_train = df_train.sample(frac=1, random_state=42).reset_index(drop=True)

    # clean the text
    df_train[config.CLEANED_TEXT] = df_train[config.TEXT].apply(pp.clean_tweet)
    df_test[config.CLEANED_TEXT] = df_test[config.TEXT].apply(pp.clean_tweet)

    # save the modified train and test data
    df_train.to_csv(config.MODIFIED_TRAIN, index=False)
    df_test.to_csv(config.MODIFIED_TEST, index=False)
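    # df_test is only cleaned and persisted above, so free the memory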
    del df_test

    # convert text to a numerical representation
    tokenizer = Tokenizer(oov_token="<unk>")
    tokenizer.fit_on_texts(df_train[config.CLEANED_TEXT])
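    # NOTE: words never seen by fit_on_texts are mapped to the "<unk>"
    # index by texts_to_sequences, so the tokenizer pickled below must be
    # reused when preparing new text for this model.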

    # directory where the model artifacts are saved
    model_path = f"{config.MODEL_DIR}/PRETRAIN_WORD2VEC_{model}/"

    # create the output folder if it does not exist
    os.makedirs(model_path, exist_ok=True)

    # serialize the fitted tokenizer alongside the model
    with open(f"{model_path}tokenizer.pkl", "wb") as handle:
        pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)
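    # (see the inference sketch after this function for how the tokenizer
    # and model are reloaded together)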

    # pad the tokenized sequences to a fixed length
    X_padded = pad_sequences(
        tokenizer.texts_to_sequences(df_train[config.CLEANED_TEXT].values),
        maxlen=config.MAXLEN,
    )
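    # NOTE: pad_sequences pads and truncates at the start of each sequence
    # by default (padding="pre", truncating="pre"), so tweets longer than
    # config.MAXLEN lose their leading tokens.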

    # build the embedding layer from the pretrained Word2Vec vectors
    embedding_matrix = f.get_word2vec_enc(tokenizer.word_index.items(),
                                          config.PRETRAINED_WORD2VEC)
    embedding_layer = Embedding(input_dim=config.VOCAB_SIZE,
                                output_dim=config.EMBED_SIZE,
                                weights=[embedding_matrix],
                                input_length=config.MAXLEN,
                                trainable=False)
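    # NOTE: trainable=False keeps the pretrained Word2Vec weights frozen
    # during training; this assumes config.VOCAB_SIZE is at least
    # len(tokenizer.word_index) + 1 so every token index has a row in
    # embedding_matrix.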

    # target values
    y = df_train[config.RELABELED_TARGET].values

    # train a single model
    clf = my_LSTM(embedding_layer)
    clf.fit(X_padded, y,
            epochs=config.N_EPOCHS,
            verbose=1)
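    # NOTE: the model is fit on the full training set with no validation
    # data; pass validation_split (or a hold-out set) to fit() to monitor
    # generalization.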

    # persist the trained model
    clf.save(f"{model_path}{model}_Word2Vec.h5")
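
# A minimal inference-time sketch (assumptions: the same config module is
# available and the model was trained with the "LSTM" name used below):
#
#     from tensorflow.keras.models import load_model
#
#     model_path = f"{config.MODEL_DIR}/PRETRAIN_WORD2VEC_LSTM/"
#     with open(f"{model_path}tokenizer.pkl", "rb") as handle:
#         tokenizer = pickle.load(handle)
#     clf = load_model(f"{model_path}LSTM_Word2Vec.h5")
#     X_new = pad_sequences(tokenizer.texts_to_sequences(texts),
#                           maxlen=config.MAXLEN)
#     preds = clf.predict(X_new)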

if __name__ == "__main__":
    run_training("LSTM")