File size: 7,726 Bytes
6fc89a8 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 |
# initialize tf/kera and/or whatver else you need here
import os as os
import pandas as pd
import numpy as np
import tensorflow as tf
tf.config.list_physical_devices("GPU")
from tensorflow.python import keras
from keras.layers import LSTM, Bidirectional
from keras.layers import Dense, Embedding
from keras.models import Sequential
from keras.preprocessing import sequence
import string
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from keras.utils import to_categorical
# ---- GPU setup: pin this process to a single physical GPU so several
# ---- experiment scripts can run concurrently, one per device. ----
# 0-based index of the physical GPU to claim (3 == the 4th GPU).
GPU_INDEX = 3

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        # Restrict TensorFlow to a single GPU.
        tf.config.set_visible_devices(gpus[GPU_INDEX], 'GPU')
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except (RuntimeError, IndexError) as e:
        # RuntimeError: visible devices must be set before GPUs have been
        # initialized.  IndexError: this machine has fewer than
        # GPU_INDEX + 1 GPUs (previously this crashed the script).
        print(e)
# ---- Character vocabulary ----
# Index 0 is 'E' (the pad / "empty" symbol), 1-26 are 'a'-'z',
# 27 is ' ' (stands in for punctuation), 28 is 'U' ("unknown").
chars = ['E'] + [chr(code) for code in range(ord('a'), ord('z') + 1)] + [' ', 'U']
id2char = dict(enumerate(chars))
char2id = {ch: idx for idx, ch in id2char.items()}


def name2id(name, l=10):
    """Encode *name* as a fixed-length list of `l` character ids.

    Characters past position `l` are dropped; unused trailing slots keep
    id 0 ('E', the pad symbol).  Punctuation maps to the ' ' id, and any
    character not in the vocabulary (digits, uppercase not present as a
    key, etc.) maps to the 'U' id.
    """
    ids = [0] * l
    for pos, ch in enumerate(name[:l]):
        if ch in string.punctuation:
            # No punctuation char is a vocabulary key, so it always
            # collapses to the ' ' id.
            ids[pos] = char2id[' ']
        else:
            ids[pos] = char2id.get(ch, char2id['U'])
    return ids
# Work out of the experiment-data directory.
# NOTE(review): the spelling "expermiment_data" looks like a typo but must
# match the directory that actually exists on disk — confirm before renaming.
os.chdir('/home/lstm/expermiment_data')
# directory containing experiment dataset files
data_directory = os.listdir('/home/lstm/expermiment_data')
# getting a list of the .csv data files in the directory
dataFiles = [file for file in data_directory if file.endswith('.csv')]
print("experiment files available: ", dataFiles)
# ----- Any Pre-Processing goes here ----
# ---- Create a file that captures the stats as they come off the models ----
# ------ Now Loop over Data Experiment files capturing stats and models as you go -----
# For now, hard-code the single experiment file to run; this deliberately
# overrides the dataFiles list discovered above.
# (Removed dead `dFile = dataFiles[0]` — the for-loop variable shadowed it.)
dataFiles = ["minorMulti_50.csv"]
for dFile in dataFiles:
    # ---- Per-experiment output paths, all derived from the data file stem ----
    stem = dFile.split(".")[0]
    # test-set metrics report for this run
    fileOut = "/home/lstm/test_results/" + stem + "_testLr3_5.csv"
    # saved model (tracked name lengths are encoded in the file name)
    modelOut = "/home/lstm/models/" + stem + "_Lr3_5.h5"
    # per-row validate-set predictions for this run
    vFile = "/home/lstm/validate_results/" + stem + "_validateLr3_5.csv"
    # full path of the input data file
    dFile = "/home/lstm/expermiment_data/" + dFile

    # Read in data and drop incomplete cases (just in case).
    df = pd.read_csv(dFile)
    df = df.dropna()
    df = df.astype({"lname": str, "fname": str})
    # sampling (disabled)
    # df = df.sample(frac=0.1)

    # Convert names to fixed-length id sequences: first name padded/cut to
    # 12 chars, last name to 16, concatenated into a 28-long feature vector.
    # Fold assignment: folds > 2 -> train, folds == 1 -> test, folds == 2 -> validate.
    X_train = [name2id(fn.lower(), l=12) + name2id(ln.lower(), l=16)
               for fn, ln in zip(df['fname'][df['folds'] > 2], df['lname'][df['folds'] > 2])]
    y_train = [int(i) for i in df['ethnicity'][df['folds'] > 2].tolist()]
    X_test = [name2id(fn.lower(), l=12) + name2id(ln.lower(), l=16)
              for fn, ln in zip(df['fname'][df['folds'] == 1], df['lname'][df['folds'] == 1])]
    y_test = [int(i) for i in df['ethnicity'][df['folds'] == 1].tolist()]
    X_validate = [name2id(fn.lower(), l=12) + name2id(ln.lower(), l=16)
                  for fn, ln in zip(df['fname'][df['folds'] == 2], df['lname'][df['folds'] == 2])]
    y_validate = [int(i) for i in df['ethnicity'][df['folds'] == 2].tolist()]

    num_words = len(id2char)  # vocabulary size for the Embedding layer
    feature_len = 28          # 12 (first name) + 16 (last name)
    batch_size = 512
    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')
    print(len(X_validate), 'validate sequences')
    print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
    X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
    X_validate = sequence.pad_sequences(X_validate, maxlen=feature_len)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    # BUG FIX: this line previously printed X_test.shape.
    print('X_validate shape:', X_validate.shape)

    num_classes = 2  # np.max(y_train) + 1
    print(num_classes, 'classes')
    print('Convert class vector to binary class matrix '
          '(for use with categorical_crossentropy)')
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)
    y_validate = to_categorical(y_validate, num_classes)
    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)
    print('y_validate shape:', y_validate.shape)

    # ---- Model: embedding -> 4 stacked bidirectional LSTMs -> softmax ----
    model = Sequential()
    model.add(Embedding(num_words, 256, input_length=feature_len))
    model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
    model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
    model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
    model.add(Bidirectional(LSTM(512, dropout=0.2)))
    model.add(Dense(num_classes, activation='softmax'))
    # Learning rate 10**-3.5 (earlier runs used 1e-3).
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=10 ** -3.5),
                  loss='categorical_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
    # Stop as soon as training loss improves by less than min_delta
    # (earlier min_delta was 0.0015).
    callback = tf.keras.callbacks.EarlyStopping(
        mode='min', monitor='loss', patience=1, min_delta=.001)
    # Train; the test fold doubles as the validation split for monitoring.
    model.fit(X_train, y_train, batch_size=batch_size, epochs=10,
              validation_data=(X_test, y_test), verbose=1, callbacks=[callback])

    # ---- Test-set predictions; append report + confusion matrix to fileOut ----
    y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)
    y_pred_bool = np.argmax(y_pred, axis=1)
    y_test_labels = np.argmax(y_test, axis=1)
    # BUG FIX: open the results file once per section instead of leaking a
    # new unclosed handle for every print(..., file=open(...)).
    with open(fileOut, "a") as report:
        print("test results", file=report)
        print(classification_report(y_test_labels, y_pred_bool), file=report)
        print(confusion_matrix(y_test_labels, y_pred_bool), file=report)

    # ---- Validate-set predictions; append to the same report file ----
    y_pred_validate = model.predict(X_validate, batch_size=batch_size, verbose=1)
    y_pred_bool_validate = np.argmax(y_pred_validate, axis=1)
    y_validate_labels = np.argmax(y_validate, axis=1)
    with open(fileOut, "a") as report:
        print("validate results", file=report)
        print(classification_report(y_validate_labels, y_pred_bool_validate), file=report)
        print(confusion_matrix(y_validate_labels, y_pred_bool_validate), file=report)

    # Save the model and move to the next experiment file.
    model.save(modelOut, include_optimizer=False)
    # Persist per-row validate predictions (PredictScore = softmax score of class 1).
    data2 = {'Predict': y_pred_bool_validate,
             'Actual': y_validate_labels,
             'PredictScore': y_pred_validate[:, 1]}
    df_data_val2 = pd.DataFrame(data2)
    df_data_val2.to_csv(vFile, index=False)
|