Commit
·
6fc89a8
1
Parent(s):
eda8b78
Upload hispMaxDupFullData_fit.py
Browse files- hispMaxDupFullData_fit.py +189 -0
hispMaxDupFullData_fit.py
ADDED
|
@@ -0,0 +1,189 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# initialize tf/kera and/or whatver else you need here
|
| 2 |
+
import os as os
|
| 3 |
+
import pandas as pd
|
| 4 |
+
import numpy as np
|
| 5 |
+
import tensorflow as tf
|
| 6 |
+
tf.config.list_physical_devices("GPU")
|
| 7 |
+
from tensorflow.python import keras
|
| 8 |
+
from keras.layers import LSTM, Bidirectional
|
| 9 |
+
from keras.layers import Dense, Embedding
|
| 10 |
+
from keras.models import Sequential
|
| 11 |
+
from keras.preprocessing import sequence
|
| 12 |
+
import string
|
| 13 |
+
from sklearn.metrics import classification_report, confusion_matrix
|
| 14 |
+
from sklearn.model_selection import train_test_split
|
| 15 |
+
from sklearn.feature_extraction.text import CountVectorizer
|
| 16 |
+
from keras.utils import to_categorical
|
| 17 |
+
|
| 18 |
+
|
| 19 |
+
# Pin this process to a single GPU so parallel experiment runs can each
# claim their own device (specify the index to choose which one).
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to only use the GPU at index 3 (the 4th physical
    # GPU — the original comment said "3rd", which did not match the index).
    try:
        tf.config.set_visible_devices(gpus[3], 'GPU')
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except (RuntimeError, IndexError) as e:
        # RuntimeError: visible devices must be set before GPUs are initialized.
        # IndexError: fewer than 4 GPUs on this machine (gpus[3] out of range).
        print(e)
|
| 30 |
+
|
| 31 |
+
# Character vocabulary for encoding names: id 0 ('E') is the pad/empty
# marker, ids 1-26 are lowercase a-z, id 27 is space (punctuation also maps
# here), and id 28 ('U') is the catch-all "unknown" for anything else
# (digits, uppercase, non-ASCII, ...).
chars = ['E'] + [chr(i) for i in range(97, 123)] + [' ', 'U']
id2char = {i: j for i, j in enumerate(chars)}
char2id = {j: i for i, j in enumerate(chars)}


def name2id(name, l=10):
    """Encode a name as a fixed-length list of character ids.

    Lowercase letters and spaces map to their vocabulary id, punctuation
    maps to the space id, and any other character (digits, uppercase,
    non-ASCII) maps to the unknown id 'U'.  Names longer than `l` are
    truncated; shorter names are right-padded with 0 ('E').

    Args:
        name: string to encode (callers are expected to lowercase it first).
        l: fixed output length.

    Returns:
        A list of `l` integer character ids.
    """
    ids = [0] * l
    for i, c in enumerate(name[:l]):
        if c in string.punctuation:
            # Punctuation is never in the vocabulary, so the get() always
            # falls back to the space id.
            ids[i] = char2id.get(c, char2id[' '])
        else:
            # Letters and spaces hit the dict directly; everything else
            # falls back to the unknown marker 'U'.  (The original code had
            # a separate isalpha() branch whose body was identical to this
            # else branch — the two are merged here.)
            ids[i] = char2id.get(c, char2id['U'])
    return ids
|
| 48 |
+
|
| 49 |
+
|
| 50 |
+
# Work from the experiment data directory.  NOTE(review): the directory name
# is spelled 'expermiment_data' on disk — kept as-is to match the filesystem.
os.chdir('/home/lstm/expermiment_data')
# Entries in the directory containing the experiment dataset files.
data_directory = os.listdir('/home/lstm/expermiment_data')


# Keep only the CSV dataset files (loop variable renamed from 'file', which
# shadowed the builtin of the same name).
dataFiles = [fname for fname in data_directory if fname.endswith('.csv')]
print("experiment files available: ", dataFiles)

# ----- Any Pre-Processing goes here ----

# ---- Create a file that captures the stats as they come off the models ----
# ------ Now Loop over Data Expertiment files capturing stats and models as you go -----
# For now the discovered list is overridden with a single hardcoded file.
# (The dead 'dFile = dataFiles[0]' assignment was removed: the for loop
# below rebinds dFile immediately.)
dataFiles = ["minorMulti_50.csv"]
|
| 66 |
+
for dFile in dataFiles:

    # Derive per-experiment output paths from the dataset file's stem.
    stem = dFile.split(".")[0]
    # Test-set metrics are appended to this file.
    fileOut = "/home/lstm/test_results/" + stem + "_testLr3_5.csv"
    # Trained model checkpoint (suffix tracks the learning-rate variant).
    modelOut = "/home/lstm/models/" + stem + "_Lr3_5.h5"
    # Per-row validate-set predictions are written here.
    vFile = "/home/lstm/validate_results/" + stem + "_validateLr3_5.csv"
    # Full path of the input dataset.
    dFile = "/home/lstm/expermiment_data/" + dFile

    # Read in data and drop incomplete cases (just in case).
    df = pd.read_csv(dFile)
    df = df.dropna()
    df = df.astype({"lname": str, "fname": str})
    # Optional subsampling for quick runs:
    # df = df.sample(frac=0.1)

    # Convert names to the numeric representation and split on the 'folds'
    # column: train (folds > 2), test (folds == 1), validate (folds == 2).
    # Each row encodes 12 first-name + 16 last-name character ids.
    X_train = [name2id(fn.lower(), l=12) + name2id(ln.lower(), l=16)
               for fn, ln in zip(df['fname'][df['folds'] > 2], df['lname'][df['folds'] > 2])]
    y_train = [int(i) for i in df['ethnicity'][df['folds'] > 2].tolist()]

    X_test = [name2id(fn.lower(), l=12) + name2id(ln.lower(), l=16)
              for fn, ln in zip(df['fname'][df['folds'] == 1], df['lname'][df['folds'] == 1])]
    y_test = [int(i) for i in df['ethnicity'][df['folds'] == 1].tolist()]

    X_validate = [name2id(fn.lower(), l=12) + name2id(ln.lower(), l=16)
                  for fn, ln in zip(df['fname'][df['folds'] == 2], df['lname'][df['folds'] == 2])]
    y_validate = [int(i) for i in df['ethnicity'][df['folds'] == 2].tolist()]

    # Model hyperparameters: vocabulary size, input width (12 + 16 ids),
    # and training batch size.
    num_words = len(id2char)
    feature_len = 28
    batch_size = 512

    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')
    print(len(X_validate), 'validate sequences')

    print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
    X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
    X_validate = sequence.pad_sequences(X_validate, maxlen=feature_len)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    # BUG FIX: this line previously printed X_test.shape for the validate set.
    print('X_validate shape:', X_validate.shape)

    num_classes = 2  # np.max(y_train) + 1
    print(num_classes, 'classes')

    print('Convert class vector to binary class matrix '
          '(for use with categorical_crossentropy)')
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)
    y_validate = to_categorical(y_validate, num_classes)

    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)
    print('y_validate shape:', y_validate.shape)

    # Build a 4-layer bidirectional LSTM classifier over character ids.
    model = Sequential()
    model.add(Embedding(num_words, 256, input_length=feature_len))
    model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
    model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
    model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
    model.add(Bidirectional(LSTM(512, dropout=0.2)))
    model.add(Dense(num_classes, activation='softmax'))

    # Learning rate is 10**-3.5 (an earlier run used 1e-3).
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=10 ** -3.5),
                  loss='categorical_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

    # Stop early once training loss stops improving by at least .001 per
    # epoch (an earlier run used min_delta=0.0015).
    callback = tf.keras.callbacks.EarlyStopping(
        mode='min', monitor='loss', patience=1, min_delta=.001)

    # Train for up to 10 epochs, evaluating the test split each epoch.
    # (The original comment claimed 15 epochs, contradicting the code.)
    model.fit(X_train, y_train, batch_size=batch_size, epochs=10,
              validation_data=(X_test, y_test), verbose=1, callbacks=[callback])

    # Predictions on the test split.
    y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)
    y_pred_bool = np.argmax(y_pred, axis=1)

    # Predictions on the validate split.
    y_pred_validate = model.predict(X_validate, batch_size=batch_size, verbose=1)
    y_pred_bool_validate = np.argmax(y_pred_validate, axis=1)

    # Append performance reports for both splits.  A single context manager
    # replaces the original's six print(..., file=open(fileOut, "a")) calls,
    # which leaked file handles; the written content and order are unchanged.
    with open(fileOut, "a") as report:
        print("test results", file=report)
        print(classification_report(np.argmax(y_test, axis=1),
                                    y_pred_bool), file=report)
        print(confusion_matrix(np.argmax(y_test, axis=1),
                               y_pred_bool), file=report)
        print("validate results", file=report)
        print(classification_report(np.argmax(y_validate, axis=1),
                                    y_pred_bool_validate), file=report)
        print(confusion_matrix(np.argmax(y_validate, axis=1),
                               y_pred_bool_validate), file=report)

    # Save the trained model and move to the next experiment file.
    model.save(modelOut, include_optimizer=False)

    # Persist per-row validate predictions; the class-1 probability is kept
    # as 'PredictScore' for downstream threshold/ROC analysis.
    data2 = {'Predict': y_pred_bool_validate,
             'Actual': np.argmax(y_validate, axis=1),
             'PredictScore': y_pred_validate[:, 1]}
    df_data_val2 = pd.DataFrame(data2)
    df_data_val2.to_csv(vFile, index=False)
|
| 189 |
+
|