niveabekal04 committed on
Commit
6fc89a8
·
1 Parent(s): eda8b78

Upload hispMaxDupFullData_fit.py

Browse files
Files changed (1) hide show
  1. hispMaxDupFullData_fit.py +189 -0
hispMaxDupFullData_fit.py ADDED
@@ -0,0 +1,189 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # initialize tf/keras and/or whatever else you need here
2
+ import os as os
3
+ import pandas as pd
4
+ import numpy as np
5
+ import tensorflow as tf
6
+ tf.config.list_physical_devices("GPU")
7
+ from tensorflow.python import keras
8
+ from keras.layers import LSTM, Bidirectional
9
+ from keras.layers import Dense, Embedding
10
+ from keras.models import Sequential
11
+ from keras.preprocessing import sequence
12
+ import string
13
+ from sklearn.metrics import classification_report, confusion_matrix
14
+ from sklearn.model_selection import train_test_split
15
+ from sklearn.feature_extraction.text import CountVectorizer
16
+ from keras.utils import to_categorical
17
+
18
+
19
# Pin this process to a single GPU so several experiment scripts can run
# side by side, one per device (change the index to target another GPU).
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    # Restrict TensorFlow to the GPU at index 3 (i.e. the 4th physical GPU).
    try:
        tf.config.set_visible_devices(gpus[3], 'GPU')
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except (RuntimeError, IndexError) as e:
        # RuntimeError: visible devices must be set before GPUs are initialized.
        # IndexError: the machine exposes fewer than 4 physical GPUs.
        print(e)
30
+
31
# Character vocabulary: index 0 is the 'E' marker, 1-26 are 'a'-'z',
# then the space id and 'U' (unknown).  Two tables map both directions.
chars = ['E'] + [chr(i) for i in range(97, 123)] + [' ', 'U']
id2char = dict(enumerate(chars))
char2id = {c: i for i, c in enumerate(chars)}


def name2id(name, l = 10):
    """Encode `name` as a fixed-length list of `l` character ids.

    Characters beyond position `l` are dropped and unused trailing slots
    stay 0.  Alphabetic characters map through `char2id` (letters outside
    the table -> 'U'), punctuation maps to the space id, and anything
    else falls back to 'U'.
    """
    encoded = [0] * l
    for pos, ch in zip(range(l), name):
        if ch.isalpha():
            encoded[pos] = char2id.get(ch, char2id['U'])
        elif ch in string.punctuation:
            encoded[pos] = char2id.get(ch, char2id[' '])
        else:
            encoded[pos] = char2id.get(ch, char2id['U'])
    return encoded
48
+
49
+
50
# All experiment dataset files live here; make it the working directory
# and grab its listing.
experiment_dir = '/home/lstm/expermiment_data'
os.chdir(experiment_dir)
data_directory = os.listdir(experiment_dir)


# Keep only the CSV experiment files from the directory listing.
dataFiles = [file for file in data_directory if file.endswith('.csv')]
print("experiment files available: ", dataFiles)
58
+
59
+ # ----- Any Pre-Processing goes here ----
60
+
61
+ # ---- Create a file that captures the stats as they come off the models ----
62
+ # ------ Now Loop over Data Experiment files capturing stats and models as you go -----
63
# For now each experiment file is hard coded; list more CSVs to batch them.
dataFiles = ["minorMulti_50.csv"]
for dFile in dataFiles:

    base = dFile.split(".")[0]
    # Per-experiment output locations derived from the input file name:
    # test-set stats, saved model, and validate-set predictions.
    fileOut = "".join(["/home/lstm/test_results/", base, "_testLr3_5.csv"])
    modelOut = "".join(["/home/lstm/models/", base, "_Lr3_5.h5"])
    vFile = "".join(["/home/lstm/validate_results/", base, "_validateLr3_5.csv"])
    # Full path of the input data file.
    dFile = "".join(["/home/lstm/expermiment_data/", dFile])

    # Read in data and drop incomplete cases (just in case).
    df = pd.read_csv(dFile)
    df = df.dropna()
    df = df.astype({"lname": str, "fname": str})
    # Optional subsampling for quick smoke runs:
    # df = df.sample(frac=0.1)

    # Convert names to numeric ids: first name padded/truncated to 12 chars,
    # last name to 16, concatenated into one 28-long sequence.
    # Fold split: train = folds > 2, test = fold 1, validate = fold 2.
    X_train = [name2id(fn.lower(), l=12) + name2id(ln.lower(), l=16)
               for fn, ln in zip(df['fname'][df['folds'] > 2], df['lname'][df['folds'] > 2])]
    y_train = [int(i) for i in df['ethnicity'][df['folds'] > 2].tolist()]

    X_test = [name2id(fn.lower(), l=12) + name2id(ln.lower(), l=16)
              for fn, ln in zip(df['fname'][df['folds'] == 1], df['lname'][df['folds'] == 1])]
    y_test = [int(i) for i in df['ethnicity'][df['folds'] == 1].tolist()]

    X_validate = [name2id(fn.lower(), l=12) + name2id(ln.lower(), l=16)
                  for fn, ln in zip(df['fname'][df['folds'] == 2], df['lname'][df['folds'] == 2])]
    y_validate = [int(i) for i in df['ethnicity'][df['folds'] == 2].tolist()]

    # Model hyper-parameters.
    num_words = len(id2char)   # character vocabulary size
    feature_len = 28           # 12 first-name + 16 last-name ids
    batch_size = 512

    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')
    print(len(X_validate), 'validate sequences')

    print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
    X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
    X_validate = sequence.pad_sequences(X_validate, maxlen=feature_len)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    # Fixed: this line previously printed X_test.shape under the wrong label.
    print('X_validate shape:', X_validate.shape)

    num_classes = 2  # np.max(y_train) + 1
    print(num_classes, 'classes')

    print('Convert class vector to binary class matrix '
          '(for use with categorical_crossentropy)')
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)
    y_validate = to_categorical(y_validate, num_classes)

    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)
    print('y_validate shape:', y_validate.shape)

    # Build the model: embedding + 4 stacked bidirectional LSTMs + softmax head.
    model = Sequential()
    model.add(Embedding(num_words, 256, input_length=feature_len))
    model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
    model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
    model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
    model.add(Bidirectional(LSTM(512, dropout=0.2)))
    model.add(Dense(num_classes, activation='softmax'))

    # Learning rate 10^-3.5 (an earlier run used 1e-3).
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=10 ** -3.5),
                  loss='categorical_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

    # Stop early once training loss improves by less than .001
    # (an earlier run used min_delta=.0015).
    callback = tf.keras.callbacks.EarlyStopping(
        mode='min', monitor='loss', patience=1, min_delta=.001)

    # Train for at most 10 epochs, monitoring the test split each epoch.
    model.fit(X_train, y_train, batch_size=batch_size, epochs=10,
              validation_data=(X_test, y_test), verbose=1, callbacks=[callback])

    # Predictions on the test split.
    y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)
    y_pred_bool = np.argmax(y_pred, axis=1)

    # Append test-set performance to the stats file.  A single `with` block
    # replaces the repeated unclosed open(fileOut, "a") calls (handle leak).
    with open(fileOut, "a") as stats:
        print("test results", file=stats)
        print(classification_report(np.argmax(y_test, axis=1),
                                    y_pred_bool), file=stats)
        print(confusion_matrix(np.argmax(y_test, axis=1),
                               y_pred_bool), file=stats)

    # Predictions on the validate split.
    y_pred_validate = model.predict(X_validate, batch_size=batch_size, verbose=1)
    y_pred_bool_validate = np.argmax(y_pred_validate, axis=1)

    # Append validate-set performance to the same stats file.
    with open(fileOut, "a") as stats:
        print("validate results", file=stats)
        print(classification_report(np.argmax(y_validate, axis=1),
                                    y_pred_bool_validate), file=stats)
        print(confusion_matrix(np.argmax(y_validate, axis=1),
                               y_pred_bool_validate), file=stats)

    # Save the model (without optimizer state) and move to the next file.
    model.save(modelOut, include_optimizer=False)

    # Persist per-record validate predictions for later analysis.
    data2 = {'Predict': y_pred_bool_validate,
             'Actual': np.argmax(y_validate, axis=1),
             'PredictScore': y_pred_validate[:, 1]}
    df_data_val2 = pd.DataFrame(data2)
    df_data_val2.to_csv(vFile, index=False)