|
|
|
|
|
import os as os |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
import tensorflow as tf |
|
|
tf.config.list_physical_devices("GPU") |
|
|
from tensorflow.python import keras |
|
|
from keras.layers import LSTM, Bidirectional |
|
|
from keras.layers import Dense, Embedding |
|
|
from keras.models import Sequential |
|
|
from keras.preprocessing import sequence |
|
|
import string |
|
|
from sklearn.metrics import classification_report, confusion_matrix |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.feature_extraction.text import CountVectorizer |
|
|
from keras.utils import to_categorical |
|
|
|
|
|
|
|
|
|
|
|
# Index (into the list of physical GPUs) of the device this script pins
# itself to.
PREFERRED_GPU_INDEX = 3

gpus = tf.config.list_physical_devices('GPU')
if gpus:
    if PREFERRED_GPU_INDEX < len(gpus):
        try:
            # Restrict TensorFlow to the single preferred GPU.
            tf.config.set_visible_devices(gpus[PREFERRED_GPU_INDEX], 'GPU')
            logical_gpus = tf.config.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
        except RuntimeError as e:
            # Best-effort: report the failure and carry on with whatever
            # devices TensorFlow already has.
            print(e)
    else:
        # Hosts with fewer GPUs than expected used to crash here with an
        # uncaught IndexError; keep TensorFlow's default visibility instead.
        print("GPU index", PREFERRED_GPU_INDEX, "not available; only",
              len(gpus), "physical GPU(s) found, leaving device visibility unchanged")
|
|
|
|
|
|
|
|
# Character vocabulary: index 0 is the reserved 'E' token (it shares id 0 with
# the padding that name2id pre-fills), followed by 'a'..'z', the space
# character, and finally the reserved 'U' token used for unknown characters.
chars = ['E'] + [chr(i) for i in range(97, 123)] + [' ', 'U']
id2char = dict(enumerate(chars))
char2id = {j: i for i, j in enumerate(chars)}

# Literal characters that coincide with the reserved vocabulary tokens.
_RESERVED_TOKENS = frozenset({'E', 'U'})
_PUNCTUATION = frozenset(string.punctuation)


def name2id(name, l=10):  # noqa: E741 - parameter name kept for callers using l=...
    """Encode *name* as a list of exactly *l* character ids.

    Characters beyond position ``l`` are dropped; positions past the end of
    the name keep the padding id 0.  Punctuation is encoded as the space id,
    characters present in ``char2id`` get their own id, and everything else
    (digits, uppercase or non-ASCII letters, ...) gets the unknown id.

    Args:
        name: The string to encode.  The vocabulary only holds lowercase
            ASCII letters, so callers are expected to lowercase first.
        l: Length of the returned list.

    Returns:
        A list of ``l`` integer ids (empty when ``l`` is not positive).
    """
    ids = [0] * l
    unknown_id = char2id['U']
    space_id = char2id[' ']
    # Slice first so long names are not scanned past the encoded width;
    # max() keeps a non-positive length from turning into a negative slice.
    for i, c in enumerate(name[:max(l, 0)]):
        if c in _PUNCTUATION:
            ids[i] = space_id
        elif c in _RESERVED_TOKENS:
            # A literal 'E' must not be mistaken for the padding token (id 0);
            # like every other uppercase letter it is simply unknown.
            ids[i] = unknown_id
        else:
            ids[i] = char2id.get(c, unknown_id)
    return ids
|
|
|
|
|
|
|
|
# Directory holding the experiment CSV files (also made the working directory).
DATA_DIR = '/home/lstm/expermiment_data'

os.chdir(DATA_DIR)
data_directory = os.listdir(DATA_DIR)

# List every CSV that is available; this is informational output only.
dataFiles = [file for file in data_directory if file.endswith('.csv')]
print("experiment files available: ", dataFiles)

# The run is restricted to an explicit selection, which replaces the
# discovered list printed above.
dataFiles = ["minorMulti_50.csv"]
# Pre-bind dFile to the first selected file; a `for dFile in dataFiles` loop
# rebinds it on every iteration.
dFile = dataFiles[0]
|
|
# Encoded widths of the first-name and last-name fields; their sum is the
# length of every model input sequence.
FIRST_NAME_LEN = 12
LAST_NAME_LEN = 16


def _encode_names(frame):
    """Return one id list per row: encoded first name followed by last name."""
    return [
        name2id(fn.lower(), l=FIRST_NAME_LEN) + name2id(ln.lower(), l=LAST_NAME_LEN)
        for fn, ln in zip(frame['fname'], frame['lname'])
    ]


def _labels(frame):
    """Return the rows' 'ethnicity' values as a list of ints."""
    return [int(i) for i in frame['ethnicity'].tolist()]


def _append_report(path, title, y_true, y_hat):
    """Append a titled classification report and confusion matrix to *path*."""
    # One context-managed handle instead of a new, never-closed handle per print.
    with open(path, "a") as report:
        print(title, file=report)
        print(classification_report(y_true, y_hat), file=report)
        print(confusion_matrix(y_true, y_hat), file=report)


for dFile in dataFiles:
    # Output locations are derived from the data file name up to its first dot.
    stem = dFile.split(".")[0]
    fileOut = "".join(["/home/lstm/test_results/", stem, "_testLr3_5.csv"])
    modelOut = "".join(["/home/lstm/models/", stem, "_Lr3_5.h5"])
    vFile = "".join(["/home/lstm/validate_results/", stem, "_validateLr3_5.csv"])

    # From here on dFile holds the full path of the input CSV.
    dFile = "".join(["/home/lstm/expermiment_data/", dFile])

    df = pd.read_csv(dFile)
    df = df.dropna()
    df = df.astype({"lname": str, "fname": str})

    # Split on the 'folds' column: 1 -> test, 2 -> validate, >2 -> train.
    train_rows = df[df['folds'] > 2]
    test_rows = df[df['folds'] == 1]
    validate_rows = df[df['folds'] == 2]

    X_train = _encode_names(train_rows)
    y_train = _labels(train_rows)

    X_test = _encode_names(test_rows)
    y_test = _labels(test_rows)

    X_validate = _encode_names(validate_rows)
    y_validate = _labels(validate_rows)

    num_words = len(id2char)
    feature_len = FIRST_NAME_LEN + LAST_NAME_LEN
    batch_size = 512

    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')
    print(len(X_validate), 'validate sequences')

    print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
    X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
    X_validate = sequence.pad_sequences(X_validate, maxlen=feature_len)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    # Previously this line reported X_test.shape under the validate label.
    print('X_validate shape:', X_validate.shape)

    num_classes = 2
    print(num_classes, 'classes')

    print('Convert class vector to binary class matrix '
          '(for use with categorical_crossentropy)')
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)
    y_validate = to_categorical(y_validate, num_classes)

    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)
    print('y_validate shape:', y_validate.shape)

    # Character embedding -> four stacked bidirectional LSTMs -> softmax.
    model = Sequential()
    model.add(Embedding(num_words, 256, input_length=feature_len))
    model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
    model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
    model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
    model.add(Bidirectional(LSTM(512, dropout=0.2)))
    model.add(Dense(num_classes, activation='softmax'))

    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=10 ** -3.5),
                  loss='categorical_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])

    # Early stopping watches the training loss, not the validation metrics.
    callback = tf.keras.callbacks.EarlyStopping(
        mode='min', monitor='loss', patience=1, min_delta=.001)

    # The test fold is passed as validation_data, so its metrics are reported
    # after every epoch.
    model.fit(X_train, y_train, batch_size=batch_size, epochs=10,
              validation_data=(X_test, y_test), verbose=1, callbacks=[callback])

    # Evaluate on the test fold and append the report to the results file.
    y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)
    y_pred_bool = np.argmax(y_pred, axis=1)
    _append_report(fileOut, "test results", np.argmax(y_test, axis=1), y_pred_bool)

    # Evaluate on the validate fold; the report goes to the same results file.
    y_pred_validate = model.predict(X_validate, batch_size=batch_size, verbose=1)
    y_pred_bool_validate = np.argmax(y_pred_validate, axis=1)
    y_true_validate = np.argmax(y_validate, axis=1)
    _append_report(fileOut, "validate results", y_true_validate, y_pred_bool_validate)

    model.save(modelOut, include_optimizer=False)

    # Per-row validate predictions; PredictScore is the softmax output for
    # class index 1.
    data2 = {'Predict': y_pred_bool_validate,
             'Actual': y_true_validate,
             'PredictScore': y_pred_validate[:, 1]}
    df_data_val2 = pd.DataFrame(data2)
    df_data_val2.to_csv(vFile, index=False)
|
|
|