"""Train and evaluate a bidirectional-LSTM name classifier per experiment file.

For every experiment CSV the script encodes first/last names as fixed-width
character-id sequences, trains a stacked bidirectional LSTM, appends
classification reports for the test and validate folds to a results file,
saves the model, and writes per-row validate predictions to a CSV.
"""
import os
import string

import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.python import keras
from keras.layers import LSTM, Bidirectional
from keras.layers import Dense, Embedding
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.utils import to_categorical
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

# Index into the list of physical GPUs that this run is pinned to.
# NOTE: index 3 is the 4th device; gpus[GPU_INDEX] raises IndexError on a
# host that exposes fewer GPUs than that.
GPU_INDEX = 3

# Fixed encoding widths (in characters) for first and last names.
FNAME_LEN = 12
LNAME_LEN = 16

# Pin the process to a single GPU so one experiment file runs on one device.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
    try:
        tf.config.set_visible_devices(gpus[GPU_INDEX], 'GPU')
        logical_gpus = tf.config.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPU")
    except RuntimeError as e:
        # Visible devices must be set before GPUs have been initialized
        print(e)

# Character vocabulary: 'E' (id 0), 'a'-'z', space, and 'U' as the fallback
# symbol for characters outside the vocabulary.
chars = ['E'] + [chr(i) for i in range(97, 123)] + [' ', 'U']
id2char = {i: j for i, j in enumerate(chars)}
char2id = {j: i for i, j in enumerate(chars)}


def name2id(name, l=10):
    """Encode ``name`` as a list of exactly ``l`` character ids.

    Characters beyond position ``l`` are dropped; unused trailing positions
    keep the value 0 (the same id as 'E' in ``char2id``). Characters found in
    ``char2id`` map to their id. Punctuation maps to the id of ' ', and every
    other character missing from the vocabulary maps to the id of 'U'.

    Args:
        name: The string to encode. Callers in this file lowercase it first.
        l: Length of the returned id list.

    Returns:
        A list of ``l`` integer ids.
    """
    ids = [0] * l
    for i, c in enumerate(name):
        if i >= l:
            break
        fallback = ' ' if c in string.punctuation else 'U'
        ids[i] = char2id.get(c, char2id[fallback])
    return ids


def _encode_fold(df, mask):
    """Return ``(X, y)`` lists for the rows of ``df`` selected by ``mask``.

    X holds the first-name ids followed by the last-name ids for each row;
    y holds the integer value of the 'ethnicity' column.
    """
    rows = df[mask]
    X = [name2id(fn.lower(), l=FNAME_LEN) + name2id(ln.lower(), l=LNAME_LEN)
         for fn, ln in zip(rows['fname'], rows['lname'])]
    y = [int(i) for i in rows['ethnicity'].tolist()]
    return X, y


def _build_model(num_words, feature_len, num_classes):
    """Build and compile the 4-layer bidirectional LSTM classifier."""
    model = Sequential()
    model.add(Embedding(num_words, 256, input_length=feature_len))
    # Three sequence-returning bidirectional layers feed a final one that
    # returns only its last output.
    for _ in range(3):
        model.add(Bidirectional(LSTM(512, return_sequences=True, dropout=0.2)))
    model.add(Bidirectional(LSTM(512, dropout=0.2)))
    model.add(Dense(num_classes, activation='softmax'))

    # Learning rate is 10^-3.5; the original note recorded the earlier rate
    # as "1^-3" (presumably 1e-3 -- confirm).
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=10 ** -3.5),
                  loss='categorical_crossentropy',
                  metrics=['accuracy',
                           tf.keras.metrics.Precision(),
                           tf.keras.metrics.Recall()])
    return model


def _append_report(path, title, y_true, y_pred):
    """Append a titled classification report and confusion matrix to ``path``."""
    # A single context-managed handle replaces one unclosed open() per print.
    with open(path, "a") as report:
        print(title, file=report)
        print(classification_report(y_true, y_pred), file=report)
        print(confusion_matrix(y_true, y_pred), file=report)


os.chdir('/home/lstm/expermiment_data')

# directory containing experiment dataset files
data_directory = os.listdir('/home/lstm/expermiment_data')

# getting a list of data files in the directory
dataFiles = [file for file in data_directory if file.endswith('.csv')]
print("experiment files available: ", dataFiles)

# ----- Any Pre-Processing goes here ----

# ---- Create a file that captures the stats as they come off the models ----

# ------ Now loop over data experiment files capturing stats and
# models as you go -----

# for now hardcoding each file (overrides the directory listing above)
dataFiles = ["minorMulti_50.csv"]

# Vocabulary size for the embedding and total encoded length per sample.
num_words = len(id2char)
feature_len = FNAME_LEN + LNAME_LEN
batch_size = 512
num_classes = 2  # np.max(y_train) + 1

for dFile in dataFiles:
    # Output names are derived from the part of the file name before the
    # first dot.
    stem = dFile.split(".")[0]
    # text report with test and validate metrics
    fileOut = "/home/lstm/test_results/" + stem + "_testLr3_5.csv"
    # saved model
    modelOut = "/home/lstm/models/" + stem + "_Lr3_5.h5"
    # per-row predictions on the validate fold
    vFile = "/home/lstm/validate_results/" + stem + "_validateLr3_5.csv"
    # input data file
    dataPath = "/home/lstm/expermiment_data/" + dFile

    # read in data and drop incomplete cases (JIC)
    df = pd.read_csv(dataPath)
    df = df.dropna()
    df = df.astype({"lname": str, "fname": str})

    # sampling
    # df = df.sample(frac=0.1)

    # Convert to numeric representation, split by the 'folds' column:
    # train (folds > 2), test (folds == 1) and validate (folds == 2)
    X_train, y_train = _encode_fold(df, df['folds'] > 2)
    X_test, y_test = _encode_fold(df, df['folds'] == 1)
    X_validate, y_validate = _encode_fold(df, df['folds'] == 2)

    print(len(X_train), 'train sequences')
    print(len(X_test), 'test sequences')
    print(len(X_validate), 'validate sequences')

    print('Pad sequences (samples x time)')
    X_train = sequence.pad_sequences(X_train, maxlen=feature_len)
    X_test = sequence.pad_sequences(X_test, maxlen=feature_len)
    X_validate = sequence.pad_sequences(X_validate, maxlen=feature_len)
    print('X_train shape:', X_train.shape)
    print('X_test shape:', X_test.shape)
    # Fixed: this line previously printed X_test.shape.
    print('X_validate shape:', X_validate.shape)

    print(num_classes, 'classes')

    print('Convert class vector to binary class matrix '
          '(for use with categorical_crossentropy)')
    y_train = to_categorical(y_train, num_classes)
    y_test = to_categorical(y_test, num_classes)
    y_validate = to_categorical(y_validate, num_classes)
    print('y_train shape:', y_train.shape)
    print('y_test shape:', y_test.shape)
    print('y_validate shape:', y_validate.shape)

    # simple train-test
    # first build
    model = _build_model(num_words, feature_len, num_classes)

    # NOTE: early stopping watches the training 'loss', not 'val_loss'.
    # earlier delta was 0.0015
    callback = tf.keras.callbacks.EarlyStopping(
        mode='min', monitor='loss', patience=1, min_delta=.001)

    # train model; the test fold (folds == 1) is passed as validation_data.
    # The epoch limit is 10 (an older note mentioned raising it to 15).
    model.fit(X_train, y_train, batch_size=batch_size, epochs=10,
              validation_data=(X_test, y_test), verbose=1,
              callbacks=[callback])
    # score, acc = model.evaluate(
    #     X_test, y_test, batch_size=batch_size, verbose=1)

    # get predictions on test data
    y_pred = model.predict(X_test, batch_size=batch_size, verbose=1)
    y_pred_bool = np.argmax(y_pred, axis=1)

    # record captured performance versus test set
    _append_report(fileOut, "test results",
                   np.argmax(y_test, axis=1), y_pred_bool)

    # get predictions on validate data
    y_pred_validate = model.predict(X_validate, batch_size=batch_size,
                                    verbose=1)
    y_pred_bool_validate = np.argmax(y_pred_validate, axis=1)

    # record captured performance versus validate set (same results file)
    _append_report(fileOut, "validate results",
                   np.argmax(y_validate, axis=1), y_pred_bool_validate)

    # save model and move to next
    model.save(modelOut, include_optimizer=False)

    # create a dataframe for validate: predicted class, actual class and the
    # model's score for class 1
    data2 = {'Predict': y_pred_bool_validate,
             'Actual': np.argmax(y_validate, axis=1),
             'PredictScore': y_pred_validate[:, 1]}
    df_data_val2 = pd.DataFrame(data2)
    df_data_val2.to_csv(vFile, index=False)