# -*- coding: utf-8 -*-
"""Final Project Warfarin.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/14SbhVS2m0dw-57GiRSq4rwt023z32HlI

Question 1: How is the data preprocessed? How is the data loaded into Python
to start the ML pipeline?

---
"""

# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import warnings

warnings.filterwarnings('ignore')

from datasets import load_dataset

# load_dataset() without a `split` argument returns a DatasetDict (a dict of
# splits), which has none of the pandas methods used below. Pick a split and
# convert it to a real DataFrame.
# NOTE(review): the first split is used; confirm the dataset has only one.
raw_data = load_dataset("shyamsankeerth/dataset1")
if isinstance(raw_data, dict):
    raw_data = raw_data[next(iter(raw_data))]
df = raw_data.to_pandas()
# Normalise missing markers: Arrow nulls in string columns may arrive as None,
# which SimpleImputer(missing_values=np.nan) would not treat as missing.
df = df.fillna(value=np.nan)

df.head()     # first five rows of the dataset (displayed in a notebook)
df.dtypes     # data types of the columns
df.shape      # dimensions of the dataframe
df.columns    # column names of the dataframe

df.drop(['Cyp2C9 genotypes', 'Target INR'], axis=1, inplace=True)  # drop unused columns
df.shape      # dimensions after dropping

# Give the VKORC1 column a short name. Assign-then-drop (rather than rename)
# moves the column to the end, and later feature order follows this layout.
vkorc1_raw_name = 'VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T'
df['VKORC1 genotype:'] = df[vkorc1_raw_name]
df.drop(vkorc1_raw_name, axis=1, inplace=True)

null_counts = df.isna().sum()  # number of missing values per column
print(null_counts)

plt.figure(figsize=(6, 6))
plt.bar(null_counts.index, null_counts.values, color='green')
plt.xlabel('Columns')
plt.ylabel('Null Values Count')
plt.title('Null Values in DataFrame')
plt.xticks(rotation=90)  # column names are long; rotate to keep them readable
plt.show()

# Separate the columns into numeric and categorical groups, because each group
# is imputed with a different strategy below.
numeric_cols = []
categorical_cols = []
for col in df.columns:
    if df[col].dtype == 'float64':      # only float64 columns count as numeric
        numeric_cols.append(col)
    elif df[col].dtype == 'object':     # object columns are treated as categorical
        categorical_cols.append(col)
    else:
        print(col)                      # any other dtype is reported and left out

categorical_cols  # the string columns
numeric_cols      # the float columns

s_df = df[categorical_cols]  # dataframe of string columns
s_df.head()

n_df = df[numeric_cols]      # dataframe of float columns
n_df.head()

from sklearn.impute import SimpleImputer  # for filling the null values
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Numeric columns: model-based iterative imputation.
imputer = IterativeImputer()
nums = imputer.fit_transform(n_df)
print(nums)

# Categorical columns: fill with the most frequent value of each column.
imputer = SimpleImputer(strategy="most_frequent")
s = imputer.fit_transform(s_df)
s

f_1 = pd.DataFrame(s, columns=s_df.columns)
f_2 = pd.DataFrame(nums, columns=n_df.columns)

# Combine the imputed numeric and string columns (numeric first) into one frame.
final_df = pd.concat([f_2, f_1], axis=1)
print(final_df)
print(final_df.shape)

null_counts1 = final_df.isna().sum()  # missing values after imputation
print(null_counts1)

# New bar plot to show that the null values have been handled
import seaborn as sns
import matplotlib.pyplot as plt

plt.figure(figsize=(6, 6))
missing_data = final_df.isna()
sns.barplot(x=missing_data.columns, y=missing_data.sum())
plt.xlabel('Columns')
plt.ylabel('Count of Missing Values')
plt.title('Missing Values by Column')
plt.xticks(rotation=90)
plt.show()

"""Question 3: How the categorial textual features are handled?"""


def feature_plot(cols):
    """Plot one histogram per column of the module-level ``final_df``.

    The histograms are laid out on a grid two subplots wide; a larger figure
    is used when there are five or more columns.

    Args:
        cols: Sequence of column names of ``final_df`` to plot.
    """
    plots_per_row = 2
    # (len + 1) // 2 rounds up, so an odd number of columns still fits.
    n_rows = (len(cols) + 1) // plots_per_row
    figsize = (12, 8) if len(cols) + 1 <= 5 else (20, 12)
    fig = plt.figure(figsize=figsize)
    for position, col in enumerate(cols, start=1):
        ax = fig.add_subplot(n_rows, plots_per_row, position)
        ax.hist(final_df[col], alpha=0.5)
        ax.set_title(col)
    plt.tight_layout()
    plt.show()
feature_plot(categorical_cols)  # histograms for all string columns
feature_plot(numeric_cols)      # histograms for all numeric columns

from sklearn.preprocessing import LabelEncoder  # turns text categories into integers


def tranform_text(col):
    """Label-encode column ``col`` of the module-level ``final_df`` in place.

    Prints the column name and the encoder's classes so the integer codes
    can be mapped back to the original labels.

    Args:
        col: Name of the categorical column to encode.
    """
    lb = LabelEncoder()
    final_df[col] = lb.fit_transform(final_df[col])
    print(col, ':', lb.classes_)


for col in categorical_cols:
    tranform_text(col)  # encode every categorical feature

final_df.head()  # top 5 rows after encoding

"""Do any of the variables have multicollinearity issues?"""

corr_matrix = final_df.corr()  # computed once and reused by both heatmaps

plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, cmap="RdYlBu", annot=True)
plt.title("Correlation Matrix")
plt.show()

# weight, height, gender are identified as best features
plt.figure(figsize=(10, 8))
# The mask hides every cell whose correlation is below 0.5, so only the
# strongly positively correlated pairs remain visible.
sns.heatmap(corr_matrix, cmap="RdYlBu", annot=True, mask=corr_matrix < 0.5)
plt.title("Correlation Matrix")
plt.show()

"""FEATURE SCALING"""

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

new_df = final_df.copy()  # classification copy; final_df keeps the raw dose
new_df.shape


def clf_data(x):
    """Binarise a dose: 1 (HRD) when ``x`` is above 30, otherwise 0 (LRD)."""
    return 1 if x > 30 else 0


new_df['Therapeutic Dose of Warfarin'] = new_df['Therapeutic Dose of Warfarin'].apply(clf_data)
new_df['Therapeutic Dose of Warfarin']

X = new_df.drop('Therapeutic Dose of Warfarin', axis=1)  # features
Y = new_df['Therapeutic Dose of Warfarin']               # binary target

x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=90)

print("X-train: ", x_train.shape)
print("X-test: ", x_test.shape)
print("Y-train ", y_train.shape)
print("Y-test", y_test.shape)

"""Question 4: How to identify anomaly data points/outliers (i.e., Cook's Distance, one-class SVM)"""

# CLASSIFICATION MODELS
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

import keras
from keras.models import Sequential
from keras.layers import Dense, Flatten  # NN model


def modelBuilding(clf):
    """Fit ``clf`` on the current train split and report test-set metrics.

    Uses the module-level ``x_train``/``y_train``/``x_test``/``y_test``.
    Prints accuracy, precision, recall and F1, then the estimator itself,
    and shows the confusion matrix as a heatmap.

    Args:
        clf: An unfitted estimator exposing ``fit`` and ``predict``.
    """
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)
    print('Accuracy-Test:', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    print('F1-Score:', f1_score(y_test, y_pred))
    print()
    print()
    print(clf)
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='.2f', cbar=False, cmap='Greens')
    plt.show()


# Baseline models with default hyper-parameters. They are instantiated
# directly; building them through eval() on strings added nothing but risk.
classifiers = [
    SVC(),
    LogisticRegression(),
    KNeighborsClassifier(),
    RandomForestClassifier(),
]

for i in classifiers:
    modelBuilding(i)

# PARAMETERS TUNING
from sklearn.model_selection import RandomizedSearchCV  # randomised alternative to grid search


def model_Tuning(model, params):
    """Run a 5-fold randomised hyper-parameter search for ``model``.

    The search is fitted on the module-level ``x_train``/``y_train`` and
    scored by accuracy. The best parameters and score are printed.

    Args:
        model: Unfitted estimator to tune.
        params: Mapping of parameter name to the candidate values to sample.

    Returns:
        dict: The best parameter combination found by the search.
    """
    search = RandomizedSearchCV(
        model,
        param_distributions=params,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    search.fit(x_train, y_train)
    print('Best Params:', search.best_params_)
    print('Best Score:', search.best_score_)
    return search.best_params_


# KNN
params = {
    'n_neighbors': [5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
}
mt = model_Tuning(KNeighborsClassifier(), params)
knn = KNeighborsClassifier(n_neighbors=mt['n_neighbors'], weights=mt['weights'], metric=mt['metric'])
modelBuilding(knn)

# SVM
params = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.001, 0.0001],
    'kernel': ['linear', 'rbf'],
}
mt = model_Tuning(SVC(), params)
svm = SVC(C=mt['C'], gamma=mt['gamma'], kernel=mt['kernel'])
modelBuilding(svm)

# Random forest
params = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 25],
    'min_samples_leaf': [1, 2, 3],
    # 'auto' meant 'sqrt' for classifiers and has been removed from recent
    # scikit-learn releases, where it would make those candidates fail.
    'max_features': ['sqrt'],
    'min_samples_split': [2, 5, 7],
    'n_estimators': [200, 400, 600, 800, 1000],
}
mt = model_Tuning(RandomForestClassifier(), params)
rf = RandomForestClassifier(
    bootstrap=mt['bootstrap'],
    max_depth=mt['max_depth'],
    max_features=mt['max_features'],
    min_samples_leaf=mt['min_samples_leaf'],
    min_samples_split=mt['min_samples_split'],
    n_estimators=mt['n_estimators'],
)
modelBuilding(rf)

# Logistic regression
params = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
}
# The default lbfgs solver does not support the 'l1' penalty, so those
# candidates would silently fail inside the search. liblinear supports both
# penalties, which makes the whole grid valid.
mt = model_Tuning(LogisticRegression(solver='liblinear'), params)
lr = LogisticRegression(penalty=mt['penalty'], C=mt['C'], solver='liblinear')
modelBuilding(lr)

"""Neural Network - Model"""

# Keras expects float targets shaped (n_samples, 1) for a single sigmoid output.
y_train = np.asarray(y_train).astype('float32').reshape((-1, 1))
y_test = np.asarray(y_test).astype('float32').reshape((-1, 1))

# Neural network (layers are added to this model right below)
model = Sequential()
# Input width is taken from the training data instead of a hard-coded 10, so
# the network keeps working if the number of feature columns changes.
model.add(Dense(256, input_dim=x_train.shape[1], activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))  # single probability output
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(x_train, y_train, epochs=100, batch_size=28, validation_data=(x_test, y_test))

# plotting test and train accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper right')
plt.show()

# plotting test and train loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model loss')
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper right')
plt.show()

""" Regression Models"""

X = final_df.drop('Therapeutic Dose of Warfarin', axis=1)  # features
Y = final_df['Therapeutic Dose of Warfarin']               # continuous dose target

# Split first, then fit the scaler on the training rows only, so that no
# statistics of the held-out rows leak into the features.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=90)
std = StandardScaler()
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error


def regressor(model):
    """Fit ``model`` on the current train split and print regression metrics.

    Uses the module-level ``x_train``/``y_train``/``x_test``/``y_test`` and
    prints R2, MSE and MAE computed on the test split.

    Args:
        model: An unfitted regressor exposing ``fit`` and ``predict``.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    print('R2-Score:', r2_score(y_test, y_pred))
    print('MSE:', mean_squared_error(y_test, y_pred))
    print('MAE:', mean_absolute_error(y_test, y_pred))


# Linear Regression
lir = LinearRegression()
regressor(lir)

# Random Forest
rfr = RandomForestRegressor()
regressor(rfr)

# Tree Regression
tree = DecisionTreeRegressor()
regressor(tree)

rf.feature_importances_  # importances from the tuned random-forest classifier
new_df.columns

new11 = new_df.drop(columns=['Therapeutic Dose of Warfarin'])
plt.barh(new11.columns, rf.feature_importances_, color='g')
# Show the bar chart now; otherwise, outside a notebook, the next heatmap
# would be drawn onto these same axes.
plt.show()

feature_df = new_df[['Weight (kg)', 'Height (cm)', 'INR on Reported Therapeutic Dose of Warfarin', 'Age', 'VKORC1 genotype:', 'Therapeutic Dose of Warfarin']]
feature_df.head()  # top 5 features plus the target
feature_df.shape

X = feature_df.drop('Therapeutic Dose of Warfarin', axis=1)  # the five selected features
Y = feature_df['Therapeutic Dose of Warfarin']               # binary target

# Same split-then-scale order as above. `std` fitted here is the scaler that
# later code uses to transform new inputs.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=90)
std = StandardScaler()
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)

"""FINAL MODELS FOR PREDICTION"""


def model_buliding(model, filename):
    """Fit a final classifier, pickle it to ``filename`` and report metrics.

    Uses the module-level ``x_train``/``y_train``/``x_test``/``y_test``.
    Prints accuracy, precision, recall and F1 on the test split and shows
    the confusion matrix as a heatmap.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        filename: Path the fitted model is pickled to.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    # A context manager guarantees the file is flushed and closed.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)
    print('Accuracy-Test:', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    print('F1-Score:', f1_score(y_test, y_pred))
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='.2f', cbar=False, cmap='Greens')
    plt.show()


model_buliding(lr, 'lr.pkl')  # Logistic Regression
model_buliding(rf, 'rf.pkl')    # Random Forest
model_buliding(svm, 'svm.pkl')  # SVM
model_buliding(knn, 'knn.pkl')  # K-NN

# Keras expects float targets shaped (n_samples, 1) for a single sigmoid output.
y_train = np.asarray(y_train).astype('float32').reshape((-1, 1))
y_test = np.asarray(y_test).astype('float32').reshape((-1, 1))
y_test.shape

# Neural network
model = Sequential()
# Input width follows the training data (the selected top features) instead
# of a hard-coded 5.
model.add(Dense(256, input_dim=x_train.shape[1], activation='relu'))
model.add(Dense(148, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.summary()

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(x_train, y_train, epochs=100, batch_size=32, validation_data=(x_test, y_test))

# plotting the model accuracy
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.title('Model accuracy')
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper right')
plt.show()

# plotting the model loss
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('Model LOSS')
plt.ylabel('LOSS')
plt.xlabel('Epoch')
plt.legend(['Train', 'Test'], loc='upper right')
plt.show()

# The network has ONE sigmoid output, so argmax over axis 1 would always be 0
# (every sample predicted as the negative class). Threshold the predicted
# probability at 0.5 to obtain the class labels instead.
y_prob = model.predict(x_test)
y_pred = (y_prob > 0.5).astype('int32').ravel()

print('Accuracy-Test:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1-Score:', f1_score(y_test, y_pred))
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='.2f', cbar=False, cmap='Greens')
plt.show()

"""Gradio Implementation"""

import gradio as gd

# Dropdown label -> pickled model file.
# NOTE(review): these absolute paths only match the relative paths the models
# were saved under when the working directory is /content (Colab) — confirm.
_MODEL_FILES = {
    'Logistic': "/content/lr.pkl",
    'KNN': "/content/knn.pkl",
    'SVM': "/content/svm.pkl",
    'RForest': "/content/rf.pkl",
}


def model_prediction(weight, height, INR_on_Reported, age, VKROC1, model):
    """Predict the warfarin dose class for one patient with a saved model.

    The five feature values are converted to floats, scaled with the
    module-level ``std`` scaler and passed to the selected pickled model.

    Args:
        weight: Weight value (number or numeric text).
        height: Height value (number or numeric text).
        INR_on_Reported: INR on the reported therapeutic dose.
        age: Encoded age value.
        VKROC1: Encoded VKORC1 genotype value.
        model: One of 'Logistic', 'KNN', 'SVM', 'RForest'.

    Returns:
        str: 'HRD(1)' when the model predicts class 1, otherwise 'LRD(0)'.

    Raises:
        ValueError: If ``model`` is not a known model name, or a feature
            value cannot be converted to a float.
    """
    if model not in _MODEL_FILES:
        # Without this guard an unselected dropdown left `prediction` unbound.
        raise ValueError(f"Unknown model {model!r}; expected one of {sorted(_MODEL_FILES)}")

    # The interface delivers text boxes, so the inputs arrive as strings and
    # must be made numeric before scaling.
    features = [[float(value) for value in (weight, height, INR_on_Reported, age, VKROC1)]]
    data = std.transform(features)

    # SECURITY NOTE: pickle.load executes arbitrary code from the file; only
    # load model files produced by this script, never untrusted ones.
    with open(_MODEL_FILES[model], 'rb') as file:
        estimator = pickle.load(file)

    prediction = estimator.predict(data)[0]
    return 'HRD(1)' if prediction == 1 else 'LRD(0)'


gd.Interface(
    fn=model_prediction,
    inputs=["text", "text", "text", "text", "text", gd.Dropdown(['Logistic', 'KNN', 'SVM', 'RForest'])],
    outputs=["text"],
).launch()

feature_df.head(3)