Final_project / app.py
shyamsankeerth's picture
Update app.py
200f943
# -*- coding: utf-8 -*-
"""Final Project Warfarin.ipynb
Automatically generated by Colaboratory.
Original file is located at
https://colab.research.google.com/drive/14SbhVS2m0dw-57GiRSq4rwt023z32HlI
Question 1: How is the data preprocessed? How is the data loaded into Python to start the ML pipeline?
---
"""
# importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle
import warnings
warnings.filterwarnings('ignore')
from datasets import load_dataset
# Load the Warfarin dataset from the Hugging Face Hub as a pandas DataFrame.
# load_dataset() without `split` returns a DatasetDict, which offers none of
# the pandas methods used below, so one split is requested and converted.
# NOTE(review): assumes the rows live in the default "train" split - confirm.
df = load_dataset("shyamsankeerth/dataset1", split="train").to_pandas()
df.head()  # first five rows (only displayed when run in a notebook)
df.dtypes  # data type of every column
df.shape  # (rows, columns)
df.columns  # column names

# Remove the two columns that are not used in the rest of the pipeline.
df.drop(['Cyp2C9 genotypes', 'Target INR'], axis=1, inplace=True)
df.shape  # dimensions after dropping the columns

# Give the VKORC1 genotype column a short name. pop() removes the long-named
# column and the assignment appends it again under the new name, so it ends
# up as the last column exactly as with the former copy-then-drop.
df['VKORC1 genotype:'] = df.pop('VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T')

# Number of missing values per column.
null_counts = df.isna().sum()
print(null_counts)
# Bar chart of the number of missing values in each column before imputation.
fig, ax = plt.subplots(figsize=(6, 6))
ax.bar(null_counts.index, null_counts.values, color='green')
ax.set_xlabel('Columns')
ax.set_ylabel('Null Values Count')
ax.set_title('Null Values in DataFrame')
plt.xticks(rotation=90)  # column names are long, so draw them vertically
plt.show()
# Partition the columns by dtype so that each group can be imputed with its
# own strategy. Only float64 columns count as numeric and only object columns
# count as categorical; a column of any other dtype is just printed and ends
# up in neither list.
numeric_cols = []
categorical_cols = []
for col in df.columns:
    col_dtype = df[col].dtype
    if col_dtype == 'float64':
        numeric_cols.append(col)
        continue
    if col_dtype == 'object':
        categorical_cols.append(col)
        continue
    print(col)
categorical_cols  # names of the string columns
numeric_cols  # names of the float columns
s_df = df[categorical_cols]  # string columns only
s_df.head()
n_df = df[numeric_cols]  # float columns only
n_df.head()
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer  # switches on the experimental IterativeImputer
from sklearn.impute import IterativeImputer

# Float columns: fill the gaps with IterativeImputer's default settings.
imputer = IterativeImputer()
nums = imputer.fit_transform(n_df)
print(nums)

# String columns: fill the gaps with the most frequent value of each column.
imputer = SimpleImputer(strategy="most_frequent")
s = imputer.fit_transform(s_df)
s

# Both imputers return plain arrays, so re-attach the column names and put the
# two halves side by side (float columns first, then string columns).
f_1 = pd.DataFrame(s, columns=s_df.columns)
f_2 = pd.DataFrame(nums, columns=n_df.columns)
final_df = pd.concat((f_2, f_1), axis="columns")
print(final_df)
print(final_df.shape)

# Missing values per column after imputation.
null_counts1 = final_df.isna().sum()
print(null_counts1)
# Second bar plot: missing values per column once imputation has been applied.
import seaborn as sns
import matplotlib.pyplot as plt

missing_data = final_df.isna()
missing_per_column = missing_data.sum()
plt.figure(figsize=(6, 6))
sns.barplot(x=missing_data.columns, y=missing_per_column)
plt.title('Missing Values by Column')
plt.xlabel('Columns')
plt.ylabel('Count of Missing Values')
plt.xticks(rotation=90)
plt.show()
"""Question 3: How the categorial textual features are handled?"""
def feature_plot(cols):
    """Show one histogram per column of ``final_df`` on a two-column grid.

    Args:
        cols: Names of the ``final_df`` columns to plot.

    The figure is 12x8 inches when ``len(cols) + 1`` is at most 5 and 20x12
    inches otherwise. The finished figure is displayed with ``plt.show()``;
    nothing is returned.
    """
    panels_per_row = 2
    slot_count = len(cols) + 1
    grid_rows = slot_count // 2  # enough rows for two histograms per row
    fig = plt.figure(figsize=(12, 8) if slot_count <= 5 else (20, 12))
    for position, col in enumerate(cols, start=1):
        ax = fig.add_subplot(grid_rows, panels_per_row, position)
        ax.hist(final_df[col], alpha=0.5)
        ax.set_title(col)
    plt.tight_layout()
    plt.show()
feature_plot(categorical_cols) # histograms for all string (categorical) columns
feature_plot(numeric_cols) # histograms for all numeric (float) columns
from sklearn.preprocessing import LabelEncoder # transforming the text data to readable
def tranform_text(col):
    """Label-encode column ``col`` of ``final_df`` in place.

    A fresh ``LabelEncoder`` is fitted on the column, the column is replaced
    by the resulting integer codes, and the column name together with the
    learned classes is printed so that the code-to-label mapping can be read
    from the output.

    Args:
        col: Name of the ``final_df`` column to encode.
    """
    encoder = LabelEncoder()
    encoded_values = encoder.fit_transform(final_df[col])
    final_df[col] = encoded_values
    print(col, ':', encoder.classes_)
# Label-encode every string column of final_df, one column at a time.
for text_col in categorical_cols:
    tranform_text(text_col)
final_df.head()  # first five rows after encoding
"""Do any of the variables have multicollinearity issues?"""
# The correlation matrix is needed for both heatmaps (and for the mask), so
# compute it once instead of calling final_df.corr() three times.
corr_matrix = final_df.corr()

# Heatmap 1: every pairwise correlation, annotated with its coefficient.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, cmap="RdYlBu", annot=True)
plt.title("Correlation Matrix")
plt.show()

# weight, height, gender are identified as best features
# Heatmap 2: seaborn hides the cells where `mask` is True, so coefficients
# below 0.5 are hidden and only correlations of 0.5 or more remain visible.
# NOTE(review): strongly negative correlations are hidden as well - confirm
# that this is intended for the multicollinearity check.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, cmap="RdYlBu", annot=True, mask=corr_matrix < 0.5)
plt.title("Correlation Matrix")
plt.show()
"""FEATURE SCALING"""
from sklearn.preprocessing import StandardScaler #importing required libraries
from sklearn.model_selection import train_test_split
# Independent copy for the classification task: its target column is replaced
# by class labels further down while final_df keeps the continuous dose.
new_df = final_df.copy(deep=True)
new_df.shape  # dimensions of the new data frame
def clf_data(x, threshold=30):
    """Turn a therapeutic dose into a binary class label.

    Args:
        x: Dose value to classify.
        threshold: Cut-off separating the two classes. Defaults to 30, the
            value that used to be hard-coded here.

    Returns:
        1 (marked "HRD" in this file) when ``x`` is strictly greater than
        ``threshold``, otherwise 0 (marked "LRD").
    """
    return 1 if x > threshold else 0
# Replace the continuous dose by its class label (see clf_data).
dose_labels = new_df['Therapeutic Dose of Warfarin'].apply(clf_data)
new_df['Therapeutic Dose of Warfarin'] = dose_labels
new_df['Therapeutic Dose of Warfarin']

# Features are every column except the target; the target is the class label.
Y = new_df['Therapeutic Dose of Warfarin']
X = new_df.drop(columns='Therapeutic Dose of Warfarin')

# Hold out 20 % of the rows for testing; the fixed seed keeps the split stable.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=90)
for split_label, split_part in (
    ("X-train: ", x_train),
    ("X-test: ", x_test),
    ("Y-train ", y_train),
    ("Y-test", y_test),
):
    print(split_label, split_part.shape)
"""Question 4: How to identify anomaly data points/outliers (i.e., Cook's Distance, one-class SVM)"""
#CLASSIFICATION MODELS
from sklearn.svm import SVC #importing all the required packages from libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix #importing all the required metrics
import keras
from keras.models import Sequential
from keras.layers import Dense,Flatten # NN model
def modelBuilding(clf):
    """Train ``clf`` on the global training split and report test metrics.

    The classifier is fitted on ``x_train``/``y_train`` and used to predict
    ``x_test``. Accuracy, precision, recall and F1 against ``y_test`` are
    printed, followed by the classifier itself, and the confusion matrix is
    shown as a heatmap.

    Args:
        clf: Estimator offering ``fit`` and ``predict``; it is fitted in place.
    """
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    report = (
        ('Accuracy-Test:', accuracy_score),
        ('Precision:', precision_score),
        ('Recall:', recall_score),
        ('F1-Score:', f1_score),
    )
    for label, metric in report:
        print(label, metric(y_test, y_pred))
    print()
    print()
    print(clf)

    # Confusion matrix as an annotated heatmap.
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='.2f', cbar=False, cmap='Greens')
    plt.show()
#displaying the heatmap
# Baseline run: every classifier with its default hyper-parameters.
# The estimators are instantiated directly; building them by eval()-ing
# constructor strings added nothing and is best avoided.
classifiers = [
    SVC(),
    LogisticRegression(),
    KNeighborsClassifier(),
    RandomForestClassifier(),
]
for baseline_clf in classifiers:
    modelBuilding(baseline_clf)
#PARAMETERS TUNING
from sklearn.model_selection import RandomizedSearchCV #gridcv #performed paramtere tuning with randomsearch cv
def model_Tuning(model, params):
    """Search hyper-parameters for ``model`` with a randomised 5-fold CV.

    Args:
        model: Unfitted estimator to tune.
        params: Mapping of parameter name to the candidate values to sample.

    Returns:
        The best parameter combination found (``best_params_`` of the search),
        selected by cross-validated accuracy on ``x_train``/``y_train``.
    """
    search = RandomizedSearchCV(
        estimator=model,
        param_distributions=params,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    search.fit(x_train, y_train)

    best = search.best_params_
    print('Best Params:', best)
    print('Best Score:', search.best_score_)
    return best
# K-nearest neighbours: search space, tuning, then evaluation of the winner.
params = {
    'n_neighbors': [5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
}
mt = model_Tuning(KNeighborsClassifier(), params)
# mt holds exactly the three keys of the search space, so it can be unpacked
# straight into the constructor.
knn = KNeighborsClassifier(**mt)
modelBuilding(knn)
# Support vector machine: search space, tuning, then evaluation of the winner.
params = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.001, 0.0001],
    'kernel': ['linear', 'rbf'],
}
mt = model_Tuning(SVC(), params)
# mt holds exactly C, gamma and kernel, so it can be unpacked directly.
svm = SVC(**mt)
modelBuilding(svm)
# Random forest: search space, tuning, then evaluation of the winner.
# 'auto' is no longer offered for max_features: recent scikit-learn releases
# reject it, and for classifiers it used to be an alias of 'sqrt', so it only
# duplicated a candidate. 'log2' takes its place to keep a real alternative.
params = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 25],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 7],
    'n_estimators': [200, 400, 600, 800, 1000],
}
mt = model_Tuning(RandomForestClassifier(), params)
# Rebuild the forest with the best combination found by the search.
rf = RandomForestClassifier(
    bootstrap=mt['bootstrap'],
    max_depth=mt['max_depth'],
    max_features=mt['max_features'],
    min_samples_leaf=mt['min_samples_leaf'],
    min_samples_split=mt['min_samples_split'],
    n_estimators=mt['n_estimators'],
)
modelBuilding(rf)
# Logistic regression: search space, tuning, then evaluation of the winner.
# The default lbfgs solver does not support the 'l1' penalty, so every 'l1'
# candidate failed to fit during the search. liblinear supports both 'l1'
# and 'l2', which makes the whole search space valid.
params = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
}
mt = model_Tuning(LogisticRegression(solver='liblinear'), params)
# The final model must use the same solver, otherwise a winning 'l1' penalty
# could not be fitted.
lr = LogisticRegression(penalty=mt['penalty'], C=mt['C'], solver='liblinear')
modelBuilding(lr)
"""Neural Network - Model"""
# Convert the labels to float32 column vectors of shape (n_samples, 1).
y_train = np.asarray(y_train).astype('float32').reshape((-1, 1))
y_test = np.asarray(y_test).astype('float32').reshape((-1, 1))

# Binary classifier: two hidden ReLU layers followed by one sigmoid unit.
# The input width is taken from the training data rather than hard-coded to
# 10, so the network still builds if the number of feature columns changes.
n_features = x_train.shape[1]
model = Sequential()
model.add(Dense(256, input_dim=n_features, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Flatten())  # no effect on the already 2-D Dense output; kept as in the original network
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE: the test split doubles as validation data, so the val_* histories are
# measured on the test set.
history = model.fit(x_train, y_train, epochs=100, batch_size=28, validation_data=(x_test, y_test))
# One figure per recorded quantity: the training curve and the validation
# curve (computed on the test split) over the epochs.
for history_key, plot_title, y_label in (
    ('accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'Model loss', 'Loss'),
):
    plt.plot(history.history[history_key])
    plt.plot(history.history['val_' + history_key])
    plt.title(plot_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper right')
    plt.show()
""" Regression Models"""
# Regression uses final_df, whose target column still holds the continuous dose.
Y = final_df['Therapeutic Dose of Warfarin']
X = final_df.drop(columns='Therapeutic Dose of Warfarin')

# Standardise the features. NOTE: the scaler is fitted on all rows, before
# the train/test split below.
std = StandardScaler().fit(X)
X = std.transform(X)

# Same 80/20 split and seed as for the classification models.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=90)
from sklearn.linear_model import LinearRegression # importing all the required packages
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error
def regressor(model):
    """Fit a regression ``model`` and print its test-set error metrics.

    The model is trained on the global ``x_train``/``y_train`` and evaluated
    on ``x_test``/``y_test``; R2, mean squared error and mean absolute error
    are printed.

    Args:
        model: Regressor offering ``fit`` and ``predict``; fitted in place.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    for label, metric in (
        ('R2-Score:', r2_score),
        ('MSE:', mean_squared_error),
        ('MAE:', mean_absolute_error),
    ):
        print(label, metric(y_test, y_pred))
# The three regressors, all with default settings.
lir = LinearRegression()  # linear regression
rfr = RandomForestRegressor()  # random forest
tree = DecisionTreeRegressor()  # single decision tree

# Fit and score them in the order linear -> forest -> tree.
for reg_model in (lir, rfr, tree):
    regressor(reg_model)
# Importance of each feature according to rf, the tuned random forest
# classifier that was fitted on all feature columns above.
rf.feature_importances_
new_df.columns  # feature names of the dataset
new11 = new_df.drop(columns=['Therapeutic Dose of Warfarin'])

# Horizontal bar chart, one bar per feature. The chart gets its own figure and
# is shown right away; without plt.show() it stayed open as the current
# figure and the next heatmap was drawn on top of it.
plt.figure()
plt.barh(new11.columns, rf.feature_importances_, color='g')
plt.show()
# Keep only the five selected features plus the class label. The feature
# order defined here is the column order the final models are trained on.
selected_features = [
    'Weight (kg)',
    'Height (cm)',
    'INR on Reported Therapeutic Dose of Warfarin',
    'Age',
    'VKORC1 genotype:',
]
feature_df = new_df[selected_features + ['Therapeutic Dose of Warfarin']]
feature_df.head()
feature_df.shape

# Separate features and target.
Y = feature_df['Therapeutic Dose of Warfarin']
X = feature_df.drop(columns='Therapeutic Dose of Warfarin')

# Standardise the five features; this scaler is the one left in `std`.
std = StandardScaler().fit(X)
X = std.transform(X)

# 80/20 train/test split with the same seed as before.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=90)
"""FINAL MODELS FOR PREDICTION"""
def model_buliding(model, filename):
    """Fit a final classifier, pickle it and report its test metrics.

    The model is trained on the global ``x_train``/``y_train``, written to
    ``filename`` with pickle, and evaluated on ``x_test``/``y_test``:
    accuracy, precision, recall and F1 are printed and the confusion matrix
    is shown as a heatmap.

    Args:
        model: Classifier offering ``fit`` and ``predict``; fitted in place.
        filename: Path of the pickle file to write.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # The with-block closes the file even if pickling fails; the previous
    # bare open() never closed the handle.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    print('Accuracy-Test:', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    print('F1-Score:', f1_score(y_test, y_pred))

    # Confusion matrix as an annotated heatmap.
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='.2f', cbar=False, cmap='Greens')
    plt.show()
# Retrain every tuned classifier on the five selected features and store it
# under the file name listed next to it.
for final_model, pickle_name in (
    (lr, 'lr.pkl'),  # logistic regression
    (rf, 'rf.pkl'),  # random forest
    (svm, 'svm.pkl'),  # SVM
    (knn, 'knn.pkl'),  # k-nearest neighbours
):
    model_buliding(final_model, pickle_name)
# Convert the labels to float32 column vectors of shape (n_samples, 1).
y_train = np.asarray(y_train).astype('float32').reshape((-1, 1))
y_test = np.asarray(y_test).astype('float32').reshape((-1, 1))
y_test.shape  # dimensions of the test labels

# Binary classifier on the selected features: three hidden ReLU layers and a
# single sigmoid output unit. The input width is read from the training data
# instead of being hard-coded to 5, so it always matches the feature subset.
n_features = x_train.shape[1]
model = Sequential()
model.add(Dense(256, input_dim=n_features, activation='relu'))
model.add(Dense(148, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Flatten())  # no effect on the already 2-D Dense output; kept as in the original network
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# NOTE: the test split doubles as validation data, so the val_* histories are
# measured on the test set.
history = model.fit(x_train, y_train, epochs=100, batch_size=32, validation_data=(x_test, y_test))
# One figure per recorded quantity: the training curve and the validation
# curve (computed on the test split) over the epochs.
for history_key, plot_title, y_label in (
    ('accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'Model LOSS', 'LOSS'),
):
    plt.plot(history.history[history_key])
    plt.plot(history.history['val_' + history_key])
    plt.title(plot_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper right')
    plt.show()
# The network ends in a single sigmoid unit, so predict() yields one
# probability per row in an array of shape (n_samples, 1). argmax over axis 1
# of such an array is always 0, which labelled every sample as class 0.
# Threshold the probability at 0.5 instead.
probabilities = model.predict(x_test)
y_pred = (probabilities > 0.5).astype(int).ravel()
y_true = np.ravel(y_test)  # flatten the (n_samples, 1) label column for the metrics

print('Accuracy-Test:', accuracy_score(y_true, y_pred))
print('Precision:', precision_score(y_true, y_pred))
print('Recall:', recall_score(y_true, y_pred))
print('F1-Score:', f1_score(y_true, y_pred))

# Confusion matrix as an annotated heatmap.
cm = confusion_matrix(y_true, y_pred)
sns.heatmap(cm, annot=True, fmt='.2f', cbar=False, cmap='Greens')
plt.show()
"""Gradio Implementation"""
import gradio as gd
# Pickle file written by the training step for each model offered in the UI.
_MODEL_FILES = {
    'Logistic': 'lr.pkl',
    'KNN': 'knn.pkl',
    'SVM': 'svm.pkl',
    'RForest': 'rf.pkl',
}


def model_prediction(weight, height, INR_on_Reported, age, VKROC1, model):
    """Predict the dose class for one patient with the chosen saved model.

    Args:
        weight, height, INR_on_Reported, age, VKROC1: Feature values in the
            order the final models were trained on. They may arrive as text
            from the UI and are converted to float.
        model: One of 'Logistic', 'KNN', 'SVM' or 'RForest'.

    Returns:
        'HRD(1)' when the model predicts class 1, otherwise 'LRD(0)'.

    Raises:
        ValueError: If ``model`` is not a known choice (for example when
            nothing is selected) or a feature value is not numeric.
    """
    # Reject an unknown choice up front; previously this fell through every
    # branch and failed on an unassigned `prediction`.
    if model not in _MODEL_FILES:
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(_MODEL_FILES)}"
        )

    features = [[
        float(weight),
        float(height),
        float(INR_on_Reported),
        float(age),
        float(VKROC1),
    ]]
    data = std.transform(features)  # same scaling as the training data

    # Load from the relative file names the training step writes to; the old
    # absolute '/content/...' paths only exist when the working directory is
    # /content. Only these locally produced files are unpickled here - never
    # point this at untrusted pickle data.
    with open(_MODEL_FILES[model], 'rb') as model_file:
        estimator = pickle.load(model_file)

    prediction = estimator.predict(data)[0]
    return 'HRD(1)' if prediction == 1 else 'LRD(0)'
# Web form: five free-text feature fields and a dropdown to pick the model;
# the predicted class is returned as text.
demo = gd.Interface(
    fn=model_prediction,
    inputs=[
        "text",
        "text",
        "text",
        "text",
        "text",
        gd.Dropdown(['Logistic', 'KNN', 'SVM', 'RForest']),
    ],
    outputs=["text"],
)
demo.launch()
feature_df.head(3)