Spaces:

shyamsankeerth
/

Final_project

Runtime error

App Files Files Community

Final_project / app.py

shyamsankeerth

Update app.py

200f943 about 3 years ago

raw

history blame contribute delete

17 kB

	# -*- coding: utf-8 -*-
	"""Final Project Warfarin.ipynb

	Automatically generated by Colaboratory.

	Original file is located at
	https://colab.research.google.com/drive/14SbhVS2m0dw-57GiRSq4rwt023z32HlI

	Question 1: How is the data preprocessed? How is the data loaded into Python to start the ML pipeline?

	---
	"""

	# importing libraries

	import pandas as pd
	import numpy as np
	import seaborn as sns
	import matplotlib.pyplot as plt
	import pickle

	import warnings
	warnings.filterwarnings('ignore')
	from datasets import load_dataset

	# load_dataset() called without `split` returns a DatasetDict (split name ->
	# Dataset), which has none of the pandas methods used below (.head, .drop,
	# .isna, ...). Select one split and convert it to a pandas DataFrame.
	raw_dataset = load_dataset("shyamsankeerth/dataset1")
	# NOTE(review): assumes the rows live in the "train" split (the usual default
	# for single-file datasets); falls back to the first split otherwise - confirm.
	split_name = "train" if "train" in raw_dataset else next(iter(raw_dataset))
	df = raw_dataset[split_name].to_pandas()
	df.head() # first five rows of the dataset (displayed only in a notebook)

	df.dtypes # data types of the columns / features

	df.shape # dimensions of the dataframe

	df.columns # column names of the dataframe

	# Drop the two columns that are not used by the rest of the pipeline.
	df.drop(['Cyp2C9 genotypes','Target INR'],axis=1,inplace=True)
	df.shape # dimensions after dropping the columns

	# Give the VKORC1 genotype column a short name. pop() removes the long-named
	# column and the assignment appends it again under the new name, so the
	# column ends up last - the same order the original copy-then-drop produced.
	df['VKORC1 genotype:'] = df.pop('VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T')



	# Tally the missing (NaN) entries of every column and print the result.
	null_counts = df.isna().sum()
	print(null_counts)

	# Bar chart: one bar per column, bar height = number of missing values.
	plt.figure(figsize=(6, 6))
	plt.bar(null_counts.index, null_counts.values, color='green')
	plt.title('Null Values in DataFrame')
	plt.xlabel('Columns')
	plt.ylabel('Null Values Count')
	plt.xticks(rotation=90) # column names are long; rotate them so they stay readable
	plt.show()

	# Partition the columns by dtype so each group can be imputed separately:
	# float64 columns count as numeric, object columns count as categorical.
	numeric_cols = [name for name in df.columns if df[name].dtype in ['float64']]
	categorical_cols = [name for name in df.columns if df[name].dtype == 'object']

	# Print every column that fell into neither group (any other dtype, e.g.
	# int64); such columns are not part of s_df / n_df below.
	for col in df.columns:
	    if col not in numeric_cols and col not in categorical_cols:
	        print(col)

	categorical_cols # names of the string columns

	numeric_cols # names of the float columns

	s_df = df[categorical_cols] # frame holding only the string columns
	s_df.head() # its first five rows

	n_df = df[numeric_cols] # frame holding only the float columns
	n_df.head() # its first five rows

	from sklearn.impute import SimpleImputer # for filling the nul values



	from sklearn.experimental import enable_iterative_imputer
	from sklearn.impute import IterativeImputer

	# Numeric columns: fill the gaps with scikit-learn's IterativeImputer.
	imputer = IterativeImputer()
	nums = imputer.fit_transform(n_df)
	print(nums)

	# Categorical columns: fill the gaps with each column's most frequent value.
	imputer = SimpleImputer(strategy="most_frequent")
	s = imputer.fit_transform(s_df)
	s

	# The imputers return bare arrays; wrap them in labelled frames again and
	# join them side by side (numeric columns first, then categorical ones).
	f_2 = pd.DataFrame(nums, columns=n_df.columns)
	f_1 = pd.DataFrame(s, columns=s_df.columns)
	final_df = pd.concat([f_2, f_1], axis=1)
	print(final_df)
	print(final_df.shape)

	# Missing-value tally after the imputation techniques.
	null_counts1 = final_df.isna().sum()
	print(null_counts1)

	# New bar plot to show how the null values were handled.
	import seaborn as sns
	import matplotlib.pyplot as plt

	plt.figure(figsize=(6, 6))
	missing_data = final_df.isna()
	sns.barplot(x=missing_data.columns, y=missing_data.sum())
	plt.title('Missing Values by Column')
	plt.xlabel('Columns')
	plt.ylabel('Count of Missing Values')
	plt.xticks(rotation=90)
	plt.show()



	"""Question 3: How the categorial textual features are handled?"""

	def feature_plot(cols):
	    """Draw one histogram per column of the module-level ``final_df``.

	    The histograms are laid out on a grid that is two plots wide; the figure
	    is enlarged once more than four columns are requested.

	    Args:
	        cols: sequence of column names present in ``final_df``.
	    """
	    plots_per_row = 2
	    slots = len(cols) + 1
	    grid_rows = slots // 2 # enough rows for two plots per row, rounding up
	    fig_size = (12, 8) if slots <= 5 else (20, 12)
	    fig = plt.figure(figsize=fig_size)

	    for position, name in enumerate(cols, start=1):
	        axis = fig.add_subplot(grid_rows, plots_per_row, position)
	        axis.hist(final_df[name], alpha=0.5)
	        axis.set_title(name)

	    plt.tight_layout()
	    plt.show()

	feature_plot(categorical_cols) # histograms for all string (categorical) columns

	feature_plot(numeric_cols) # histograms for all float (numeric) columns

	from sklearn.preprocessing import LabelEncoder # transforming the text data to readable

	def tranform_text(col):
	    """Label-encode one column of the module-level ``final_df`` in place.

	    A fresh LabelEncoder is fitted on the column, the column is overwritten
	    with the integer codes, and the column name plus the encoder's classes
	    are printed.

	    Args:
	        col: name of the column in ``final_df`` to encode.
	    """
	    encoder = LabelEncoder()
	    codes = encoder.fit_transform(final_df[col])
	    final_df[col] = codes
	    print(col, ':', encoder.classes_)

	# Encode every categorical column in turn; each call prints the classes found.
	for col in categorical_cols:
	    tranform_text(col)

	final_df.head() # first five rows after encoding (displayed only in a notebook)

	"""Do any of the variables have multicollinearity issues?"""

	# Heatmap of the full correlation matrix of the imputed, encoded features.
	plt.figure(figsize=(10, 8))
	sns.heatmap(final_df.corr(), annot=True, cmap="RdYlBu")
	plt.title("Correlation Matrix")
	plt.show()

	# weight, height and gender were identified as the best features from the heatmap

	# Second heatmap of the same matrix. Cells where the mask is True are hidden,
	# so correlations below 0.5 are masked out and only values >= 0.5 stay visible.
	plt.figure(figsize=(10, 8))
	low_corr = final_df.corr() < 0.5
	sns.heatmap(final_df.corr(), annot=True, cmap="RdYlBu", mask=low_corr)
	plt.title("Correlation Matrix")
	plt.show()

	"""FEATURE SCALING"""

	from sklearn.preprocessing import StandardScaler #importing required libraries
	from sklearn.model_selection import train_test_split

	new_df = final_df.copy() # work on a copy: the target is binarised in new_df, while final_df keeps the continuous dose used by the regression section

	new_df.shape # dimensions of the new dataframe (displayed only in a notebook)

	def clf_data(x, threshold=30):
	    """Turn a dose value into a binary class label.

	    Args:
	        x: the dose value to classify.
	        threshold: cut-off separating the two classes. Defaults to 30, the
	            value that used to be hard-coded, so ``Series.apply(clf_data)``
	            behaves exactly as before.

	    Returns:
	        1 (labelled "HRD" in this script) when ``x`` is strictly greater than
	        ``threshold``, otherwise 0 (labelled "LRD"). Values that do not
	        compare greater, including NaN, map to 0.
	    """
	    return 1 if x > threshold else 0

	# Replace the continuous dose in new_df with its binary class label.
	new_df['Therapeutic Dose of Warfarin'] = new_df['Therapeutic Dose of Warfarin'].apply(clf_data)

	new_df['Therapeutic Dose of Warfarin'] # the binarised target (displayed only in a notebook)

	# Features = every column except the target; the target becomes Y.
	Y = new_df['Therapeutic Dose of Warfarin']
	X = new_df.drop(columns=['Therapeutic Dose of Warfarin'])

	# 80/20 train/test split with a fixed seed so the partition is repeatable.
	x_train, x_test, y_train, y_test = train_test_split(
	    X, Y, test_size=0.2, random_state=90
	)

	# Print the dimensions of each of the four pieces.
	for label, part in (
	    ("X-train: ", x_train),
	    ("X-test: ", x_test),
	    ("Y-train ", y_train),
	    ("Y-test", y_test),
	):
	    print(label, part.shape)

	"""Question 4: How to identify anomaly data points/outliers (i.e., Cook's Distance, one-class SVM)"""













	#CLASSIFICATION MODELS

	from sklearn.svm import SVC #importing all the required packages from libraries
	from sklearn.linear_model import LogisticRegression
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.neighbors import KNeighborsClassifier
	from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix #importing all the required metrics
	import keras
	from keras.models import Sequential
	from keras.layers import Dense,Flatten # NN model

	def modelBuilding(clf):
	    """Fit ``clf`` and report how it scores on the held-out split.

	    Uses the module-level ``x_train``/``y_train`` for fitting and
	    ``x_test``/``y_test`` for evaluation. Prints accuracy, precision, recall
	    and F1, then the estimator itself, and finally shows the confusion
	    matrix as a heatmap.

	    Args:
	        clf: an estimator exposing ``fit`` and ``predict``.
	    """
	    clf.fit(x_train, y_train)
	    y_pred = clf.predict(x_test)

	    # Same metrics, same order, same labels as before - just table-driven.
	    report = (
	        ('Accuracy-Test:', accuracy_score),
	        ('Precision:', precision_score),
	        ('Recall:', recall_score),
	        ('F1-Score:', f1_score),
	    )
	    for label, metric in report:
	        print(label, metric(y_test, y_pred))

	    print()
	    print()
	    print(clf)

	    # Confusion matrix rendered as an annotated heatmap.
	    cm = confusion_matrix(y_test, y_pred)
	    sns.heatmap(cm, annot=True, fmt='.2f', cbar=False, cmap='Greens')
	    plt.show()
	#displaying the heatmap

	# Names of the baseline classifiers (kept only in case other code reads list1).
	list1 = ['SVC()', 'LogisticRegression()', 'KNeighborsClassifier()', 'RandomForestClassifier()']

	# Build the default-parameter estimators directly. The strings above used to
	# be passed through eval(), which is unnecessary for fixed constructor calls.
	classifiers = [
	    SVC(),
	    LogisticRegression(),
	    KNeighborsClassifier(),
	    RandomForestClassifier(),
	]

	# Fit and evaluate each baseline model in turn.
	for i in classifiers:
	    modelBuilding(i)

	#PARAMETERS TUNING













	from sklearn.model_selection import RandomizedSearchCV #gridcv #performed paramtere tuning with randomsearch cv

	def model_Tuning(model, params):
	    """Run a randomised hyper-parameter search and return the best setting.

	    The search uses 5-fold cross-validation scored by accuracy on the
	    module-level ``x_train``/``y_train``, with all available cores
	    (``n_jobs=-1``). The best parameters and best score are printed.

	    Args:
	        model: the estimator to tune.
	        params: mapping of parameter name to the candidate values to sample.

	    Returns:
	        The ``best_params_`` dict of the fitted search.
	    """
	    search = RandomizedSearchCV(
	        model,
	        param_distributions=params,
	        cv=5,
	        scoring='accuracy',
	        n_jobs=-1,
	    )
	    search.fit(x_train, y_train)

	    best = search.best_params_
	    print('Best Params:', best)
	    print('Best Score:', search.best_score_)
	    return best

	# KNN: candidate hyper-parameters for the randomised search.
	params = {
	    'n_neighbors': [5, 7, 9, 11],
	    'weights': ['uniform', 'distance'],
	    'metric': ['minkowski', 'euclidean', 'manhattan'],
	}

	# Tune, then rebuild a KNN with the winning setting. The returned dict holds
	# exactly the three keys searched above, so it can be unpacked directly.
	mt = model_Tuning(KNeighborsClassifier(), params)
	knn = KNeighborsClassifier(**mt)

	modelBuilding(knn) # fit the tuned KNN and print its test metrics

	# SVM: candidate hyper-parameters for the randomised search.
	params = {
	    'C': [0.1, 1, 10, 100, 1000],
	    'gamma': [1, 0.1, 0.001, 0.0001],
	    'kernel': ['linear', 'rbf'],
	}

	# Tune, then rebuild an SVC with the winning C / gamma / kernel. The returned
	# dict holds exactly those three keys, so it can be unpacked directly.
	mt = model_Tuning(SVC(), params)
	svm = SVC(**mt)
	modelBuilding(svm) # fit the tuned SVM and print its test metrics

	# Random forest: candidate hyper-parameters for the randomised search.
	# 'auto' was dropped from max_features: recent scikit-learn releases reject
	# it, so every sampled candidate using it failed to fit. For a classifier
	# 'auto' meant the same as 'sqrt', so the effective search space is unchanged.
	params = {'bootstrap': [True, False],
	    'max_depth': [10, 20, 25],
	    'min_samples_leaf': [1, 2, 3],
	    'max_features': ['sqrt'],
	    'min_samples_split': [2, 5, 7],
	    'n_estimators': [200, 400, 600, 800, 1000]}

	# Tune the random forest; prints the best parameters and the best CV score.
	mt = model_Tuning(RandomForestClassifier(),params)

	# Rebuild a forest with the winning setting, then fit it and print test metrics.
	rf = RandomForestClassifier(
	    bootstrap=mt['bootstrap'],
	    max_depth=mt['max_depth'],
	    max_features=mt['max_features'],
	    min_samples_leaf=mt['min_samples_leaf'],
	    min_samples_split=mt['min_samples_split'],
	    n_estimators=mt['n_estimators'],
	)
	modelBuilding(rf)

	# Logistic regression: candidate hyper-parameters for the randomised search.
	params = {
	    'penalty': ['l1', 'l2'],
	    'C': [0.001, 0.01, 0.1, 1, 10, 100],
	}

	# The default solver (lbfgs) does not support the 'l1' penalty, so every
	# sampled 'l1' candidate failed to fit and was scored NaN - silently, because
	# warnings are suppressed in this script. 'liblinear' supports both 'l1' and
	# 'l2', which makes the whole search space valid. The same solver is used
	# for the final model so the winning penalty can actually be fitted.
	mt = model_Tuning(LogisticRegression(solver='liblinear'), params)
	lr = LogisticRegression(penalty=mt['penalty'], C=mt['C'], solver='liblinear')
	modelBuilding(lr) # fit the tuned model and print its test metrics

	"""Neural Network - Model"""

	# Keras expects the labels as a float column vector of shape (n, 1).
	y_train = np.asarray(y_train).astype('float32').reshape((-1,1))
	y_test = np.asarray(y_test).astype('float32').reshape((-1,1))

	# Neural network.
	# The input width is read from the training data instead of the hard-coded
	# 10, so the model still builds if the number of feature columns changes.
	n_features = x_train.shape[1]
	model = Sequential()
	model.add(Dense(256, input_dim=n_features, activation='relu')) # first hidden layer, 256 units
	model.add(Dense(128, activation='relu')) # second hidden layer, 128 units
	model.add(Flatten())
	model.add(Dense(1, activation='sigmoid')) # single sigmoid unit for the binary label
	model.summary()

	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # binary cross-entropy loss

	# Train for 100 epochs; the test split doubles as the validation set.
	history = model.fit(x_train, y_train, epochs=100, batch_size=28,validation_data=(x_test, y_test))

	# Accuracy per epoch: training curve vs. validation (test split) curve.
	plt.plot(history.history['accuracy'])
	plt.plot(history.history['val_accuracy'])
	plt.title('Model accuracy')
	plt.ylabel('Accuracy')
	plt.xlabel('Epoch')
	plt.legend(['Train', 'Test'], loc='upper right')
	plt.show()

	# Loss per epoch: training curve vs. validation (test split) curve.
	plt.plot(history.history['loss'])
	plt.plot(history.history['val_loss'])
	plt.title('Model loss')
	plt.ylabel('Loss')
	plt.xlabel('Epoch')
	plt.legend(['Train', 'Test'], loc='upper right')
	plt.show()

	""" Regression Models"""

	# Regression uses final_df, which still holds the continuous dose values.
	X = final_df.drop('Therapeutic Dose of Warfarin',axis=1) # features: everything except the target
	Y = final_df['Therapeutic Dose of Warfarin'] # target: the continuous dose

	# Split BEFORE scaling. The scaler used to be fitted on all rows, which let
	# the test rows influence the mean/variance used to scale the training data
	# (test-set leakage). The seed and row count are unchanged, so the same rows
	# land in each split as before.
	x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=90)

	std = StandardScaler()
	x_train = std.fit_transform(x_train) # learn the scaling statistics from the training rows only
	x_test = std.transform(x_test) # apply those same statistics to the test rows
	# NOTE(review): X itself is now left unscaled; nothing visible in this script
	# reads it again before it is reassigned - confirm no other code relies on it.

	from sklearn.linear_model import LinearRegression # importing all the required packages
	from sklearn.tree import DecisionTreeRegressor
	from sklearn.ensemble import RandomForestRegressor
	from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

	def regressor(model):
	    """Fit a regression model and print its scores on the held-out split.

	    Uses the module-level ``x_train``/``y_train`` for fitting and
	    ``x_test``/``y_test`` for evaluation. Prints R2, mean squared error and
	    mean absolute error, in that order.

	    Args:
	        model: an estimator exposing ``fit`` and ``predict``.
	    """
	    model.fit(x_train, y_train)
	    y_pred = model.predict(x_test)

	    # Same metrics, same order, same labels as before - just table-driven.
	    report = (
	        ('R2-Score:', r2_score),
	        ('MSE:', mean_squared_error),
	        ('MAE:', mean_absolute_error),
	    )
	    for label, metric in report:
	        print(label, metric(y_test, y_pred))

	# The three regressors to compare, all with default hyper-parameters.
	lir = LinearRegression() # linear regression
	rfr = RandomForestRegressor() # random forest
	tree = DecisionTreeRegressor() # single decision tree

	# Fit and score them in the original order: linear, random forest, tree.
	for reg_model in (lir, rfr, tree):
	    regressor(reg_model)

	rf.feature_importances_ # importance of each feature in the tuned random forest classifier

	new_df.columns # column names of the classification frame

	# Feature names = all columns of new_df except the target, in frame order.
	new11 = new_df.drop('Therapeutic Dose of Warfarin', axis=1)

	# Horizontal bar chart: one bar per feature, bar length = its importance.
	plt.barh(new11.columns, rf.feature_importances_, color='g')

	# Keep the five selected features plus the target.
	selected_columns = [
	    'Weight (kg)',
	    'Height (cm)',
	    'INR on Reported Therapeutic Dose of Warfarin',
	    'Age',
	    'VKORC1 genotype:',
	    'Therapeutic Dose of Warfarin',
	]
	feature_df = new_df[selected_columns]
	feature_df.head() # first five rows of the reduced frame

	feature_df.shape # dimensions of the reduced frame

	X = feature_df.drop('Therapeutic Dose of Warfarin',axis=1) # the five selected features
	Y = feature_df['Therapeutic Dose of Warfarin'] # the binary target

	# Split BEFORE scaling. The scaler used to be fitted on all rows, which let
	# the test rows influence the mean/variance used to scale the training data
	# (test-set leakage). The seed and row count are unchanged, so the same rows
	# land in each split as before.
	x_train,x_test,y_train,y_test = train_test_split(X,Y,test_size=0.2,random_state=90)

	# This scaler is also the one model_prediction() applies to user input, so
	# it must carry exactly the statistics the final models were trained with.
	std = StandardScaler()
	x_train = std.fit_transform(x_train) # learn the scaling statistics from the training rows only
	x_test = std.transform(x_test) # apply those same statistics to the test rows

	"""FINAL MODELS FOR PREDICTION"""

	def model_buliding(model,filename):
	    """Fit a final model, pickle it to disk and report its test metrics.

	    Uses the module-level ``x_train``/``y_train`` for fitting and
	    ``x_test``/``y_test`` for evaluation.

	    Args:
	        model: an estimator exposing ``fit`` and ``predict``.
	        filename: path of the pickle file to write the fitted model to.
	    """
	    model.fit(x_train, y_train)
	    y_pred = model.predict(x_test)

	    # Save the fitted model. The context manager closes (and flushes) the
	    # file; previously the handle returned by open() was never closed.
	    with open(filename, 'wb') as model_file:
	        pickle.dump(model, model_file)

	    print('Accuracy-Test:',accuracy_score(y_test,y_pred)) # accuracy
	    print('Precision:',precision_score(y_test,y_pred)) # precision
	    print('Recall:',recall_score(y_test,y_pred)) # recall
	    print('F1-Score:',f1_score(y_test,y_pred)) # f1-score

	    # Confusion matrix rendered as an annotated heatmap.
	    cm = confusion_matrix(y_test, y_pred)
	    sns.heatmap(cm, annot=True,fmt='.2f',cbar=False,cmap='Greens')
	    plt.show()

	# Refit each tuned classifier on the selected features and pickle it, in
	# the original order: logistic regression, random forest, SVM, KNN.
	final_models = (
	    (lr, 'lr.pkl'),
	    (rf, 'rf.pkl'),
	    (svm, 'svm.pkl'),
	    (knn, 'knn.pkl'),
	)
	for fitted_model, pkl_name in final_models:
	    model_buliding(fitted_model, pkl_name)

	# Keras expects the labels as a float column vector of shape (n, 1).
	y_train = np.asarray(y_train).astype('float32').reshape((-1, 1))
	y_test = np.asarray(y_test).astype('float32').reshape((-1, 1))

	y_test.shape # dimensions of the reshaped test labels

	# Neural network on the selected features.
	# The input width is read from the training data instead of the hard-coded
	# 5, so the model still builds if the list of selected features changes.
	n_features = x_train.shape[1]
	model = Sequential()
	model.add(Dense(256, input_dim=n_features, activation='relu')) # first hidden layer, 256 units
	model.add(Dense(148, activation='relu')) # second hidden layer, 148 units
	model.add(Dense(64, activation='relu')) # third hidden layer, 64 units
	model.add(Flatten())
	model.add(Dense(1, activation='sigmoid')) # single sigmoid unit for the binary label
	model.summary()

	model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # compile the model

	# Train for 100 epochs; the test split doubles as the validation set.
	history = model.fit(x_train, y_train, epochs=100, batch_size=32,validation_data=(x_test, y_test))

	# Accuracy per epoch: training curve vs. validation (test split) curve.
	plt.plot(history.history['accuracy'])
	plt.plot(history.history['val_accuracy'])
	plt.title('Model accuracy')
	plt.ylabel('Accuracy')
	plt.xlabel('Epoch')
	plt.legend(['Train', 'Test'], loc='upper right')
	plt.show()

	# Loss per epoch: training curve vs. validation (test split) curve.
	plt.plot(history.history['loss'])
	plt.plot(history.history['val_loss'])
	plt.title('Model LOSS')
	plt.ylabel('LOSS')
	plt.xlabel('Epoch')
	plt.legend(['Train', 'Test'], loc='upper right')
	plt.show()

	# The network ends in a single sigmoid unit, so predict() returns one
	# probability per row with shape (n, 1). argmax over axis=1 of a one-column
	# array is always 0, which labelled every row as class 0. Threshold the
	# probability at 0.5 instead to obtain the predicted class.
	probabilities = model.predict(x_test)
	y_pred = (probabilities > 0.5).astype('int32').ravel()
	y_true = np.asarray(y_test).ravel() # flatten the (n, 1) labels to match y_pred

	print('Accuracy-Test:',accuracy_score(y_true,y_pred)) # accuracy
	print('Precision:',precision_score(y_true,y_pred)) # precision
	print('Recall:',recall_score(y_true,y_pred)) # recall
	print('F1-Score:',f1_score(y_true,y_pred)) # f1-score

	# Confusion matrix rendered as an annotated heatmap.
	cm = confusion_matrix(y_true, y_pred)
	sns.heatmap(cm, annot=True,fmt='.2f',cbar=False,cmap='Greens')
	plt.show()

	"""Gradio Implementation"""


	import gradio as gd

	def model_prediction(weight, height,INR_on_Reported,age,VKROC1,model):
	    """Predict the dose class for one patient with the chosen saved model.

	    The five patient values are converted to floats, scaled with the
	    module-level ``std`` scaler and passed to the pickled classifier that
	    matches ``model``.

	    Args:
	        weight, height, INR_on_Reported, age, VKROC1: the five feature values,
	            in the order the final models were trained on. They arrive as
	            strings from the Gradio "text" inputs and are cast to float here.
	            NOTE(review): VKROC1 presumably has to be the label-encoded
	            genotype code rather than the raw genotype text - confirm.
	        model: one of 'Logistic', 'KNN', 'SVM' or 'RForest'.

	    Returns:
	        'HRD(1)' when the classifier predicts class 1, otherwise 'LRD(0)'.

	    Raises:
	        ValueError: if ``model`` is not a known choice (previously an
	            UnboundLocalError) or an input cannot be converted to a number.
	    """
	    # Dropdown label -> pickle written by model_buliding(). The relative names
	    # match what model_buliding() saves; the old "/content/..." paths only
	    # resolved when the working directory happened to be /content (Colab).
	    model_files = {
	        'Logistic': 'lr.pkl',
	        'KNN': 'knn.pkl',
	        'SVM': 'svm.pkl',
	        'RForest': 'rf.pkl',
	    }
	    if model not in model_files:
	        raise ValueError(f"Unknown model {model!r}; choose one of {list(model_files)}")

	    # Text inputs must be numeric before they can be scaled.
	    try:
	        data = [[float(weight), float(height), float(INR_on_Reported), float(age), float(VKROC1)]]
	    except (TypeError, ValueError) as err:
	        raise ValueError("All five patient inputs must be numeric") from err

	    data = std.transform(data) # scale with the statistics the models were trained with

	    # NOTE: pickle.load executes code from the file; only load pickles that
	    # this script produced itself.
	    with open(model_files[model], 'rb') as file:
	        classifier = pickle.load(file)

	    prediction = classifier.predict(data)[0]
	    return 'HRD(1)' if prediction == 1 else 'LRD(0)'



	# Web UI: five free-text inputs for the patient values plus a dropdown to
	# pick the model; the predicted class is shown as text.
	demo = gd.Interface(
	    fn=model_prediction,
	    inputs=[
	        "text",
	        "text",
	        "text",
	        "text",
	        "text",
	        gd.Dropdown(['Logistic','KNN','SVM','RForest']),
	    ],
	    outputs=["text"],
	)
	demo.launch()

	feature_df.head(3) # first three rows of the reduced frame (displayed only in a notebook)