# -*- coding: utf-8 -*-
"""Final Project Warfarin.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/14SbhVS2m0dw-57GiRSq4rwt023z32HlI

Question 1: How is the data preprocessed? How is the data loaded into Python to start the ML pipeline?

---
"""

# importing libraries

import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import pickle

import warnings
warnings.filterwarnings('ignore')
from datasets import load_dataset

# `load_dataset` returns a DatasetDict (split name -> Dataset), which has no
# DataFrame methods such as `.head()` or `.drop()`. Convert every split to
# pandas and stack them so the rest of the pipeline works on one DataFrame.
dataset = load_dataset("shyamsankeerth/dataset1")
df = pd.concat(
    [split.to_pandas() for split in dataset.values()],
    ignore_index=True,
)
df.head()  # first five rows (rendered only in a notebook cell)

df.dtypes  # data type of every column

df.shape  # (rows, columns) of the raw frame

df.columns  # column names of the raw frame

# Drop the two columns that the rest of the pipeline does not use.
df.drop(['Cyp2C9 genotypes', 'Target INR'], axis=1, inplace=True)
df.shape  # dimensions after dropping

# Give the VKORC1 genotype column a short name: copy it under the new name,
# then remove the original. (Copy + drop keeps the new column at the end,
# exactly as before, so the downstream column order is unchanged.)
df['VKORC1 genotype:'] = df['VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T']
df.drop('VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T', axis=1, inplace=True)


# Count the missing values in every column and print the per-column totals.
null_counts = df.isna().sum()
print(null_counts)

# Bar chart of those totals, one green bar per column.
plt.figure(figsize=(6, 6))
plt.bar(null_counts.index, null_counts.values, color='green')
plt.title('Null Values in DataFrame')
plt.xlabel('Columns')
plt.ylabel('Null Values Count')
plt.xticks(rotation=90)  # draw the column names vertically
plt.show()

# Partition the columns by dtype so that each group can be imputed with its
# own strategy. Only float64 columns count as numeric and only object columns
# count as categorical; a column of any other dtype is printed and left out
# of both lists.
numeric_cols = []
categorical_cols = []

for col in df.columns:
    col_dtype = df[col].dtype
    if col_dtype == 'float64':
        numeric_cols.append(col)
    elif col_dtype == 'object':
        categorical_cols.append(col)
    else:
        print(col)

categorical_cols  # names of the string columns

numeric_cols  # names of the float columns

# String-only view of the data and a peek at its first five rows.
s_df = df[categorical_cols]
s_df.head()

# Float-only view of the data and a peek at its first five rows.
n_df = df[numeric_cols]
n_df.head()

from sklearn.impute import SimpleImputer # for filling the nul values


from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Float columns: fill the gaps with scikit-learn's IterativeImputer.
nums = IterativeImputer().fit_transform(n_df)
print(nums)

# String columns: fill the gaps with each column's most frequent value.
imputer = SimpleImputer(strategy="most_frequent")
s = imputer.fit_transform(s_df)
s

# Wrap both imputed arrays back into DataFrames with their original headers.
f_2 = pd.DataFrame(nums, columns=n_df.columns)
f_1 = pd.DataFrame(s, columns=s_df.columns)

# Float columns first, string columns second, side by side.
final_df = pd.concat([f_2, f_1], axis=1)
print(final_df)
print(final_df.shape)

# Missing-value totals after imputation.
null_counts1 = final_df.isna().sum()
print(null_counts1)

# New bar plot showing the missing-value counts after imputation
import seaborn as sns
import matplotlib.pyplot as plt

# Boolean frame marking the cells of final_df that are still missing.
missing_data = final_df.isna()
missing_totals = missing_data.sum()

plt.figure(figsize=(6, 6))
sns.barplot(x=missing_data.columns, y=missing_totals)
plt.title('Missing Values by Column')
plt.xlabel('Columns')
plt.ylabel('Count of Missing Values')
plt.xticks(rotation=90)
plt.show()


"""Question 3: How the categorial textual features are handled?"""

def feature_plot(cols):
    """Draw one histogram per column of ``final_df`` on a two-column grid.

    ``cols`` is the sequence of column names to plot. A 12x8 figure is used
    when there are at most four columns, otherwise a 20x12 figure.
    """
    slots = len(cols) + 1
    grid_rows = slots // 2  # number of two-wide rows needed to hold every column
    fig = plt.figure(figsize=(12, 8) if slots <= 5 else (20, 12))

    for position, col in enumerate(cols, start=1):
        axis = fig.add_subplot(grid_rows, 2, position)
        axis.hist(final_df[col], alpha=0.5)
        axis.set_title(col)

    plt.tight_layout()
    plt.show()

feature_plot(categorical_cols)  # histograms of all string (categorical) columns

feature_plot(numeric_cols)  # histograms of all float (numeric) columns

from sklearn.preprocessing import LabelEncoder # transforming the text data to readable

def tranform_text(col):
    """Label-encode column ``col`` of ``final_df`` in place.

    The column is overwritten with the integer codes and the classes learned
    by the encoder are printed next to the column name.
    """
    encoder = LabelEncoder()
    codes = encoder.fit_transform(final_df[col])
    final_df[col] = codes
    print(col, ':', encoder.classes_)

# Label-encode every string column of final_df (tranform_text mutates it in place).
for col in categorical_cols:
  tranform_text(col)  # encode this column and print its learned classes

final_df.head()  # first five rows after encoding

"""Do any of the variables have multicollinearity issues?"""

# Compute the pairwise correlations once and reuse them for both heatmaps.
corr = final_df.corr()

# Full correlation matrix with every coefficient annotated.
plt.figure(figsize=(10, 8))
sns.heatmap(corr, cmap="RdYlBu", annot=True)
plt.title("Correlation Matrix")
plt.show()

# weight, height, gender are identified as best features

# Same matrix showing only the strong relationships. seaborn hides the cells
# where the mask is True, so masking |r| < 0.5 keeps strong positive AND
# strong negative correlations visible (masking on the signed value would
# hide every negative coefficient, however strong).
plt.figure(figsize=(10, 8))
sns.heatmap(corr, cmap="RdYlBu", annot=True, mask=corr.abs() < 0.5)
plt.title("Correlation Matrix")
plt.show()

"""FEATURE SCALING"""

from sklearn.preprocessing import StandardScaler #importing required libraries
from sklearn.model_selection import train_test_split

new_df = final_df.copy()  # work on a copy so final_df keeps the original target values

new_df.shape  # dimensions of the copied frame

def clf_data(x):
    """Turn a dose value into a class label: 1 (HRD) above 30, else 0 (LRD)."""
    return 1 if x > 30 else 0

# Replace the continuous dose by its binary class label (see clf_data).
binary_dose = new_df['Therapeutic Dose of Warfarin'].apply(clf_data)
new_df['Therapeutic Dose of Warfarin'] = binary_dose

new_df['Therapeutic Dose of Warfarin']

# Target vector and feature matrix for the classifiers.
Y = new_df['Therapeutic Dose of Warfarin']
X = new_df.drop('Therapeutic Dose of Warfarin', axis=1)

# 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=90
)

# Report the shape of each part of the split.
for label, part in (
    ("X-train: ", x_train),
    ("X-test: ", x_test),
    ("Y-train ", y_train),
    ("Y-test", y_test),
):
    print(label, part.shape)

"""Question 4: How to identify anomaly data points/outliers (i.e., Cook's Distance, one-class SVM)"""


#CLASSIFICATION MODELS

from sklearn.svm import SVC #importing all the required packages from libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix #importing all the required metrics
import keras
from keras.models import Sequential
from keras.layers import Dense,Flatten # NN model

def modelBuilding(clf):
    """Fit ``clf`` on the global training split and report test-set metrics.

    Prints accuracy, precision, recall and F1 on ``x_test``/``y_test``, then
    the estimator itself, and finally shows the confusion matrix as a heatmap.
    """
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    scorers = (
        ('Accuracy-Test:', accuracy_score),
        ('Precision:', precision_score),
        ('Recall:', recall_score),
        ('F1-Score:', f1_score),
    )
    for label, scorer in scorers:
        print(label, scorer(y_test, y_pred))

    print()
    print()
    print(clf)

    # Confusion matrix of the test predictions, drawn as an annotated heatmap.
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='.2f', cbar=False, cmap='Greens')
    plt.show()

# Display names of the baseline estimators (kept for backward compatibility).
list1 = ['SVC()', 'LogisticRegression()', 'KNeighborsClassifier()', 'RandomForestClassifier()']

# Instantiate the estimators directly; building them by eval()-ing strings adds
# nothing here and eval is best avoided.
classifiers = [
    SVC(),
    LogisticRegression(),
    KNeighborsClassifier(),
    RandomForestClassifier(),
]

# Fit and evaluate every baseline model with its default hyper-parameters.
for classifier in classifiers:
    modelBuilding(classifier)

#PARAMETERS TUNING


from sklearn.model_selection import RandomizedSearchCV #gridcv #performed paramtere tuning with randomsearch cv

def model_Tuning(model, params):
    """Run a randomized hyper-parameter search for ``model``.

    The search samples candidates from ``params``, scores them by accuracy
    with 5-fold cross-validation on the global training split, prints the
    best parameters and best score, and returns the best-parameter dict.
    """
    search = RandomizedSearchCV(
        model,
        param_distributions=params,
        cv=5,
        scoring='accuracy',
        n_jobs=-1,
    )
    search.fit(x_train, y_train)

    best = search.best_params_
    print('Best Params:', best)
    print('Best Score:', search.best_score_)
    return best

# K-nearest neighbours: search space for the randomized search.
params = {
    'n_neighbors': [5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
}

# Tune, then rebuild the classifier from the winning parameters. The returned
# dict holds exactly the three keys searched above, so it can be unpacked.
mt = model_Tuning(KNeighborsClassifier(), params)
knn = KNeighborsClassifier(**mt)

modelBuilding(knn)  # fit the tuned model and print its test metrics

# Support vector machine: search space for the randomized search.
params = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.001, 0.0001],
    'kernel': ['linear', 'rbf'],
}

# Tune, then rebuild the classifier from the winning C / gamma / kernel.
mt = model_Tuning(SVC(), params)
svm = SVC(**mt)
modelBuilding(svm)  # fit the tuned model and print its test metrics

# Random forest: search space for the randomized search.
# NOTE: 'auto' was dropped from max_features. For classifiers it was only an
# alias of 'sqrt', and current scikit-learn releases reject it, which made
# every candidate that sampled it fail to fit. 'log2' is searched instead so
# that the grid still offers two distinct choices.
params = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 25],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 7],
    'n_estimators': [200, 400, 600, 800, 1000],
}

# Tune, then rebuild the forest from the winning parameters.
mt = model_Tuning(RandomForestClassifier(), params)

rf = RandomForestClassifier(
    bootstrap=mt['bootstrap'],
    max_depth=mt['max_depth'],
    max_features=mt['max_features'],
    min_samples_leaf=mt['min_samples_leaf'],
    min_samples_split=mt['min_samples_split'],
    n_estimators=mt['n_estimators'],
)
modelBuilding(rf)  # fit the tuned model and print its test metrics

# Logistic regression: search space for the randomized search.
params = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
}

# The default 'lbfgs' solver does not support the 'l1' penalty, so every
# candidate that sampled it failed to fit. 'liblinear' supports both 'l1' and
# 'l2', which makes the whole grid valid; the final model must use the same
# solver so that the selected penalty is accepted.
mt = model_Tuning(LogisticRegression(solver='liblinear'), params)
lr = LogisticRegression(solver='liblinear', penalty=mt['penalty'], C=mt['C'])
modelBuilding(lr)  # fit the tuned model and print its test metrics

"""Neural Network  - Model"""

# Keras expects the labels as a float column vector of shape (n_samples, 1).
y_train = np.asarray(y_train).astype('float32').reshape((-1, 1))
y_test = np.asarray(y_test).astype('float32').reshape((-1, 1))

# Take the input width from the data instead of hard-coding it, so the network
# still builds if the number of feature columns changes.
n_features = x_train.shape[1]

# Fully connected binary classifier: 256 -> 128 -> 1 (sigmoid).
model = Sequential()
model.add(Dense(256, input_dim=n_features, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Flatten())  # kept from the original architecture
model.add(Dense(1, activation='sigmoid'))
model.summary()

# Binary cross-entropy matches the single sigmoid output.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 100 epochs; the test split is used as validation data.
history = model.fit(x_train, y_train, epochs=100, batch_size=28, validation_data=(x_test, y_test))

# One figure per tracked quantity: the training curve and the validation
# curve (labelled 'Test') plotted against the epoch number.
for metric, title, y_label in (
    ('accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'Model loss', 'Loss'),
):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper right')
    plt.show()

""" Regression Models"""

# Features and continuous target for the regression models.
X = final_df.drop('Therapeutic Dose of Warfarin', axis=1)
Y = final_df['Therapeutic Dose of Warfarin']

# Split BEFORE scaling. Fitting the scaler on the full data set would let the
# test rows influence the mean/std used for training (data leakage).
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=90)

std = StandardScaler()
x_train = std.fit_transform(x_train)  # learn mean/std from the training rows only
x_test = std.transform(x_test)        # reuse the training statistics on the test rows

from sklearn.linear_model import LinearRegression # importing all the required packages
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error

def regressor(model):
    """Fit ``model`` on the global training split and print regression metrics.

    R2, mean squared error and mean absolute error are computed on the
    predictions for ``x_test`` against ``y_test``.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    for label, metric in (
        ('R2-Score:', r2_score),
        ('MSE:', mean_squared_error),
        ('MAE:', mean_absolute_error),
    ):
        print(label, metric(y_test, y_pred))

# The three regressors to compare, all with default hyper-parameters.
lir = LinearRegression()        # linear regression
rfr = RandomForestRegressor()   # random forest
tree = DecisionTreeRegressor()  # single decision tree

# Fit each one and print its R2 / MSE / MAE, in the same order as above.
for estimator in (lir, rfr, tree):
    regressor(estimator)

rf.feature_importances_  # importance scores of the fitted random forest classifier

new_df.columns  # all column names, target included

# Feature-only frame whose columns label the bars of the importance chart.
new11 = new_df.drop('Therapeutic Dose of Warfarin', axis=1)

# Horizontal bar chart: one green bar per feature.
plt.barh(new11.columns, rf.feature_importances_, color='g')

# Keep the five selected features together with the target.
selected_columns = [
    'Weight (kg)',
    'Height (cm)',
    'INR on Reported Therapeutic Dose of Warfarin',
    'Age',
    'VKORC1 genotype:',
    'Therapeutic Dose of Warfarin',
]
feature_df = new_df[selected_columns]
feature_df.head()  # first five rows of the reduced frame

feature_df.shape  # dimensions of the reduced frame

# Features and binary target restricted to the five selected columns.
X = feature_df.drop('Therapeutic Dose of Warfarin', axis=1)
Y = feature_df['Therapeutic Dose of Warfarin']

# Split BEFORE scaling. Fitting the scaler on the full data set would let the
# test rows influence the mean/std used for training (data leakage).
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=90)

# `std` is also used later to scale the values entered in the Gradio form.
std = StandardScaler()
x_train = std.fit_transform(x_train)  # learn mean/std from the training rows only
x_test = std.transform(x_test)        # reuse the training statistics on the test rows

"""FINAL MODELS FOR PREDICTION"""

def model_buliding(model, filename):
    """Fit ``model`` on the global training split, pickle it, and report metrics.

    Args:
        model: Estimator exposing ``fit`` and ``predict``.
        filename: Path of the file the fitted estimator is pickled to.

    Prints accuracy, precision, recall and F1 on the global test split and
    shows the confusion matrix as a heatmap.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Save the fitted model. The context manager guarantees that the file is
    # flushed and closed even if pickling fails.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    print('Accuracy-Test:', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    print('F1-Score:', f1_score(y_test, y_pred))

    # Confusion matrix of the test predictions, drawn as an annotated heatmap.
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='.2f', cbar=False, cmap='Greens')
    plt.show()

# Refit each tuned classifier on the selected features and pickle it:
# logistic regression, random forest, SVM and k-nearest neighbours, in that order.
for estimator, pickle_name in (
    (lr, 'lr.pkl'),
    (rf, 'rf.pkl'),
    (svm, 'svm.pkl'),
    (knn, 'knn.pkl'),
):
    model_buliding(estimator, pickle_name)

# Keras expects the labels as a float column vector of shape (n_samples, 1).
y_train, y_test = (
    np.asarray(labels).astype('float32').reshape((-1, 1))
    for labels in (y_train, y_test)
)

y_test.shape  # dimensions of the reshaped test labels

# Fully connected binary classifier on the five selected features:
# 256 -> 148 -> 64 -> 1 (sigmoid).
model = Sequential()
for layer in (
    Dense(256, input_dim=5, activation='relu'),
    Dense(148, activation='relu'),
    Dense(64, activation='relu'),
    Flatten(),
    Dense(1, activation='sigmoid'),
):
    model.add(layer)
model.summary()

# Binary cross-entropy loss, Adam optimizer, accuracy as the tracked metric.
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 100 epochs; the test split is used as validation data.
history = model.fit(
    x_train,
    y_train,
    epochs=100,
    batch_size=32,
    validation_data=(x_test, y_test),
)

# One figure per tracked quantity: the training curve and the validation
# curve (labelled 'Test') plotted against the epoch number.
for metric, title, y_label in (
    ('accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'Model LOSS', 'LOSS'),
):
    plt.plot(history.history[metric])
    plt.plot(history.history['val_' + metric])
    plt.title(title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper right')
    plt.show()

# The network ends in a single sigmoid unit, so predict() returns one
# probability per row. argmax over that single column is always 0 and would
# label every sample as class 0; threshold the probability at 0.5 instead.
probabilities = model.predict(x_test)
y_pred = (probabilities > 0.5).astype(int).ravel()

print('Accuracy-Test:', accuracy_score(y_test, y_pred))
print('Precision:', precision_score(y_test, y_pred))
print('Recall:', recall_score(y_test, y_pred))
print('F1-Score:', f1_score(y_test, y_pred))

# Confusion matrix of the test predictions, drawn as an annotated heatmap.
cm = confusion_matrix(y_test, y_pred)
sns.heatmap(cm, annot=True, fmt='.2f', cbar=False, cmap='Greens')
plt.show()

"""Gradio Implementation"""


import gradio as gd

def model_prediction(weight, height, INR_on_Reported, age, VKROC1, model):
    """Predict the warfarin dose class for one patient with a saved model.

    Args:
        weight, height, INR_on_Reported, age, VKROC1: The five feature values,
            in the order the scaler and the models were trained on. They may
            be passed as strings (the Gradio form uses "text" inputs) and must
            be numeric, i.e. already in the encoded form used for training.
        model: Which saved classifier to use: 'Logistic', 'KNN', 'SVM' or
            'RForest'.

    Returns:
        'HRD(1)' when the model predicts class 1, otherwise 'LRD(0)'.

    Raises:
        ValueError: If ``model`` is not a known choice or a feature value
            cannot be converted to a number.
    """
    # The files are written by model_buliding() under these relative names, so
    # they are loaded from the same place instead of a hard-coded /content/ path.
    model_files = {
        'Logistic': 'lr.pkl',
        'KNN': 'knn.pkl',
        'SVM': 'svm.pkl',
        'RForest': 'rf.pkl',
    }
    if model not in model_files:
        # Previously an unknown choice fell through every branch and crashed
        # on an unbound `prediction` variable.
        raise ValueError(
            f"Unknown model {model!r}; expected one of {sorted(model_files)}"
        )

    # Text inputs arrive as strings; the scaler needs numbers.
    features = (weight, height, INR_on_Reported, age, VKROC1)
    data = [[float(value) for value in features]]

    data = std.transform(data)  # same scaling as the training data

    # NOTE: pickle can execute arbitrary code on load. Only files produced by
    # this script should ever be placed under these names.
    with open(model_files[model], 'rb') as file:
        estimator = pickle.load(file)

    prediction = estimator.predict(data)[0]
    return 'HRD(1)' if prediction == 1 else 'LRD(0)'
    
    
# Five free-text boxes (one per feature, in model_prediction's argument order)
# followed by a dropdown that selects the saved model.
input_components = [
    "text",
    "text",
    "text",
    "text",
    "text",
    gd.Dropdown(['Logistic', 'KNN', 'SVM', 'RForest']),
]

# Build the web form around model_prediction and start serving it.
app = gd.Interface(fn=model_prediction, inputs=input_components, outputs=["text"])
app.launch()

feature_df.head(3)  # first three rows of the selected features, for reference