Spaces:
Runtime error
Runtime error
| # -*- coding: utf-8 -*- | |
| """Final Project Warfarin.ipynb | |
| Automatically generated by Colaboratory. | |
| Original file is located at | |
| https://colab.research.google.com/drive/14SbhVS2m0dw-57GiRSq4rwt023z32HlI | |
| Question 1: How is the data preprocessed? How is the data loaded into Python to start the ML pipeline? | |
| --- | |
| """ | |
| # importing libraries | |
| import pandas as pd | |
| import numpy as np | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| import pickle | |
| import warnings | |
| warnings.filterwarnings('ignore') | |
| from datasets import load_dataset | |
# Load the Warfarin dataset from the Hugging Face Hub.
# load_dataset() called without `split` returns a DatasetDict (split name ->
# Dataset), which has none of the DataFrame methods used below, so select a
# single split and convert it to a pandas DataFrame first.
dataset = load_dataset("shyamsankeerth/dataset1")
if isinstance(dataset, dict):
    # NOTE(review): assumes the rows live in the "train" split; otherwise the
    # first available split is used -- confirm against the dataset repo.
    dataset = dataset["train"] if "train" in dataset else next(iter(dataset.values()))
df = dataset.to_pandas()

# Notebook-style inspection (bare expressions only echo inside a notebook).
df.head()     # first five rows
df.dtypes     # data type of every column
df.shape      # (rows, columns)
df.columns    # column names

# Drop the two columns that are not used by the rest of the pipeline.
df.drop(['Cyp2C9 genotypes', 'Target INR'], axis=1, inplace=True)
df.shape      # dimensions after dropping the columns

# Give the VKORC1 column a short name. pop() removes the long-named column and
# the assignment appends it under the new name, so the column ends up last,
# exactly as the original copy-then-drop did.
df['VKORC1 genotype:'] = df.pop('VKORC1 genotype: -1639 G>A (3673); chr16:31015190; rs9923231; C/T')
# Count the missing values in every column and print the per-column totals.
null_counts = df.isnull().sum()
print(null_counts)

# Bar chart: one green bar per column, height = number of missing values.
plt.figure(figsize=(6, 6))
plt.bar(null_counts.index, null_counts.values, color='green')
plt.title('Null Values in DataFrame')
plt.xlabel('Columns')
plt.ylabel('Null Values Count')
plt.xticks(rotation=90)  # column names are long; rotate them so they stay readable
plt.show()
# Partition the columns by dtype so each group can be imputed separately.
# Only float64 columns count as numeric and only object columns as categorical;
# a column of any other dtype is printed and left out of both lists.
numeric_cols = []
categorical_cols = []
for column_name in df.columns:
    column_dtype = df[column_name].dtype
    if column_dtype == 'object':
        categorical_cols.append(column_name)
    elif column_dtype in ['float64']:
        numeric_cols.append(column_name)
    else:
        print(column_name)

categorical_cols  # names of the string columns
numeric_cols      # names of the float columns

# String-valued part of the data.
s_df = df[categorical_cols]
s_df.head()

# Float-valued part of the data.
n_df = df[numeric_cols]
n_df.head()
from sklearn.impute import SimpleImputer
from sklearn.experimental import enable_iterative_imputer  # must be imported before IterativeImputer
from sklearn.impute import IterativeImputer

# Float columns: fill missing values with IterativeImputer (default settings).
nums = IterativeImputer().fit_transform(n_df)
print(nums)

# String columns: fill missing values with each column's most frequent value.
imputer = SimpleImputer(strategy="most_frequent")
s = imputer.fit_transform(s_df)
s

# Rebuild labelled frames from the imputed arrays and place them side by side,
# float columns first, string columns second.
f_2 = pd.DataFrame(nums, columns=n_df.columns)
f_1 = pd.DataFrame(s, columns=s_df.columns)
final_df = pd.concat([f_2, f_1], axis="columns")
print(final_df)
print(final_df.shape)

# Missing-value counts after imputation.
null_counts1 = final_df.isnull().sum()
print(null_counts1)
# Second bar plot: missing values per column after imputation.
import seaborn as sns
import matplotlib.pyplot as plt

missing_data = final_df.isna()
plt.figure(figsize=(6, 6))
sns.barplot(x=missing_data.columns, y=missing_data.sum())
plt.title('Missing Values by Column')
plt.xlabel('Columns')
plt.ylabel('Count of Missing Values')
plt.xticks(rotation=90)
plt.show()
| """Question 3: How are the categorical textual features handled?""" | |
def feature_plot(cols):
    """Show one histogram per column of the module-level ``final_df``.

    The histograms are laid out on a grid that is two subplots wide.

    Args:
        cols: iterable of column names present in ``final_df``.
    """
    slots = len(cols) + 1          # +1 so an odd number of columns rounds up
    grid_rows = slots // 2
    grid_width = 2
    # Few plots fit on a small canvas; larger grids get a bigger figure.
    canvas_size = (12, 8) if slots <= 5 else (20, 12)
    fig = plt.figure(figsize=canvas_size)
    for position, col in enumerate(cols, start=1):
        ax = fig.add_subplot(grid_rows, grid_width, position)
        ax.hist(final_df[col], alpha=0.5)
        ax.set_title(col)
    plt.tight_layout()
    plt.show()
# Histograms for the string columns first, then for the float columns.
for column_group in (categorical_cols, numeric_cols):
    feature_plot(column_group)
| from sklearn.preprocessing import LabelEncoder # transforming the text data to readable | |
def tranform_text(col):
    """Label-encode column ``col`` of the module-level ``final_df`` in place.

    A fresh LabelEncoder is fitted on the column, the column is replaced by
    its integer codes, and the learned classes are printed.

    Args:
        col: name of the column in ``final_df`` to encode.
    """
    encoder = LabelEncoder()
    encoder.fit(final_df[col])
    final_df[col] = encoder.transform(final_df[col])
    print(col, ':', encoder.classes_)
# Encode every string column as integer labels.
for text_column in categorical_cols:
    tranform_text(text_column)
final_df.head()  # first five rows after encoding

"""Do any of the variables have multicollinearity issues?"""
# The correlation matrix is needed by both heatmaps, so compute it once.
corr_matrix = final_df.corr()

# Full correlation heatmap.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, cmap="RdYlBu", annot=True)
plt.title("Correlation Matrix")
plt.show()

# weight, height, gender are identified as best features
# Same heatmap, but cells whose correlation is below 0.5 are masked (hidden),
# leaving only the stronger positive correlations visible.
plt.figure(figsize=(10, 8))
sns.heatmap(corr_matrix, cmap="RdYlBu", annot=True, mask=corr_matrix < 0.5)
plt.title("Correlation Matrix")
plt.show()
| """FEATURE SCALING""" | |
| from sklearn.preprocessing import StandardScaler #importing required libraries | |
| from sklearn.model_selection import train_test_split | |
# Work on an independent copy: the target column of new_df is turned into
# class labels below, while final_df keeps the continuous dose for regression.
new_df = final_df.copy(deep=True)
new_df.shape  # dimensions of the new data frame
def clf_data(x, threshold=30):
    """Convert a numeric dose into a binary class label.

    Args:
        x: numeric dose value.
        threshold: cutoff separating the two classes. Defaults to 30, the
            value that was previously hard-coded.

    Returns:
        1 (HRD) when ``x`` is strictly greater than ``threshold``,
        otherwise 0 (LRD). NaN compares false and therefore maps to 0.
    """
    return 1 if x > threshold else 0
# Replace the continuous dose by its class label (see clf_data).
dose_labels = new_df['Therapeutic Dose of Warfarin'].apply(clf_data)
new_df['Therapeutic Dose of Warfarin'] = dose_labels
new_df['Therapeutic Dose of Warfarin']

# Features = every column except the target; target = the class label.
Y = new_df['Therapeutic Dose of Warfarin']
X = new_df.drop('Therapeutic Dose of Warfarin', axis=1)

# 80/20 train/test split with a fixed seed.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=90)

# Report the dimensions of the four resulting pieces.
for caption, part in (("X-train: ", x_train), ("X-test: ", x_test), ("Y-train ", y_train), ("Y-test", y_test)):
    print(caption, part.shape)
| """Question 4: How to identify anomaly data points/outliers (i.e., Cook's Distance, one-class SVM)""" | |
| #CLASSIFICATION MODELS | |
| from sklearn.svm import SVC #importing all the required packages from libraries | |
| from sklearn.linear_model import LogisticRegression | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.neighbors import KNeighborsClassifier | |
| from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix #importing all the required metrics | |
| import keras | |
| from keras.models import Sequential | |
| from keras.layers import Dense,Flatten # NN model | |
def modelBuilding(clf):
    """Fit ``clf`` on the module-level training split and report test metrics.

    Prints accuracy, precision, recall and F1 computed on
    ``x_test``/``y_test``, prints the estimator itself, and shows the
    confusion matrix as a heatmap.

    Args:
        clf: estimator exposing ``fit`` and ``predict``.
    """
    clf.fit(x_train, y_train)
    y_pred = clf.predict(x_test)

    scorers = (
        ('Accuracy-Test:', accuracy_score),
        ('Precision:', precision_score),
        ('Recall:', recall_score),
        ('F1-Score:', f1_score),
    )
    for caption, scorer in scorers:
        print(caption, scorer(y_test, y_pred))
    print()
    print()
    print(clf)

    # Confusion matrix rendered as an annotated heatmap.
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='.2f', cbar=False, cmap='Greens')
    plt.show()
| #displaying the heatmap | |
# Baseline run: evaluate each classifier with its default hyper-parameters.
# The estimators are instantiated directly; building them by eval()-ing
# constructor strings added nothing and is an unsafe pattern.
classifiers = [
    SVC(),
    LogisticRegression(),
    KNeighborsClassifier(),
    RandomForestClassifier(),
]
for classifier in classifiers:
    modelBuilding(classifier)
| #PARAMETERS TUNING | |
| from sklearn.model_selection import RandomizedSearchCV #gridcv #performed paramtere tuning with randomsearch cv | |
def model_Tuning(model, params):
    """Run a randomized hyper-parameter search for ``model``.

    The search uses 5-fold cross-validation on the module-level
    ``x_train``/``y_train``, is scored by accuracy and uses all CPU cores.
    The best parameters and best score are printed.

    Args:
        model: estimator to tune.
        params: mapping of parameter name to the candidate values to sample.

    Returns:
        dict with the best parameter combination found.
    """
    search = RandomizedSearchCV(
        model,
        param_distributions=params,
        scoring='accuracy',
        cv=5,
        n_jobs=-1,
    )
    search.fit(x_train, y_train)
    best_params = search.best_params_
    print('Best Params:', best_params)
    print('Best Score:', search.best_score_)
    return best_params
# KNN: search the neighbour count, weighting scheme and distance metric, then
# rebuild the classifier with the best combination and evaluate it.
params = {
    'n_neighbors': [5, 7, 9, 11],
    'weights': ['uniform', 'distance'],
    'metric': ['minkowski', 'euclidean', 'manhattan'],
}
mt = model_Tuning(KNeighborsClassifier(), params)
# mt holds exactly the three searched keys, so it can be unpacked directly.
knn = KNeighborsClassifier(**mt)
modelBuilding(knn)

# SVM: search C, gamma and the kernel, then evaluate the tuned model.
params = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.001, 0.0001],
    'kernel': ['linear', 'rbf'],
}
mt = model_Tuning(SVC(), params)
svm = SVC(**mt)
modelBuilding(svm)
# Random forest: randomized search over the tree-shape and ensemble-size
# parameters, then evaluate a forest built with the best combination.
# 'auto' is no longer an accepted max_features value in current scikit-learn
# (for classifiers it was an alias of 'sqrt'), so every candidate using it
# failed to fit; 'log2' is offered as the second valid option instead.
params = {
    'bootstrap': [True, False],
    'max_depth': [10, 20, 25],
    'min_samples_leaf': [1, 2, 3],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 7],
    'n_estimators': [200, 400, 600, 800, 1000],
}
mt = model_Tuning(RandomForestClassifier(), params)
rf = RandomForestClassifier(
    bootstrap=mt['bootstrap'],
    max_depth=mt['max_depth'],
    max_features=mt['max_features'],
    min_samples_leaf=mt['min_samples_leaf'],
    min_samples_split=mt['min_samples_split'],
    n_estimators=mt['n_estimators'],
)
modelBuilding(rf)
# Logistic regression: search the penalty type and regularisation strength.
# The default 'lbfgs' solver does not support the 'l1' penalty, so every 'l1'
# candidate failed during the search and an 'l1' refit would raise. The
# 'liblinear' solver supports both 'l1' and 'l2', so it is used for the search
# and for the final model.
params = {
    'penalty': ['l1', 'l2'],
    'C': [0.001, 0.01, 0.1, 1, 10, 100],
}
mt = model_Tuning(LogisticRegression(solver='liblinear'), params)
lr = LogisticRegression(solver='liblinear', penalty=mt['penalty'], C=mt['C'])
modelBuilding(lr)
| """Neural Network - Model""" | |
# Keras expects the binary targets as float column vectors of shape (n, 1).
y_train = np.asarray(y_train).astype('float32').reshape((-1, 1))
y_test = np.asarray(y_test).astype('float32').reshape((-1, 1))

# Neural network: two hidden ReLU layers and a sigmoid output unit.
# The input width is read from the training data instead of being hard-coded,
# so the model keeps working if the number of feature columns changes.
n_features = x_train.shape[1]
model = Sequential()
model.add(Dense(256, input_dim=n_features, activation='relu'))
model.add(Dense(128, activation='relu'))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 100 epochs; the test split doubles as the validation data.
history = model.fit(x_train, y_train, epochs=100, batch_size=28, validation_data=(x_test, y_test))

# One figure per metric: training curve vs. validation ("Test") curve.
for metric_key, plot_title, y_label in (
    ('accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'Model loss', 'Loss'),
):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title(plot_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper right')
    plt.show()
| """ Regression Models""" | |
# Regression uses the continuous dose in final_df as the target.
X = final_df.drop('Therapeutic Dose of Warfarin', axis=1)
Y = final_df['Therapeutic Dose of Warfarin']

# Split first, then fit the scaler on the training rows only. Fitting it on
# all rows before splitting leaks test-set statistics into training.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=90)
std = StandardScaler()
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)
| from sklearn.linear_model import LinearRegression # importing all the required packages | |
| from sklearn.tree import DecisionTreeRegressor | |
| from sklearn.ensemble import RandomForestRegressor | |
| from sklearn.metrics import r2_score,mean_squared_error,mean_absolute_error | |
def regressor(model):
    """Fit ``model`` on the module-level training split and print test errors.

    Prints the R2 score, mean squared error and mean absolute error of the
    predictions on ``x_test`` against ``y_test``.

    Args:
        model: regression estimator exposing ``fit`` and ``predict``.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    for caption, metric in (
        ('R2-Score:', r2_score),
        ('MSE:', mean_squared_error),
        ('MAE:', mean_absolute_error),
    ):
        print(caption, metric(y_test, y_pred))
# Evaluate three regressors with default settings, in this order:
# linear regression, random forest, decision tree.
lir = LinearRegression()
rfr = RandomForestRegressor()
tree = DecisionTreeRegressor()
for estimator in (lir, rfr, tree):
    regressor(estimator)
# Feature importances of the tuned random-forest classifier.
rf.feature_importances_
new_df.columns  # all columns, including the target

# Horizontal bar chart: one bar per feature column (target excluded).
new11 = new_df.drop(columns=['Therapeutic Dose of Warfarin'])
plt.barh(new11.columns, rf.feature_importances_, color='g')
# Show the chart explicitly; otherwise, outside a notebook, the figure stays
# current and the next heatmap is drawn on top of these bars.
plt.show()

# Keep the five selected features plus the target.
feature_df = new_df[['Weight (kg)', 'Height (cm)', 'INR on Reported Therapeutic Dose of Warfarin', 'Age', 'VKORC1 genotype:', 'Therapeutic Dose of Warfarin']]
feature_df.head()
feature_df.shape
# Features/target for the final models (five selected features only).
X = feature_df.drop('Therapeutic Dose of Warfarin', axis=1)
Y = feature_df['Therapeutic Dose of Warfarin']

# Split first, then fit the scaler on the training rows only so no test-set
# statistics leak into training. `std` stays a module-level name because the
# prediction function reuses it to scale user input.
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=90)
std = StandardScaler()
x_train = std.fit_transform(x_train)
x_test = std.transform(x_test)
| """FINAL MODELS FOR PREDICTION""" | |
def model_buliding(model, filename):
    """Fit ``model``, pickle it to ``filename`` and report test metrics.

    The model is trained on the module-level ``x_train``/``y_train`` and
    evaluated on ``x_test``/``y_test``. Accuracy, precision, recall and F1
    are printed and the confusion matrix is shown as a heatmap.

    Args:
        model: classifier exposing ``fit`` and ``predict``.
        filename: path the fitted model is pickled to.
    """
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)

    # Save the fitted model; the context manager guarantees the file is
    # flushed and closed even if pickling raises.
    with open(filename, 'wb') as model_file:
        pickle.dump(model, model_file)

    print('Accuracy-Test:', accuracy_score(y_test, y_pred))
    print('Precision:', precision_score(y_test, y_pred))
    print('Recall:', recall_score(y_test, y_pred))
    print('F1-Score:', f1_score(y_test, y_pred))

    # Confusion matrix rendered as an annotated heatmap.
    cm = confusion_matrix(y_test, y_pred)
    sns.heatmap(cm, annot=True, fmt='.2f', cbar=False, cmap='Greens')
    plt.show()
# Train, save and evaluate the four tuned classifiers on the selected features:
# logistic regression, random forest, SVM and k-nearest neighbours.
for estimator, pickle_name in ((lr, 'lr.pkl'), (rf, 'rf.pkl'), (svm, 'svm.pkl'), (knn, 'knn.pkl')):
    model_buliding(estimator, pickle_name)
# Keras expects the binary targets as float column vectors of shape (n, 1).
y_train = np.asarray(y_train).astype('float32').reshape((-1, 1))
y_test = np.asarray(y_test).astype('float32').reshape((-1, 1))
y_test.shape

# Neural network on the selected features: three hidden ReLU layers and a
# sigmoid output unit. The input width is taken from the training data rather
# than hard-coded.
n_features = x_train.shape[1]
model = Sequential()
model.add(Dense(256, input_dim=n_features, activation='relu'))
model.add(Dense(148, activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.summary()
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train for 100 epochs; the test split doubles as the validation data.
history = model.fit(x_train, y_train, epochs=100, batch_size=32, validation_data=(x_test, y_test))

# One figure per metric: training curve vs. validation ("Test") curve.
for metric_key, plot_title, y_label in (
    ('accuracy', 'Model accuracy', 'Accuracy'),
    ('loss', 'Model LOSS', 'LOSS'),
):
    plt.plot(history.history[metric_key])
    plt.plot(history.history['val_' + metric_key])
    plt.title(plot_title)
    plt.ylabel(y_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper right')
    plt.show()

# The network has a single sigmoid output, so predict() returns shape (n, 1).
# argmax over axis 1 of such an array is always 0, which labelled every sample
# as class 0. Threshold the probability at 0.5 to get the class label instead.
probabilities = model.predict(x_test)
y_pred = (probabilities > 0.5).astype('int32').ravel()
y_true = y_test.ravel()

print('Accuracy-Test:', accuracy_score(y_true, y_pred))
print('Precision:', precision_score(y_true, y_pred))
print('Recall:', recall_score(y_true, y_pred))
print('F1-Score:', f1_score(y_true, y_pred))

# Confusion matrix rendered as an annotated heatmap.
cm = confusion_matrix(y_true, y_pred)
sns.heatmap(cm, annot=True, fmt='.2f', cbar=False, cmap='Greens')
plt.show()
| """Gradio Implementation""" | |
| import gradio as gd | |
def model_prediction(weight, height, INR_on_Reported, age, VKROC1, model):
    """Predict the dose class for one patient with a previously saved model.

    Args:
        weight, height, INR_on_Reported, age, VKROC1: the five feature
            values, in the order the scaler ``std`` was fitted on. They may
            arrive as text (the UI uses text boxes) and are converted to
            float here.
            NOTE(review): VKROC1 must be the numeric label-encoded code, not
            the genotype string; the encoder is not available here -- confirm.
        model: one of 'Logistic', 'KNN', 'SVM', 'RForest'.

    Returns:
        'HRD(1)' when the model predicts class 1, otherwise 'LRD(0)'.

    Raises:
        ValueError: if ``model`` is not a known name or a feature value
            cannot be converted to float.
    """
    # Pickle files are looked up by the same relative names they were saved
    # under, so loading works from any working directory, not only /content.
    model_files = {
        'Logistic': 'lr.pkl',
        'KNN': 'knn.pkl',
        'SVM': 'svm.pkl',
        'RForest': 'rf.pkl',
    }
    if model not in model_files:
        # Previously an unknown name fell through and hit an unbound variable.
        raise ValueError(f"Unknown model {model!r}; expected one of {sorted(model_files)}")

    # Text inputs must be made numeric before the scaler will accept them.
    features = [[float(value) for value in (weight, height, INR_on_Reported, age, VKROC1)]]
    data = std.transform(features)

    # NOTE: pickle.load can execute arbitrary code; only load files written
    # by this script.
    with open(model_files[model], 'rb') as model_file:
        estimator = pickle.load(model_file)

    prediction = estimator.predict(data)[0]
    return 'HRD(1)' if prediction == 1 else 'LRD(0)'
# Web UI: five free-text feature inputs plus a drop-down to choose the model;
# the predicted class is shown as text.
model_choice = gd.Dropdown(['Logistic', 'KNN', 'SVM', 'RForest'])
app = gd.Interface(
    fn=model_prediction,
    inputs=["text", "text", "text", "text", "text", model_choice],
    outputs=["text"],
)
app.launch()
feature_df.head(3)