Spaces:
No application file
No application file
| # Importing libraries. | |
| import time | |
| import streamlit as st | |
| import seaborn as sns | |
| import matplotlib.pyplot as plt | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.model_selection import cross_val_score | |
| from sklearn.metrics import accuracy_score, confusion_matrix | |
| from ml.mlmodel import MLModels | |
def eda(df):
    '''
    Description:
        Method that provides various EDA options.
    Parameters:
        df - A pandas dataframe (assumed to contain a 'target' column
             when the value-counts option is used -- TODO confirm upstream).
    Returns:
        Nothing.
    '''
    # Tuple-unpack the shape directly instead of indexing twice.
    rows, columns = df.shape
    st.info(f'Rows = {rows}, Columns = {columns}')
    if st.checkbox('Show Target Classes and Value Counts'):
        # Frequency of each class label in the 'target' column.
        st.dataframe(df.target.value_counts())
    if st.checkbox("Show DataFrame"):
        # Let the user pick how many leading rows to preview.
        preview_count = st.number_input(label="Enter number of rows",
                                        min_value=5, max_value=rows)
        st.dataframe(df.head(preview_count))
    if st.checkbox("Describe The Data"):
        # Summary statistics (count, mean, std, quartiles, ...).
        st.dataframe(df.describe())
    if st.checkbox("Show DataFrame By Specific Columns"):
        # Column subset chosen interactively by the user.
        chosen_cols = st.multiselect("Select Columns", df.columns)
        st.dataframe(df[chosen_cols])
    if st.checkbox("Show Data Types"):
        st.dataframe(df.dtypes)
def vis(df):
    '''
    Description:
        Method for various visualization options.
    Parameters:
        df - A pandas dataframe (assumed to contain a 'target' column
             for the value-count, pair, pie and scatter plots -- TODO confirm).
    Returns:
        Nothing.
    '''
    # NOTE: each plot is drawn on an explicit, freshly created figure and
    # passed to st.pyplot(fig). Calling st.pyplot() with no argument relies
    # on matplotlib's global figure, which is deprecated in Streamlit and
    # lets plots from successive reruns accumulate on the same canvas.
    if st.button("Correlational Matrix"):
        with st.spinner('Generating A Correlational Matrix...'):
            time.sleep(3)  # deliberate pause so the spinner is visible
            fig, ax = plt.subplots()
            sns.heatmap(df.corr(), annot=True, ax=ax)
            st.pyplot(fig)
    if st.button("Value Counts"):
        with st.spinner('Generating A Value Count Plot...'):
            time.sleep(3)
            fig, ax = plt.subplots()
            df.target.value_counts().plot(kind='barh', ax=ax)
            st.pyplot(fig)
    if st.button("Pair Plot"):
        with st.spinner('Generating A Pair Plot...'):
            time.sleep(3)
            # pairplot builds its own figure; hand that figure to Streamlit.
            grid = sns.pairplot(df, hue='target')
            st.pyplot(grid.fig)
    if st.button("Pie Chart"):
        with st.spinner('Generating A Pie Chart...'):
            time.sleep(3)
            fig, ax = plt.subplots()
            df.target.value_counts().plot.pie(autopct='%1.2f%%', ax=ax)
            st.pyplot(fig)
    if st.checkbox('Scatter Plot'):
        x_val = st.selectbox('Select a column for x-axis', df.columns)
        y_val = st.selectbox('Select a column for y-axis', df.columns)
        with st.spinner('Generating A Scatter Plot...'):
            time.sleep(3)
            fig, ax = plt.subplots()
            # Color points by class label so clusters are visible.
            ax.scatter(df[x_val], df[y_val], c=df.target)
            ax.set_xlabel(x_val)
            ax.set_ylabel(y_val)
            st.pyplot(fig)
def ml(df):
    '''
    Description:
        Method for handling all the machine learning options.
    Parameters:
        df - A pandas dataframe; the last column is treated as the target
             and all preceding columns as features.
    Returns:
        Nothing.
    '''
    def run_ml_model(model_name):
        '''
        Description:
            An inner method for running a machine learning model.
        Parameters:
            model_name - A machine learning model name as a string.
        Returns:
            Nothing.
        '''
        if model_name == 'Linear Regression':
            # Regression branch: report learned parameters instead of accuracy.
            lin_reg = clf.linear_regression()
            lin_reg.fit(x_train, y_train)
            coeff = lin_reg.coef_
            intercept = lin_reg.intercept_
            st.success(f'The coefficients = {coeff}')
            st.success(f'The intercept = {intercept}')
            st.write('Now make an equation of the form y = a1*x1 + a2*x2 + ... an*xn + c')
            st.write('and plugin the features and compare the value you get with the actual target value.')
            st.info('NOTE: Linear Regression is not for classification problems. Hence, use it for Boston Houses or Diabetes dataset to understand this algorithm deeply.')
        elif model_name == 'Logistic Regression':
            # C: inverse regularization strength.
            C = st.slider(label='Choose C', min_value=0.1, max_value=5.0)
            log_reg = clf.logistic_regression(C)
            train_and_display_metrics(log_reg)
            if st.checkbox('KFold Cross Validation'):
                run_kfold(log_reg)
        elif model_name == 'K Nearest Neighbors':
            n_neighbors = st.number_input(label='n_neighbors', min_value=5, max_value=100)
            knn = clf.k_nearest_neighbors(n_neighbors)
            train_and_display_metrics(knn)
            if st.checkbox('KFold Cross Validation'):
                run_kfold(knn)
            st.info('NOTE: It is often a good practice to scale the features when using KNN because it uses Eucledian distances. However, this topic comes under feature engineering (intermediate level).')
        elif model_name == 'Naive Bayes (Gaussian)':
            nbg = clf.naive_bayes()
            train_and_display_metrics(nbg)
            if st.checkbox('KFold Cross Validation'):
                run_kfold(nbg)
        elif model_name == 'SVM':
            C = st.slider(label='Choose C', min_value=0.1, max_value=5.0)
            kernel = st.selectbox('Kernel', ['rbf', 'poly', 'linear'])
            svm = clf.svm(C, kernel)
            train_and_display_metrics(svm)
            if st.checkbox('KFold Cross Validation'):
                run_kfold(svm)
        elif model_name == 'Decision Tree':
            max_depth = st.number_input(label='max_depth', min_value=10, max_value=100)
            dt = clf.decision_tree(max_depth)
            train_and_display_metrics(dt)
            if st.checkbox('KFold Cross Validation'):
                run_kfold(dt)
        elif model_name == 'Random Forest':
            n_estimators = st.number_input('n_estimators', min_value=100, max_value=1000)
            max_depth = st.number_input(label='max_depth', min_value=10, max_value=100)
            rf = clf.random_forest(n_estimators, max_depth)
            train_and_display_metrics(rf)
            if st.checkbox('KFold Cross Validation'):
                run_kfold(rf)

    def train_and_display_metrics(model):
        '''
        Description:
            Method to train the model and display its accuracy.
        Parameters:
            model - A ML model (from sklearn).
        Returns:
            Nothing.
        '''
        model.fit(x_train, y_train)
        y_pred_test = model.predict(x_test)
        y_pred_train = model.predict(x_train)
        # Showing both train and test accuracy makes over/under-fitting visible.
        st.success(f'Train accuracy = {accuracy_score(y_train, y_pred_train)*100:.5f}%')
        st.success(f'Test accuracy = {accuracy_score(y_test, y_pred_test)*100:.5f}%')
        if st.button('Show Confusion Matrix'):
            cf_matrix = confusion_matrix(y_test, y_pred_test)
            sns.heatmap(cf_matrix, annot=True)
            st.pyplot()

    def run_kfold(model):
        '''
        Description:
            Method for running kfold cross validation.
        Parameters:
            model - A ML model (from sklearn).
        Returns:
            Nothing.
        '''
        cv = st.number_input(label='Choose number of folds', min_value=5, max_value=20)
        # cross_val_score returns one test score per fold as an ndarray.
        cv_score = cross_val_score(model, x, y, cv=cv)
        # Use the ndarray's own mean instead of a manual loop that shadowed
        # the builtin `sum` (original code: `sum = 0; for s in cv_score: ...`).
        avg_score = cv_score.mean()
        st.write(f'According to {cv} kfolds, the following test accuracies have been recorded:')
        st.dataframe(cv_score)
        st.success(f'Average test accuracy = {avg_score*100:.5f}%')

    clf = MLModels()
    # Convention: features are every column but the last; target is the last.
    x = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    # NOTE(review): no random_state, so the split differs on every rerun.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    model_name = st.selectbox("Choose a model/algorithm", ["Linear Regression", "Logistic Regression", "K Nearest Neighbors", "Naive Bayes (Gaussian)", "SVM", "Decision Tree", "Random Forest"])
    run_ml_model(model_name)