|
|
|
|
|
import time |
|
|
import streamlit as st |
|
|
import seaborn as sns |
|
|
import matplotlib.pyplot as plt |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.model_selection import cross_val_score |
|
|
from sklearn.metrics import accuracy_score, confusion_matrix |
|
|
from ml.mlmodel import MLModels |
|
|
|
|
|
|
|
|
|
|
|
def eda(df):
    '''
    Description:
        Method that provides various EDA options. The dataframe's shape is
        always shown; every other view is drawn only while its checkbox
        is ticked.

    Parameters:
        df - A pandas dataframe. The value-counts view reads `df.target`,
             so that view needs a column named `target`.

    Returns:
        Nothing.
    '''
    rows, columns = df.shape
    st.info(f'Rows = {rows}, Columns = {columns}')

    if st.checkbox('Show Target Classes and Value Counts'):
        st.dataframe(df.target.value_counts())

    if st.checkbox("Show DataFrame"):
        # number_input starts at min_value and raises when that default
        # lies above max_value, so lower the floor for dataframes with
        # fewer than 5 rows instead of hard-coding 5.
        num_rows = st.number_input(
            label="Enter number of rows",
            min_value=min(5, rows),
            max_value=rows,
        )
        st.dataframe(df.head(num_rows))

    if st.checkbox("Describe The Data"):
        st.dataframe(df.describe())

    if st.checkbox("Show DataFrame By Specific Columns"):
        column_names = st.multiselect("Select Columns", df.columns)
        st.dataframe(df[column_names])

    if st.checkbox("Show Data Types"):
        st.dataframe(df.dtypes)
|
|
|
|
|
def vis(df):
    '''
    Description:
        Method for various visualization options. Each button draws its
        plot when pressed; the scatter plot is drawn for as long as its
        checkbox is ticked.

    Parameters:
        df - A pandas dataframe. Every plot except the correlational
             matrix reads a column named `target`.

    Returns:
        Nothing.
    '''
    def render(fig):
        # Hand Streamlit an explicit figure: a bare st.pyplot() renders
        # pyplot's global figure, which Streamlit documents as deprecated
        # and not thread-safe. Close the figure afterwards so pyplot does
        # not keep every rendered figure registered.
        st.pyplot(fig)
        plt.close(fig)

    if st.button("Correlational Matrix"):
        with st.spinner('Generating A Correlational Matrix...'):
            time.sleep(3)  # fixed 3 s pause inside the spinner, kept as-is
            # Correlate numeric/bool columns only: pandas >= 2.0 raises
            # when corr() is handed non-numeric columns.
            numeric_df = df.select_dtypes(include=['number', 'bool'])
            fig, ax = plt.subplots()
            sns.heatmap(numeric_df.corr(), annot=True, ax=ax)
            render(fig)

    if st.button("Value Counts"):
        with st.spinner('Generating A Value Count Plot...'):
            time.sleep(3)
            fig, ax = plt.subplots()
            df.target.value_counts().plot(kind='barh', ax=ax)
            render(fig)

    if st.button("Pair Plot"):
        with st.spinner('Generating A Pair Plot...'):
            time.sleep(3)
            # pairplot builds its own figure; take it from the returned grid.
            grid = sns.pairplot(df, hue='target')
            render(grid.figure)

    if st.button("Pie Chart"):
        with st.spinner('Generating A Pie Chart...'):
            time.sleep(3)
            fig, ax = plt.subplots()
            df.target.value_counts().plot.pie(autopct='%1.2f%%', ax=ax)
            render(fig)

    if st.checkbox('Scatter Plot'):
        x_val = st.selectbox('Select a column for x-axis', df.columns)
        y_val = st.selectbox('Select a column for y-axis', df.columns)
        with st.spinner('Generating A Scatter Plot...'):
            time.sleep(3)
            fig, ax = plt.subplots()
            # Points are coloured by the values of the target column.
            ax.scatter(df[x_val], df[y_val], c=df.target)
            ax.set_xlabel(x_val)
            ax.set_ylabel(y_val)
            render(fig)
|
|
|
|
|
def ml(df):
    '''
    Description:
        Method for handling all the machine learning options. Splits the
        dataframe into train/test sets, lets the user pick a model from a
        select box and then builds, trains and evaluates that model.

    Parameters:
        df - A pandas dataframe. Every column except the last is used as
             a feature; the last column is used as the target.

    Returns:
        Nothing.
    '''
    def run_ml_model(model_name):
        '''
        Description:
            An inner method for running a machine learning model. Shows
            the hyper-parameter widgets of the chosen model, builds it
            through `clf` and trains it on the train split.

        Parameters:
            model_name - A machine learning model name as a string. A
                         name that matches none of the known models does
                         nothing.

        Returns:
            Nothing.
        '''
        if model_name == 'Linear Regression':
            # Regression path: only the fitted parameters are reported,
            # no accuracy metrics or cross validation.
            lin_reg = clf.linear_regression()
            lin_reg.fit(x_train, y_train)
            coeff = lin_reg.coef_
            intercept = lin_reg.intercept_
            st.success(f'The coefficients = {coeff}')
            st.success(f'The intercept = {intercept}')
            st.write('Now make an equation of the form y = a1*x1 + a2*x2 + ... an*xn + c')
            st.write('and plugin the features and compare the value you get with the actual target value.')
            st.info('NOTE: Linear Regression is not for classification problems. Hence, use it for Boston Houses or Diabetes dataset to understand this algorithm deeply.')
            return

        # Classifier paths: collect the hyper-parameters and build the model.
        if model_name == 'Logistic Regression':
            C = st.slider(label='Choose C', min_value=0.1, max_value=5.0)
            model = clf.logistic_regression(C)
        elif model_name == 'K Nearest Neighbors':
            n_neighbors = st.number_input(label='n_neighbors', min_value=5, max_value=100)
            model = clf.k_nearest_neighbors(n_neighbors)
        elif model_name == 'Naive Bayes (Gaussian)':
            model = clf.naive_bayes()
        elif model_name == 'SVM':
            C = st.slider(label='Choose C', min_value=0.1, max_value=5.0)
            kernel = st.selectbox('Kernel', ['rbf', 'poly', 'linear'])
            model = clf.svm(C, kernel)
        elif model_name == 'Decision Tree':
            max_depth = st.number_input(label='max_depth', min_value=10, max_value=100)
            model = clf.decision_tree(max_depth)
        elif model_name == 'Random Forest':
            n_estimators = st.number_input('n_estimators', min_value=100, max_value=1000)
            max_depth = st.number_input(label='max_depth', min_value=10, max_value=100)
            model = clf.random_forest(n_estimators, max_depth)
        else:
            return

        # Every classifier is trained and evaluated the same way, so the
        # shared steps are written once instead of once per branch.
        train_and_display_metrics(model)
        if st.checkbox('KFold Cross Validation'):
            run_kfold(model)

        if model_name == 'K Nearest Neighbors':
            st.info('NOTE: It is often a good practice to scale the features when using KNN because it uses Eucledian distances. However, this topic comes under feature engineering (intermediate level).')

    def train_and_display_metrics(model):
        '''
        Description:
            Method to train the model and display its accuracy on the
            train and test splits. A button additionally shows the
            confusion matrix of the test predictions.

        Parameters:
            model - A ML model (from sklearn).

        Returns:
            Nothing.
        '''
        model.fit(x_train, y_train)
        y_pred_test = model.predict(x_test)
        y_pred_train = model.predict(x_train)
        st.success(f'Train accuracy = {accuracy_score(y_train, y_pred_train)*100:.5f}%')
        st.success(f'Test accuracy = {accuracy_score(y_test, y_pred_test)*100:.5f}%')

        if st.button('Show Confusion Matrix'):
            cf_matrix = confusion_matrix(y_test, y_pred_test)
            # Draw on an explicit figure: a bare st.pyplot() relies on
            # pyplot's global figure, which Streamlit documents as
            # deprecated and not thread-safe.
            fig, ax = plt.subplots()
            sns.heatmap(cf_matrix, annot=True, ax=ax)
            st.pyplot(fig)
            plt.close(fig)

    def run_kfold(model):
        '''
        Description:
            Method for running kfold cross validation on the complete
            feature/target data and displaying the score of every fold
            together with their average.

        Parameters:
            model - A ML model (from sklearn).

        Returns:
            Nothing.
        '''
        cv = st.number_input(label='Choose number of folds', min_value=5, max_value=20)
        cv_score = cross_val_score(model, x, y, cv=cv)
        # One score per fold, so the mean equals the former sum / cv
        # without shadowing the builtin `sum`.
        avg_score = cv_score.mean()
        st.write(f'According to {cv} kfolds, the following test accuracies have been recorded:')
        st.dataframe(cv_score)
        st.success(f'Average test accuracy = {avg_score*100:.5f}%')

    clf = MLModels()

    # Features are all columns but the last; the last column is the target.
    x = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    # NOTE(review): no random_state is passed, so each call draws a new
    # random 80/20 split and reported accuracies can differ between
    # runs - confirm this is intended.
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    model_name = st.selectbox("Choose a model/algorithm", ["Linear Regression", "Logistic Regression", "K Nearest Neighbors", "Naive Bayes (Gaussian)", "SVM", "Decision Tree", "Random Forest"])
    run_ml_model(model_name)
|
|
|
|
|
|