# Source: Mridul2003 — "Upload 25 files" (commit 7a4db40)
# Importing libraries.
import time
import streamlit as st
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from ml.mlmodel import MLModels
def eda(df):
    '''
    Description:
        Render the interactive exploratory-data-analysis widgets.
    Parameters:
        df - A pandas dataframe (expected to have a 'target' column).
    Returns:
        Nothing; all output is written to the Streamlit page.
    '''
    n_rows, n_cols = df.shape
    st.info(f'Rows = {n_rows}, Columns = {n_cols}')
    if st.checkbox('Show Target Classes and Value Counts'):
        # Class distribution of the target column.
        st.dataframe(df.target.value_counts())
    if st.checkbox("Show DataFrame"):
        head_count = st.number_input(label="Enter number of rows", min_value=5, max_value=n_rows)
        st.dataframe(df.head(head_count))
    if st.checkbox("Describe The Data"):
        st.dataframe(df.describe())
    if st.checkbox("Show DataFrame By Specific Columns"):
        chosen_columns = st.multiselect("Select Columns", df.columns)
        st.dataframe(df[chosen_columns])
    if st.checkbox("Show Data Types"):
        st.dataframe(df.dtypes)
def vis(df):
    '''
    Description:
        Render the visualization options for the dataframe.
    Parameters:
        df - A pandas dataframe (expected to have a 'target' column).
    Returns:
        Nothing; plots are written to the Streamlit page.
    '''
    # NOTE: the original called st.pyplot() with no figure, which relies on
    # matplotlib's deprecated global-figure state (rejected by modern
    # Streamlit). Each plot now builds and passes an explicit Figure.
    # The artificial time.sleep(3) "loading" delays have been removed.
    if st.button("Correlational Matrix"):
        with st.spinner('Generating A Correlational Matrix...'):
            fig, ax = plt.subplots()
            sns.heatmap(df.corr(), annot=True, ax=ax)
            st.pyplot(fig)
    if st.button("Value Counts"):
        with st.spinner('Generating A Value Count Plot...'):
            ax = df.target.value_counts().plot(kind='barh')
            st.pyplot(ax.figure)
    if st.button("Pair Plot"):
        with st.spinner('Generating A Pair Plot...'):
            # sns.pairplot returns a PairGrid; its .fig is the Figure to render.
            grid = sns.pairplot(df, hue='target')
            st.pyplot(grid.fig)
    if st.button("Pie Chart"):
        with st.spinner('Generating A Pie Chart...'):
            ax = df.target.value_counts().plot.pie(autopct='%1.2f%%')
            st.pyplot(ax.figure)
    if st.checkbox('Scatter Plot'):
        x_val = st.selectbox('Select a column for x-axis', df.columns)
        y_val = st.selectbox('Select a column for y-axis', df.columns)
        with st.spinner('Generating A Scatter Plot...'):
            fig, ax = plt.subplots()
            # Color each point by its target class.
            ax.scatter(df[x_val], df[y_val], c=df.target)
            ax.set_xlabel(x_val)
            ax.set_ylabel(y_val)
            st.pyplot(fig)
def ml(df):
    '''
    Description:
        Method for handling all the machine learning options.
    Parameters:
        df - A pandas dataframe whose last column is the target.
    Returns:
        Nothing; results are written to the Streamlit page.
    '''
    def run_ml_model(model_name):
        '''
        Description:
            An inner method for running a machine learning model: collects
            hyperparameters from the UI, builds the model via `clf`, then
            trains it and reports metrics.
        Parameters:
            model_name - A machine learning model name as a string.
        Returns:
            Nothing.
        '''
        if model_name == 'Linear Regression':
            lin_reg = clf.linear_regression()
            lin_reg.fit(x_train, y_train)
            coeff = lin_reg.coef_
            intercept = lin_reg.intercept_
            st.success(f'The coefficients = {coeff}')
            st.success(f'The intercept = {intercept}')
            st.write('Now make an equation of the form y = a1*x1 + a2*x2 + ... an*xn + c')
            st.write('and plugin the features and compare the value you get with the actual target value.')
            st.info('NOTE: Linear Regression is not for classification problems. Hence, use it for Boston Houses or Diabetes dataset to understand this algorithm deeply.')
        elif model_name == 'Logistic Regression':
            C = st.slider(label='Choose C', min_value=0.1, max_value=5.0)
            log_reg = clf.logistic_regression(C)
            train_and_display_metrics(log_reg)
            if st.checkbox('KFold Cross Validation'):
                run_kfold(log_reg)
        elif model_name == 'K Nearest Neighbors':
            n_neighbors = st.number_input(label='n_neighbors', min_value=5, max_value=100)
            knn = clf.k_nearest_neighbors(n_neighbors)
            train_and_display_metrics(knn)
            if st.checkbox('KFold Cross Validation'):
                run_kfold(knn)
            st.info('NOTE: It is often a good practice to scale the features when using KNN because it uses Eucledian distances. However, this topic comes under feature engineering (intermediate level).')
        elif model_name == 'Naive Bayes (Gaussian)':
            nbg = clf.naive_bayes()
            train_and_display_metrics(nbg)
            if st.checkbox('KFold Cross Validation'):
                run_kfold(nbg)
        elif model_name == 'SVM':
            C = st.slider(label='Choose C', min_value=0.1, max_value=5.0)
            kernel = st.selectbox('Kernel', ['rbf', 'poly', 'linear'])
            svm = clf.svm(C, kernel)
            train_and_display_metrics(svm)
            if st.checkbox('KFold Cross Validation'):
                run_kfold(svm)
        elif model_name == 'Decision Tree':
            max_depth = st.number_input(label='max_depth', min_value=10, max_value=100)
            dt = clf.decision_tree(max_depth)
            train_and_display_metrics(dt)
            if st.checkbox('KFold Cross Validation'):
                run_kfold(dt)
        elif model_name == 'Random Forest':
            n_estimators = st.number_input('n_estimators', min_value=100, max_value=1000)
            max_depth = st.number_input(label='max_depth', min_value=10, max_value=100)
            rf = clf.random_forest(n_estimators, max_depth)
            train_and_display_metrics(rf)
            if st.checkbox('KFold Cross Validation'):
                run_kfold(rf)

    def train_and_display_metrics(model):
        '''
        Description:
            Method to train the model and display its accuracy.
        Parameters:
            model - A ML model (from sklearn).
        Returns:
            Nothing.
        '''
        model.fit(x_train, y_train)
        y_pred_test = model.predict(x_test)
        y_pred_train = model.predict(x_train)
        st.success(f'Train accuracy = {accuracy_score(y_train, y_pred_train)*100:.5f}%')
        st.success(f'Test accuracy = {accuracy_score(y_test, y_pred_test)*100:.5f}%')
        if st.button('Show Confusion Matrix'):
            cf_matrix = confusion_matrix(y_test, y_pred_test)
            # Pass an explicit Figure — bare st.pyplot() (global figure) is
            # deprecated and rejected by modern Streamlit.
            fig, ax = plt.subplots()
            sns.heatmap(cf_matrix, annot=True, ax=ax)
            st.pyplot(fig)

    def run_kfold(model):
        '''
        Description:
            Method for running kfold cross validation.
        Parameters:
            model - A ML model (from sklearn).
        Returns:
            Nothing.
        '''
        cv = st.number_input(label='Choose number of folds', min_value=5, max_value=20)
        cv_score = cross_val_score(model, x, y, cv=cv)
        # Mean of the per-fold accuracies (the original hand-rolled the sum
        # and shadowed the `sum` builtin; cross_val_score returns an ndarray).
        avg_score = cv_score.mean()
        st.write(f'According to {cv} kfolds, the following test accuracies have been recorded:')
        st.dataframe(cv_score)
        st.success(f'Average test accuracy = {avg_score*100:.5f}%')

    clf = MLModels()
    # Convention: every column but the last is a feature; the last is the target.
    x = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    model_name = st.selectbox("Choose a model/algorithm", ["Linear Regression", "Logistic Regression", "K Nearest Neighbors", "Naive Bayes (Gaussian)", "SVM", "Decision Tree", "Random Forest"])
    run_ml_model(model_name)