File size: 7,851 Bytes

7a4db40

# Importing libraries.
import time
import streamlit as st
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
from ml.mlmodel import MLModels



def eda(df):
    '''
    Description:
            Method that provides various EDA options as Streamlit widgets:
            shape summary, target value counts, a head() preview,
            describe(), a column-subset view and the column dtypes.

    Parameters:
            df - A pandas dataframe. The value-counts option reads a
                 column named 'target'.

    Returns:
           Nothing.
    '''
    rows, columns = df.shape
    st.info(f'Rows = {rows}, Columns = {columns}')
    if st.checkbox('Show Target Classes and Value Counts'):
        st.dataframe(df.target.value_counts())
    if st.checkbox("Show DataFrame"):
        # Streamlit rejects a number_input whose min_value exceeds its
        # max_value, so lower the floor for frames with fewer than 5 rows.
        num_rows = st.number_input(
            label="Enter number of rows",
            min_value=min(5, rows),
            max_value=rows,
        )
        st.dataframe(df.head(num_rows))
    if st.checkbox("Describe The Data"):
        st.dataframe(df.describe())
    if st.checkbox("Show DataFrame By Specific Columns"):
        column_names = st.multiselect("Select Columns", df.columns)
        st.dataframe(df[column_names])
    if st.checkbox("Show Data Types"):
        st.dataframe(df.dtypes)

def vis(df):
    '''
    Description:
            Method for various visualization options. Every plot is drawn
            on its own matplotlib figure, which is passed to Streamlit
            explicitly and closed afterwards.

    Parameters:
            df - A pandas dataframe. Most plots read a column named
                 'target'.

    Returns:
           Nothing.
    '''
    def show(fig):
        # Pass the figure explicitly instead of relying on the global
        # pyplot figure, then close it so figures do not pile up across
        # Streamlit reruns.
        st.pyplot(fig)
        plt.close(fig)

    # The spinners wrap the real plotting work, so they are visible for
    # exactly as long as the plot takes to build.
    if st.button("Correlational Matrix"):
        with st.spinner('Generating A Correlational Matrix...'):
            fig, ax = plt.subplots()
            sns.heatmap(df.corr(), annot=True, ax=ax)
        show(fig)
    if st.button("Value Counts"):
        with st.spinner('Generating A Value Count Plot...'):
            fig, ax = plt.subplots()
            df.target.value_counts().plot(kind='barh', ax=ax)
        show(fig)
    if st.button("Pair Plot"):
        with st.spinner('Generating A Pair Plot...'):
            # pairplot builds its own figure; fetch it from the grid.
            grid = sns.pairplot(df, hue='target')
        show(grid.fig)
    if st.button("Pie Chart"):
        with st.spinner('Generating A Pie Chart...'):
            fig, ax = plt.subplots()
            df.target.value_counts().plot.pie(autopct='%1.2f%%', ax=ax)
        show(fig)
    if st.checkbox('Scatter Plot'):
        x_val = st.selectbox('Select a column for x-axis', df.columns)
        y_val = st.selectbox('Select a column for y-axis', df.columns)
        with st.spinner('Generating A Scatter Plot...'):
            fig, ax = plt.subplots()
            ax.scatter(df[x_val], df[y_val], c=df.target)
            ax.set_xlabel(x_val)
            ax.set_ylabel(y_val)
        show(fig)

def ml(df):
    '''
    Description:
            Method for handling all the machine learning options. The last
            column of df is used as the target and all other columns as
            features. The data is split 80/20 into train and test sets
            and the model chosen in the select box is trained on it.

    Parameters:
            df - A pandas dataframe whose last column is the target.

    Returns:
           Nothing.
    '''
    def run_ml_model(model_name):
        '''
        Description:
                An inner method for running a machine learning model.
                Collects the model's hyper-parameters from Streamlit
                widgets, builds it through MLModels and evaluates it.

        Parameters:
                model_name - A machine learning model name as a string.

        Returns:
                Nothing.
        '''
        if model_name == 'Linear Regression':
            # Regression is only fitted and described; no accuracy metrics.
            lin_reg = clf.linear_regression()
            lin_reg.fit(x_train, y_train)
            st.success(f'The coefficients = {lin_reg.coef_}')
            st.success(f'The intercept = {lin_reg.intercept_}')
            st.write('Now make an equation of the form y = a1*x1 + a2*x2 + ... an*xn + c')
            st.write('and plugin the features and compare the value you get with the actual target value.')
            st.info('NOTE: Linear Regression is not for classification problems. Hence, use it for Boston Houses or Diabetes dataset to understand this algorithm deeply.')
        elif model_name == 'Logistic Regression':
            C = st.slider(label='Choose C', min_value=0.1, max_value=5.0)
            evaluate(clf.logistic_regression(C))
        elif model_name == 'K Nearest Neighbors':
            n_neighbors = st.number_input(label='n_neighbors', min_value=5, max_value=100)
            evaluate(clf.k_nearest_neighbors(n_neighbors))
            st.info('NOTE: It is often a good practice to scale the features when using KNN because it uses Eucledian distances. However, this topic comes under feature engineering (intermediate level).')
        elif model_name == 'Naive Bayes (Gaussian)':
            evaluate(clf.naive_bayes())
        elif model_name == 'SVM':
            C = st.slider(label='Choose C', min_value=0.1, max_value=5.0)
            kernel = st.selectbox('Kernel', ['rbf', 'poly', 'linear'])
            evaluate(clf.svm(C, kernel))
        elif model_name == 'Decision Tree':
            max_depth = st.number_input(label='max_depth', min_value=10, max_value=100)
            evaluate(clf.decision_tree(max_depth))
        elif model_name == 'Random Forest':
            n_estimators = st.number_input('n_estimators', min_value=100, max_value=1000)
            max_depth = st.number_input(label='max_depth', min_value=10, max_value=100)
            evaluate(clf.random_forest(n_estimators, max_depth))

    def evaluate(model):
        '''
        Description:
                Method shared by every classifier: train it, show its
                metrics and offer optional KFold cross validation.

        Parameters:
                model - A ML model (from sklearn).

        Returns:
                Nothing.
        '''
        train_and_display_metrics(model)
        if st.checkbox('KFold Cross Validation'):
            run_kfold(model)

    def train_and_display_metrics(model):
        '''
        Description:
                Method to train the model and display its train and test
                accuracy, plus a confusion matrix on request.

        Parameters:
                model - A ML model (from sklearn).

        Returns:
                Nothing.
        '''
        model.fit(x_train, y_train)
        y_pred_test = model.predict(x_test)
        y_pred_train = model.predict(x_train)
        st.success(f'Train accuracy = {accuracy_score(y_train, y_pred_train)*100:.5f}%')
        st.success(f'Test accuracy = {accuracy_score(y_test, y_pred_test)*100:.5f}%')
        if st.button('Show Confusion Matrix'):
            cf_matrix = confusion_matrix(y_test, y_pred_test)
            # Draw on an explicit figure rather than the global pyplot
            # one, and close it so figures do not accumulate over reruns.
            fig, ax = plt.subplots()
            sns.heatmap(cf_matrix, annot=True, ax=ax)
            st.pyplot(fig)
            plt.close(fig)

    def run_kfold(model):
        '''
        Description:
                Method for running kfold cross validation on the full
                dataset and reporting the per-fold and average scores.

        Parameters:
                model - A ML model (from sklearn).

        Returns:
                Nothing.
        '''
        cv = st.number_input(label='Choose number of folds', min_value=5, max_value=20)
        cv_score = cross_val_score(model, x, y, cv=cv)
        avg_score = cv_score.mean()
        st.write(f'According to {cv} kfolds, the following test accuracies have been recorded:')
        st.dataframe(cv_score)
        st.success(f'Average test accuracy = {avg_score*100:.5f}%')

    clf = MLModels()
    x = df.iloc[:, :-1]
    y = df.iloc[:, -1]
    # Streamlit reruns the script on every widget interaction. A fixed
    # seed keeps the split (and therefore the reported accuracies and the
    # confusion matrix) the same from one rerun to the next.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42
    )

    model_name = st.selectbox("Choose a model/algorithm", ["Linear Regression", "Logistic Regression", "K Nearest Neighbors", "Naive Bayes (Gaussian)", "SVM", "Decision Tree", "Random Forest"])
    run_ml_model(model_name)