File size: 6,208 Bytes
991c693
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426d38f
991c693
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426d38f
 
 
 
 
 
 
 
 
 
 
 
991c693
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
426d38f
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
991c693
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
import streamlit as st
import shap
import lime
from lime.lime_tabular import LimeTabularExplainer
import pandas as pd
import numpy as np
import joblib
import seaborn as sns
import xgboost as xgb
import streamlit.components.v1 as components
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
from imblearn.pipeline import Pipeline as imbPipeline


def load_dataset(name):
    """Load and preprocess one of the demo datasets.

    Parameters
    ----------
    name : str
        Either ``"Financial"`` or ``"Healthcare"``.

    Returns
    -------
    pandas.DataFrame
        Cleaned dataframe with categorical columns integer-encoded.

    Raises
    ------
    ValueError
        If *name* is not a recognized dataset.
    """
    if name == "Financial":
        # Replace with your dataset
        data = pd.read_csv("datasets/loan_approval_dataset.csv")
        data.columns = data.columns.str.strip()
        data = data.drop(columns=['loan_id'])
        # Strip stray whitespace from categorical values first, otherwise
        # the map() calls below would produce NaN for padded strings.
        for col in ('education', 'self_employed', 'loan_status'):
            data[col] = data[col].str.strip()
        # Encode categorical variables as 0/1 integers.
        data['education'] = data['education'].map({'Graduate': 1, 'Not Graduate': 0})
        data['self_employed'] = data['self_employed'].map({'Yes': 1, 'No': 0})
        data['loan_status'] = data['loan_status'].map({'Approved': 1, 'Rejected': 0})
        return data

    if name == "Healthcare":
        data = pd.read_csv("datasets/healthcare_dataset.csv")
        data.columns = data.columns.str.strip()
        data = data.drop_duplicates()
        data = data[data['gender'] != 'Other']

        def recategorize_smoking(smoking_status):
            # Collapse free-text smoking history into three ordinal buckets.
            if smoking_status in ('never', 'No Info'):
                return 0
            if smoking_status == 'current':
                return 1
            if smoking_status in ('ever', 'former', 'not current'):
                return 2
            # Unknown category: preserve the original implicit-None behavior,
            # but make it explicit so it is not mistaken for an oversight.
            return None

        data['smoking_history'] = data['smoking_history'].apply(recategorize_smoking)
        data['gender'] = data['gender'].map({'Male': 0, 'Female': 1})
        return data

    # Previously an unknown name fell through and raised UnboundLocalError
    # on `return data`; fail loudly with a clear message instead.
    raise ValueError(f"Unknown dataset name: {name!r}")

def load_models(dataset_name):
    """Load the pre-trained model bundle for a dataset.

    Parameters
    ----------
    dataset_name : str
        Either ``"Financial"`` or ``"Healthcare"``.

    Returns
    -------
    dict
        Mapping of model display name to fitted estimator. The financial
        pickle is already a dict of models; the healthcare pickle holds a
        single estimator, so it is wrapped under the "Random Forest" key.

    Raises
    ------
    ValueError
        If *dataset_name* is not a recognized dataset.
    """
    if dataset_name == "Financial":
        return joblib.load("models/loan_models.pkl")
    if dataset_name == "Healthcare":
        model = joblib.load("models/healthcare_models.pkl")
        return {"Random Forest": model}
    # Previously an unknown name silently returned None, which surfaced
    # later as an opaque TypeError on subscripting; fail fast instead.
    raise ValueError(f"Unknown dataset name: {dataset_name!r}")

def main():
    """Streamlit entry point: pick a dataset and visualize model
    explanations with SHAP or LIME.

    Side effects: renders Streamlit widgets/plots and, on the LIME path,
    writes ``lime_explanation.html`` to the working directory.
    """
    plt.style.use('default')
    #st.set_option('deprecation.showPyplotGlobalUse', False)
    st.title("Model Interpretability Visualization with LIME and SHAP")

    # Create different sections for each dataset
    st.subheader("1. Select a Dataset")
    dataset = st.selectbox("Choose a dataset:", ["Financial", "Healthcare"])

    # Perform different interpretability methods on the first dataset
    if dataset == "Financial":
        # 1. Load the dataset
        X = load_dataset(dataset)
        st.write(f"{dataset} Dataset Sample")
        st.write(X.head())

        # 2. Select interpretability method
        st.subheader("2. Select an Interpretability Method")
        method = st.selectbox("Choose an interpretability method:", ["LIME", "SHAP"])

        if method == "SHAP":
            st.subheader("3. Interpretability using SHAP")
            # SHAP analysis uses the XGBoost model (TreeExplainer-compatible).
            loaded_models = load_models(dataset)
            model = loaded_models['XG Boost']
            sns.set_style('whitegrid')
            # Drop the target column; SHAP explains features only.
            X = X.drop(columns=["loan_status"]).astype(float)
            explainer = shap.Explainer(model)
            shap_values = explainer(X)
            idx = st.slider("Select Test Instance", 0, len(X) - 1, 0)

            # Visualize SHAP values
            if st.button("Explain Prediction"):
                st.write("SHAP Force Plot for a Single Prediction")
                fig = shap.force_plot(explainer.expected_value,
                                      shap_values[idx].values,
                                      X.iloc[idx],
                                      matplotlib=True, show=False)
                st.pyplot(fig, bbox_inches='tight')
                # summary_plot returns None; draw on the current figure and
                # hand that to Streamlit explicitly instead of passing None
                # (which relies on the deprecated global-pyplot behavior).
                st.write("SHAP Summary Plot")
                shap.summary_plot(shap_values, X, show=False)
                st.pyplot(plt.gcf(), bbox_inches='tight')
                st.write("SHAP Bar Plot")
                shap.summary_plot(shap_values, X, plot_type="bar", show=False)
                st.pyplot(plt.gcf(), bbox_inches='tight')

        elif method == "LIME":
            st.subheader("3. Interpretability using LIME")
            # Choose model type
            model_choice = st.radio("Select Model",
                                    ["Logistic Regression", 'Decision Tree',
                                     'XG Boost', "Random Forest"])
            loaded_models = load_models(dataset)
            model = loaded_models[model_choice]
            sns.set_style('whitegrid')
            # Last column is the encoded target (loan_status).
            x = X.iloc[:, :-1].values
            y = X.iloc[:, -1].values
            X_train, X_test, y_train, y_test = train_test_split(x, y,
                                                                test_size=0.25,
                                                                random_state=42)
            target = ['Rejected', 'Approved']
            labels = {'0': 'Rejected', '1': 'Approved'}
            idx = st.slider("Select Test Instance", 0, len(X_test) - 1, 0)

            # Explain the prediction instance using LIME
            if st.button("Explain Prediction"):
                explainer = lime.lime_tabular.LimeTabularExplainer(
                    X_train,
                    # Exclude the target column so names align with features.
                    feature_names=list(X.columns[:-1]),
                    class_names=target,
                    discretize_continuous=True,
                    )
                exp = explainer.explain_instance(
                    X_test[idx],
                    model.predict_proba,
                    )

                # Visualize the explanation
                st.write("LIME Explanation")
                st.write('True label:', labels[str(y_test[idx])])
                exp.save_to_file('lime_explanation.html')
                # Use a context manager so the file handle is always closed.
                with open('lime_explanation.html', 'r', encoding='utf-8') as html_file:
                    components.html(html_file.read(), height=600)
                st.write("Effect of Predictors")
                fig = exp.as_pyplot_figure()
                st.pyplot(fig, bbox_inches='tight')


if __name__ == "__main__":
    main()