# Source: Hugging Face Space upload by keesephillips ("Upload 19 files", commit a7e459d verified)
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Gamma
from statsmodels.genmod.families.links import Log
from statsmodels.tools import add_constant
from pygam import LinearGAM, GammaGAM, s, f
import pickle
import streamlit.components.v1 as components
import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from streamlit_option_menu import option_menu
from pygam import LinearGAM
import lime
from lime import lime_tabular
class HousePriceModel(nn.Module):
    """Feed-forward regressor for house prices.

    Architecture: input_size -> 128 -> 64 -> 32 -> 1, with a
    LeakyReLU(0.2) activation after each hidden linear layer.
    """

    def __init__(self, input_size):
        super().__init__()
        # Build the hidden stack from the width schedule so the layer
        # layout (and therefore the state_dict keys 0..6) matches a
        # hand-written Sequential of Linear/LeakyReLU pairs.
        widths = [input_size, 128, 64, 32]
        layers = []
        for w_in, w_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(w_in, w_out))
            layers.append(nn.LeakyReLU(0.2, inplace=True))
        layers.append(nn.Linear(widths[-1], 1))  # scalar price head
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the predicted price for a (batch, input_size) tensor."""
        return self.model(x)
def lime_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max):
    """
    Generate and display a LIME (Local Interpretable Model-agnostic Explanations) component.
    :param X: pandas.DataFrame
        The feature matrix used to train the explainer.
    :param input_data: list or numpy.array
        The input data point to explain.
    :param dnn_model: torch.nn.Module
        The trained neural network model.
    :param gam_model: object
        The trained Generalized Additive Model (unused here; kept for a
        uniform component signature).
    :param terms: list
        List of feature names.
    :param y_min: float
        Minimum y-axis value (unused here; kept for a uniform signature).
    :param y_max: float
        Maximum y-axis value (unused here; kept for a uniform signature).
    :return: None
    """
    def nn_prediction(samples):
        # Wrap the torch model as a numpy -> numpy function for LIME,
        # without tracking gradients.
        input_tensor = torch.FloatTensor(samples)
        dnn_model.eval()
        with torch.no_grad():
            output = dnn_model(input_tensor)
        return output.cpu().numpy()

    kernel_width = 3
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X.values, mode='regression', feature_names=terms, kernel_width=kernel_width)
    exp = explainer.explain_instance(
        np.array(input_data), nn_prediction, num_features=len(input_data))
    feature_importance = exp.as_list()
    intercept = exp.intercept[0]
    formula = f"> _**y =** {intercept:.4f}"
    # BUG FIX: exp.as_list() is ordered by descending absolute importance,
    # NOT by the original column order, so the previous code that paired it
    # positionally with terms[i] mislabeled every coefficient. Use the
    # feature description LIME itself returns for each weight.
    for feature, importance in feature_importance:
        if importance > 0:
            formula += f" + {importance:.4f} * **{feature}**"
        else:
            formula += f" - {abs(importance):.4f} * **{feature}**"
    formula += '_'
    st.markdown(formula)
    lime_graph = exp.as_pyplot_figure()
    st.pyplot(lime_graph)
def spline_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max):
    """
    Generate the spline functions in a component.
    :param X: pandas.DataFrame
        The feature matrix used to train the explainer.
    :param input_data: list or numpy.array
        The input data point to explain.
    :param dnn_model: torch.nn.Module
        The trained neural network model (unused here; uniform signature).
    :param gam_model: object
        The trained Generalized Additive Model
    :param terms: list
        List of feature names.
    :param y_min: float
        Minimum y-axis value
    :param y_max: float
        Maximum y-axis value
    :return: None
    """
    # Two columns; even-indexed terms render on the left, odd on the right.
    columns = st.columns(2)
    for idx, term_name in enumerate(terms):
        grid = gam_model.generate_X_grid(term=idx)
        pdep, confi = gam_model.partial_dependence(term=idx, X=grid, width=0.95)
        xs = grid[:, idx]
        fig, ax = plt.subplots(figsize=(6, 6))
        # Partial-dependence curve plus its confidence band.
        ax.plot(xs, pdep, label='Partial Dependence')
        ax.plot(xs, confi, c='r', ls='--', label='Confidence Interval')
        # Locate the curve value at the user's input for this feature.
        user_x = input_data[idx]
        user_y = np.interp(user_x, xs, pdep)
        # Crosshair lines from the axes to the user's point; fractions are
        # normalized to the axis ranges as axvline/axhline expect.
        ax.axvline(x=user_x, color='b', linestyle='--', label='Model Input',
                   ymin=0, ymax=(user_y - y_min) / (y_max - y_min))
        ax.axhline(y=user_y, color='b', linestyle='--',
                   xmin=0, xmax=(user_x - xs[0]) / (xs[-1] - xs[0]))
        ax.annotate(f'{user_y:.2f}', (user_x, user_y),
                    textcoords="offset points", xytext=(0, 20), ha='center')
        ax.set_title(f'{term_name}')
        ax.set_xlabel(term_name)
        ax.set_ylabel('Partial Dependence')
        ax.legend(loc='best', fontsize='x-small')
        # Shared y-limits keep the per-feature panels comparable.
        ax.set_ylim(y_min, y_max)
        plt.tight_layout(pad=1.5)
        with columns[idx % 2]:
            st.write(f"Spline function: {term_name}:")
            st.pyplot(fig)
        plt.close(fig)
def prediction_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max, scale=1):
    """
    Generates the main component for the spline functions and LIME based upon the GAM and
    the neural network.
    :param X: pandas.DataFrame
        The feature matrix used to train the explainer.
    :param input_data: list or numpy.array
        The input data point to explain.
    :param dnn_model: torch.nn.Module
        The trained neural network model.
    :param gam_model: object
        The trained Generalized Additive Model
    :param terms: list
        List of feature names.
    :param y_min: float
        Minimum y-axis value
    :param y_max: float
        Maximum y-axis value
    :param scale: int
        The scale of the model's dataset's price
    :return: None
    """
    input_array = np.array(input_data).reshape(1, -1)
    # FIX: run inference in eval mode without building an autograd graph,
    # and extract a plain Python float with .item() so the f-string below
    # formats a number rather than a 0-d tensor.
    dnn_model.eval()
    with torch.no_grad():
        dnn_prediction = dnn_model(torch.FloatTensor(input_array))[0][0].item()
    gam_prediction = gam_model.predict(input_array)[0]
    st.markdown("# Predictions")
    st.write(f"- _General Additive Model Prediction:_ ${gam_prediction*scale:.2f}")
    st.write(f"- _Neural Network Prediction:_ ${dnn_prediction*scale:.2f}")
    st.markdown("----------------")
    st.markdown("## Spline Functions")
    spline_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max)
    st.markdown("----------------")
    st.markdown("## LIME Explanation for Neural Network")
    lime_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max)
def _hud_page():
    """Render the HUD Housing Dataset page: load the models, collect the
    sidebar inputs, and show predictions on demand."""
    dnn_model = HousePriceModel(8)
    dnn_model.load_state_dict(torch.load('models/dnn_model_hud.pth'))
    dnn_model.eval()
    with open('models/gam_model_hud.pkl', 'rb') as file:
        # NOTE(review): pickle.load on a bundled model file — safe only as
        # long as the file ships with the app.
        gam_model = pickle.load(file)
    # Held-out test MSEs reported for the pre-trained models.
    dnn_mse = 77202161664.0000
    gam_mse = 57308274833.9465
    input_data = [None] * 8
    terms = [
        'City/Suburban Status',
        'Census Region',
        'Area median income (average)',
        '# of bedrooms in unit',
        'Age of the house (years)',
        '# of rooms in unit',
        '# of Persons in Household',
        'Monthly utility cost'
    ]
    region_code = {
        'Northeast': 1,
        'Midwest': 2,
        'South': 3,
        'West': 4,
    }
    metro_code = {
        'Central cities of metropolitan areas': 1,
        'Inside metropolitan area, but not in central city': 2,
        'Inside metropolitan area, but not in central city - rural': 3,
        'Outside metropolitan areas, urbanized': 4,
        'Outside metropolitan areas, rural': 5,
    }
    with st.sidebar:
        st.title("Model Inputs")
        value = st.selectbox(f"{terms[0]}", metro_code.keys())
        input_data[0] = metro_code[value]
        # BUG FIX: this selectbox previously reused terms[0] as its label,
        # which both mislabeled the widget and collided with the widget
        # above (Streamlit raises DuplicateWidgetID for identical widgets).
        value = st.selectbox(f"{terms[1]}", region_code.keys())
        input_data[1] = region_code[value]
        input_data[2] = st.number_input(f"{terms[2]}", value=84200)
        input_data[3] = st.number_input(f"{terms[3]}", value=4)
        input_data[4] = st.number_input(f"{terms[4]}", value=9)
        input_data[5] = st.number_input(f"{terms[5]}", value=8)
        input_data[6] = st.number_input(f"{terms[6]}", value=3)
        input_data[7] = st.number_input(f"{terms[7]}", value=300)
    df = pd.read_csv('data/hud_dataset.csv', index_col=False)
    X, y = df.drop(columns=['VALUE']), df['VALUE']
    if st.button("Predict"):
        st.markdown("# Model Loss")
        st.write(f"- _Mean Squared Error (Generalized Additive Model):_ {gam_mse:.3f}")
        st.write(f"- _Mean Squared Error (Neural Network Model):_ {dnn_mse:.3f}")
        st.markdown("----------------")
        prediction_component(X, input_data, dnn_model, gam_model, terms,
                             y_min=-500000, y_max=500000)


def _california_page():
    """Render the California Housing Dataset page: load the models, collect
    the sidebar inputs, and show predictions on demand."""
    dnn_model = HousePriceModel(8)
    dnn_model.load_state_dict(torch.load('models/dnn_model_california.pth'))
    dnn_model.eval()
    with open('models/gam_model_california.pkl', 'rb') as file:
        gam_model = pickle.load(file)
    # Held-out test MSEs reported for the pre-trained models (target is in
    # units of $100,000, hence the small magnitudes).
    dnn_mse = 0.9678
    gam_mse = 0.3081
    input_data = [None] * 8
    terms = [
        'Median Income',
        'House Age (years)',
        'Average Rooms',
        'Average Bedrooms',
        'Population (average of census block group per county)',
        'Average Occupancy',
        'Latitude',
        'Longitude'
    ]
    counties = pd.read_csv('data/california_counties.csv')
    distinct_counties = list(counties['County'].unique())
    with st.sidebar:
        st.title("Model Inputs")
        # BUG FIX: the county picker was labeled terms[0] ('Median Income'),
        # which mislabeled the widget; label it for what it selects.
        county = st.selectbox("County", distinct_counties)
        counties = counties[counties['County'] == county]
        county_vals = counties.values[0]
        # 442 approximates the county's number of census block groups, so
        # this yields an average-per-block-group population.
        # TODO(review): confirm the 442 divisor against the data prep code.
        population = county_vals[4] / 442
        lat = county_vals[2]
        long = county_vals[3]
        value = st.number_input(f"{terms[0]}", value=84200)
        # Dataset stores median income in units of $10,000.
        input_data[0] = value / 10000
        input_data[1] = st.number_input(f"{terms[1]}", value=4)
        input_data[2] = st.number_input(f"{terms[2]}", value=9)
        input_data[3] = st.number_input(f"{terms[3]}", value=8)
        input_data[4] = population
        input_data[5] = st.number_input(f"{terms[5]}", value=3)
        input_data[6] = lat
        input_data[7] = long
    housing = fetch_california_housing()
    X, y = pd.DataFrame(housing.data), pd.DataFrame(housing.target)
    X.columns = housing.feature_names
    if st.button("Predict"):
        st.markdown("# Model Loss")
        st.write(f"- _Mean Squared Error (Generalized Additive Model):_ {gam_mse:.3f}")
        st.write(f"- _Mean Squared Error (Neural Network Model):_ {dnn_mse:.3f}")
        st.markdown("----------------")
        prediction_component(X, input_data, dnn_model, gam_model, terms,
                             y_min=-10, y_max=10, scale=100000)


def show():
    """Top-level Home page: intro copy, dataset picker, and the selected
    dataset's prediction UI."""
    st.markdown("# _Explainable AI for Housing Costs_")
    st.markdown("""
## _Models_
### California Housing Dataset:
> This dataset is more concentrated and is far less sparse as the dataset includes a smaller population from 1990 and only in California
### HUD Housing Dataset:
> This dataset is far more sparse and is calculated over the entire country with data from all the different regions of the country
""")
    st.markdown("----------------")
    model_versions = ['California Housing Dataset', 'HUD Housing Dataset']
    value = st.selectbox("Select model version", model_versions)
    st.markdown("----------------")
    st.write('Generate model predictions:')
    if value == 'HUD Housing Dataset':
        _hud_page()
    else:
        _california_page()
if __name__ == '__main__':
    # Page chrome must be configured before any other Streamlit call.
    st.set_page_config(page_title="Generalized Additive Models", page_icon="🚀")
    # Horizontal navigation bar rendered by streamlit-option-menu.
    selected_page = option_menu(
        menu_title=None,
        options=["Home", "About"],
        icons=["house", "book"],
        menu_icon="cast",
        default_index=0,
        orientation="horizontal",
    )
    if selected_page == "Home":
        show()
    elif selected_page == "About":
        st.switch_page("pages/about.py")
    st.info("This app is for explaining the problem domain using Generalized Additive Models")