# Source: Hugging Face Space upload by keesephillips ("Upload 19 files", commit a7e459d verified)
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler,OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score
from sklearn.metrics import mean_squared_error
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile
from statsmodels.genmod.generalized_linear_model import GLM
from statsmodels.genmod.families import Gamma
from statsmodels.genmod.families.links import Log
from statsmodels.tools import add_constant
from pygam import LinearGAM, GammaGAM, s, f
import pickle
import streamlit.components.v1 as components
import streamlit as st
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from streamlit_option_menu import option_menu
from pygam import LinearGAM
import lime
from lime import lime_tabular
class HousePriceModel(nn.Module):
    """Feed-forward regressor for house prices.

    Architecture: input_size -> 128 -> 64 -> 32 -> 1, with a
    LeakyReLU(0.2) activation after each hidden linear layer.
    """

    def __init__(self, input_size):
        super().__init__()
        # Build the hidden stack from the width schedule so the layer
        # layout (and therefore the state_dict keys 0..6) matches a
        # hand-written Sequential of Linear/LeakyReLU pairs.
        widths = [input_size, 128, 64, 32]
        layers = []
        for w_in, w_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(w_in, w_out))
            layers.append(nn.LeakyReLU(0.2, inplace=True))
        layers.append(nn.Linear(widths[-1], 1))  # scalar price head
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """Return the predicted price for a (batch, input_size) tensor."""
        return self.model(x)
def lime_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max):
    """
    Generate and display a LIME (Local Interpretable Model-agnostic Explanations) component.
    :param X: pandas.DataFrame
        The feature matrix used to train the explainer.
    :param input_data: list or numpy.array
        The input data point to explain.
    :param dnn_model: torch.nn.Module
        The trained neural network model.
    :param gam_model: object
        The trained Generalized Additive Model (unused here; kept for a
        uniform component signature).
    :param terms: list
        List of feature names.
    :param y_min: float
        Minimum y-axis value (unused here; kept for a uniform signature).
    :param y_max: float
        Maximum y-axis value (unused here; kept for a uniform signature).
    :return: None
    """
    def nn_prediction(samples):
        # Wrap the torch model as a numpy -> numpy function for LIME,
        # without tracking gradients.
        input_tensor = torch.FloatTensor(samples)
        dnn_model.eval()
        with torch.no_grad():
            output = dnn_model(input_tensor)
        return output.cpu().numpy()

    kernel_width = 3
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X.values, mode='regression', feature_names=terms, kernel_width=kernel_width)
    exp = explainer.explain_instance(
        np.array(input_data), nn_prediction, num_features=len(input_data))
    feature_importance = exp.as_list()
    intercept = exp.intercept[0]
    formula = f"> _**y =** {intercept:.4f}"
    # BUG FIX: exp.as_list() is ordered by descending absolute importance,
    # NOT by the original column order, so the previous code that paired it
    # positionally with terms[i] mislabeled every coefficient. Use the
    # feature description LIME itself returns for each weight.
    for feature, importance in feature_importance:
        if importance > 0:
            formula += f" + {importance:.4f} * **{feature}**"
        else:
            formula += f" - {abs(importance):.4f} * **{feature}**"
    formula += '_'
    st.markdown(formula)
    lime_graph = exp.as_pyplot_figure()
    st.pyplot(lime_graph)
def spline_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max):
    """
    Generate the spline functions in a component.
    :param X: pandas.DataFrame
        The feature matrix used to train the explainer.
    :param input_data: list or numpy.array
        The input data point to explain.
    :param dnn_model: torch.nn.Module
        The trained neural network model (unused here; uniform signature).
    :param gam_model: object
        The trained Generalized Additive Model
    :param terms: list
        List of feature names.
    :param y_min: float
        Minimum y-axis value
    :param y_max: float
        Maximum y-axis value
    :return: None
    """
    # Two columns; even-indexed terms render on the left, odd on the right.
    columns = st.columns(2)
    for idx, term_name in enumerate(terms):
        grid = gam_model.generate_X_grid(term=idx)
        pdep, confi = gam_model.partial_dependence(term=idx, X=grid, width=0.95)
        xs = grid[:, idx]
        fig, ax = plt.subplots(figsize=(6, 6))
        # Partial-dependence curve plus its confidence band.
        ax.plot(xs, pdep, label='Partial Dependence')
        ax.plot(xs, confi, c='r', ls='--', label='Confidence Interval')
        # Locate the curve value at the user's input for this feature.
        user_x = input_data[idx]
        user_y = np.interp(user_x, xs, pdep)
        # Crosshair lines from the axes to the user's point; fractions are
        # normalized to the axis ranges as axvline/axhline expect.
        ax.axvline(x=user_x, color='b', linestyle='--', label='Model Input',
                   ymin=0, ymax=(user_y - y_min) / (y_max - y_min))
        ax.axhline(y=user_y, color='b', linestyle='--',
                   xmin=0, xmax=(user_x - xs[0]) / (xs[-1] - xs[0]))
        ax.annotate(f'{user_y:.2f}', (user_x, user_y),
                    textcoords="offset points", xytext=(0, 20), ha='center')
        ax.set_title(f'{term_name}')
        ax.set_xlabel(term_name)
        ax.set_ylabel('Partial Dependence')
        ax.legend(loc='best', fontsize='x-small')
        # Shared y-limits keep the per-feature panels comparable.
        ax.set_ylim(y_min, y_max)
        plt.tight_layout(pad=1.5)
        with columns[idx % 2]:
            st.write(f"Spline function: {term_name}:")
            st.pyplot(fig)
        plt.close(fig)
def prediction_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max, scale=1):
    """
    Generates the main component for the spline functions and LIME based upon the GAM and
    the neural network.
    :param X: pandas.DataFrame
        The feature matrix used to train the explainer.
    :param input_data: list or numpy.array
        The input data point to explain.
    :param dnn_model: torch.nn.Module
        The trained neural network model.
    :param gam_model: object
        The trained Generalized Additive Model
    :param terms: list
        List of feature names.
    :param y_min: float
        Minimum y-axis value
    :param y_max: float
        Maximum y-axis value
    :param scale: int
        The scale of the model's dataset's price
    :return: None
    """
    input_array = np.array(input_data).reshape(1, -1)
    # FIX: run inference in eval mode without building an autograd graph,
    # and extract a plain Python float with .item() so the f-string below
    # formats a number rather than a 0-d tensor.
    dnn_model.eval()
    with torch.no_grad():
        dnn_prediction = dnn_model(torch.FloatTensor(input_array))[0][0].item()
    gam_prediction = gam_model.predict(input_array)[0]
    st.markdown("# Predictions")
    st.write(f"- _General Additive Model Prediction:_ ${gam_prediction*scale:.2f}")
    st.write(f"- _Neural Network Prediction:_ ${dnn_prediction*scale:.2f}")
    st.markdown("----------------")
    st.markdown("## Spline Functions")
    spline_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max)
    st.markdown("----------------")
    st.markdown("## LIME Explanation for Neural Network")
    lime_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max)
def _hud_page():
    """Render the HUD Housing Dataset page: load the models, collect the
    sidebar inputs, and show predictions on demand."""
    dnn_model = HousePriceModel(8)
    dnn_model.load_state_dict(torch.load('models/dnn_model_hud.pth'))
    dnn_model.eval()
    with open('models/gam_model_hud.pkl', 'rb') as file:
        # NOTE(review): pickle.load on a bundled model file — safe only as
        # long as the file ships with the app.
        gam_model = pickle.load(file)
    # Held-out test MSEs reported for the pre-trained models.
    dnn_mse = 77202161664.0000
    gam_mse = 57308274833.9465
    input_data = [None] * 8
    terms = [
        'City/Suburban Status',
        'Census Region',
        'Area median income (average)',
        '# of bedrooms in unit',
        'Age of the house (years)',
        '# of rooms in unit',
        '# of Persons in Household',
        'Monthly utility cost'
    ]
    region_code = {
        'Northeast': 1,
        'Midwest': 2,
        'South': 3,
        'West': 4,
    }
    metro_code = {
        'Central cities of metropolitan areas': 1,
        'Inside metropolitan area, but not in central city': 2,
        'Inside metropolitan area, but not in central city - rural': 3,
        'Outside metropolitan areas, urbanized': 4,
        'Outside metropolitan areas, rural': 5,
    }
    with st.sidebar:
        st.title("Model Inputs")
        value = st.selectbox(f"{terms[0]}", metro_code.keys())
        input_data[0] = metro_code[value]
        # BUG FIX: this selectbox previously reused terms[0] as its label,
        # which both mislabeled the widget and collided with the widget
        # above (Streamlit raises DuplicateWidgetID for identical widgets).
        value = st.selectbox(f"{terms[1]}", region_code.keys())
        input_data[1] = region_code[value]
        input_data[2] = st.number_input(f"{terms[2]}", value=84200)
        input_data[3] = st.number_input(f"{terms[3]}", value=4)
        input_data[4] = st.number_input(f"{terms[4]}", value=9)
        input_data[5] = st.number_input(f"{terms[5]}", value=8)
        input_data[6] = st.number_input(f"{terms[6]}", value=3)
        input_data[7] = st.number_input(f"{terms[7]}", value=300)
    df = pd.read_csv('data/hud_dataset.csv', index_col=False)
    X, y = df.drop(columns=['VALUE']), df['VALUE']
    if st.button("Predict"):
        st.markdown("# Model Loss")
        st.write(f"- _Mean Squared Error (Generalized Additive Model):_ {gam_mse:.3f}")
        st.write(f"- _Mean Squared Error (Neural Network Model):_ {dnn_mse:.3f}")
        st.markdown("----------------")
        prediction_component(X, input_data, dnn_model, gam_model, terms,
                             y_min=-500000, y_max=500000)


def _california_page():
    """Render the California Housing Dataset page: load the models, collect
    the sidebar inputs, and show predictions on demand."""
    dnn_model = HousePriceModel(8)
    dnn_model.load_state_dict(torch.load('models/dnn_model_california.pth'))
    dnn_model.eval()
    with open('models/gam_model_california.pkl', 'rb') as file:
        gam_model = pickle.load(file)
    # Held-out test MSEs reported for the pre-trained models (target is in
    # units of $100,000, hence the small magnitudes).
    dnn_mse = 0.9678
    gam_mse = 0.3081
    input_data = [None] * 8
    terms = [
        'Median Income',
        'House Age (years)',
        'Average Rooms',
        'Average Bedrooms',
        'Population (average of census block group per county)',
        'Average Occupancy',
        'Latitude',
        'Longitude'
    ]
    counties = pd.read_csv('data/california_counties.csv')
    distinct_counties = list(counties['County'].unique())
    with st.sidebar:
        st.title("Model Inputs")
        # BUG FIX: the county picker was labeled terms[0] ('Median Income'),
        # which mislabeled the widget; label it for what it selects.
        county = st.selectbox("County", distinct_counties)
        counties = counties[counties['County'] == county]
        county_vals = counties.values[0]
        # 442 approximates the county's number of census block groups, so
        # this yields an average-per-block-group population.
        # TODO(review): confirm the 442 divisor against the data prep code.
        population = county_vals[4] / 442
        lat = county_vals[2]
        long = county_vals[3]
        value = st.number_input(f"{terms[0]}", value=84200)
        # Dataset stores median income in units of $10,000.
        input_data[0] = value / 10000
        input_data[1] = st.number_input(f"{terms[1]}", value=4)
        input_data[2] = st.number_input(f"{terms[2]}", value=9)
        input_data[3] = st.number_input(f"{terms[3]}", value=8)
        input_data[4] = population
        input_data[5] = st.number_input(f"{terms[5]}", value=3)
        input_data[6] = lat
        input_data[7] = long
    housing = fetch_california_housing()
    X, y = pd.DataFrame(housing.data), pd.DataFrame(housing.target)
    X.columns = housing.feature_names
    if st.button("Predict"):
        st.markdown("# Model Loss")
        st.write(f"- _Mean Squared Error (Generalized Additive Model):_ {gam_mse:.3f}")
        st.write(f"- _Mean Squared Error (Neural Network Model):_ {dnn_mse:.3f}")
        st.markdown("----------------")
        prediction_component(X, input_data, dnn_model, gam_model, terms,
                             y_min=-10, y_max=10, scale=100000)


def show():
    """Top-level Home page: intro copy, dataset picker, and the selected
    dataset's prediction UI."""
    st.markdown("# _Explainable AI for Housing Costs_")
    st.markdown("""
## _Models_
### California Housing Dataset:
> This dataset is more concentrated and is far less sparse as the dataset includes a smaller population from 1990 and only in California
### HUD Housing Dataset:
> This dataset is far more sparse and is calculated over the entire country with data from all the different regions of the country
""")
    st.markdown("----------------")
    model_versions = ['California Housing Dataset', 'HUD Housing Dataset']
    value = st.selectbox("Select model version", model_versions)
    st.markdown("----------------")
    st.write('Generate model predictions:')
    if value == 'HUD Housing Dataset':
        _hud_page()
    else:
        _california_page()
if __name__ == '__main__':
    # Page chrome must be configured before any other Streamlit call.
    st.set_page_config(page_title="Generalized Additive Models", page_icon="🚀")
    # Horizontal navigation bar rendered by streamlit-option-menu.
    selected_page = option_menu(
        menu_title=None,
        options=["Home", "About"],
        icons=["house", "book"],
        menu_icon="cast",
        default_index=0,
        orientation="horizontal",
    )
    if selected_page == "Home":
        show()
    elif selected_page == "About":
        st.switch_page("pages/about.py")
    st.info("This app is for explaining the problem domain using Generalized Additive Models")