Spaces:
Sleeping
Sleeping
| import torch | |
| import torch.nn as nn | |
| import torch.optim as optim | |
| from sklearn.datasets import fetch_california_housing | |
| from sklearn.model_selection import train_test_split | |
| import numpy as np | |
| import pandas as pd | |
| from sklearn.preprocessing import StandardScaler,OrdinalEncoder | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.linear_model import LinearRegression, LogisticRegression | |
| from sklearn.metrics import classification_report, recall_score, precision_score, accuracy_score | |
| from sklearn.metrics import mean_squared_error | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import zipfile | |
| from statsmodels.genmod.generalized_linear_model import GLM | |
| from statsmodels.genmod.families import Gamma | |
| from statsmodels.genmod.families.links import Log | |
| from statsmodels.tools import add_constant | |
| from pygam import LinearGAM, GammaGAM, s, f | |
| import pickle | |
| import streamlit.components.v1 as components | |
| import streamlit as st | |
| import numpy as np | |
| import pandas as pd | |
| import matplotlib.pyplot as plt | |
| from streamlit_option_menu import option_menu | |
| from pygam import LinearGAM | |
| import lime | |
| from lime import lime_tabular | |
class HousePriceModel(nn.Module):
    """
    Fully connected regressor that maps a feature vector to a single value.

    The network is four linear layers (input_size -> 128 -> 64 -> 32 -> 1)
    with a LeakyReLU(0.2) activation after each of the first three. The layers
    are stored under the ``model`` attribute as one ``nn.Sequential``, so
    state-dict keys have the form ``model.<index>.<parameter>``.
    """

    def __init__(self, input_size):
        """
        :param input_size: int
            Number of input features per sample.
        """
        super().__init__()
        widths = (input_size, 128, 64, 32)
        layers = []
        # Hidden stages: Linear followed by an in-place LeakyReLU.
        for n_in, n_out in zip(widths, widths[1:]):
            layers.append(nn.Linear(n_in, n_out))
            layers.append(nn.LeakyReLU(0.2, inplace=True))
        # Output stage: a single unit with no activation.
        layers.append(nn.Linear(widths[-1], 1))
        self.model = nn.Sequential(*layers)

    def forward(self, x):
        """
        :param x: torch.Tensor
            Input passed straight to the layer stack.
        :return: torch.Tensor
            Output of the final linear layer.
        """
        return self.model(x)
def lime_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max):
    """
    Generate and display a LIME (Local Interpretable Model-agnostic Explanations) component.

    Builds a tabular LIME explainer in regression mode over ``X``, explains the
    neural network's prediction for ``input_data``, then writes the local
    surrogate as a Markdown formula and renders LIME's bar chart via Streamlit.

    :param X: pandas.DataFrame
        The feature matrix used to train the explainer.
    :param input_data: list or numpy.array
        The input data point to explain.
    :param dnn_model: torch.nn.Module
        The trained neural network model.
    :param gam_model: object
        The trained Generalized Additive Model (unused here; kept so all
        components share one signature).
    :param terms: list
        List of feature names.
    :param y_min: float
        Minimum y-axis value (unused here).
    :param y_max: float
        Maximum y-axis value (unused here).
    :return: None
    """
    def nn_prediction(batch):
        # LIME calls this with a 2-D array of perturbed samples.
        features = torch.as_tensor(batch, dtype=torch.float32)
        dnn_model.eval()
        with torch.no_grad():
            output = dnn_model(features)
        return output.cpu().numpy()

    kernel_width = 3
    explainer = lime.lime_tabular.LimeTabularExplainer(
        X.values,
        mode='regression',
        feature_names=terms,
        kernel_width=kernel_width,
    )
    exp = explainer.explain_instance(
        np.array(input_data), nn_prediction, num_features=len(input_data)
    )

    intercept = exp.intercept[0]
    pieces = [f"> _**y =** {intercept:.4f}"]
    # as_list() yields (feature description, weight) pairs in LIME's own
    # order, not in the order of `terms`. Label each weight with the
    # description it came with instead of indexing `terms` by position,
    # which attached weights to the wrong feature names.
    for feature, importance in exp.as_list():
        sign = '+' if importance >= 0 else '-'
        pieces.append(f" {sign} {abs(importance):.4f} * **{feature}**")
    pieces.append('_')
    st.markdown("".join(pieces))

    lime_graph = exp.as_pyplot_figure()
    st.pyplot(lime_graph)
def spline_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max):
    """
    Generate the spline functions in a component.

    For every entry of ``terms`` this plots the GAM's partial dependence with
    its 95% interval, marks the user's input on the curve with dashed guide
    lines, and renders the figures alternately into two Streamlit columns.

    :param X: pandas.DataFrame
        The feature matrix used to train the explainer (unused here; kept so
        all components share one signature).
    :param input_data: list or numpy.array
        The input data point to explain.
    :param dnn_model: torch.nn.Module
        The trained neural network model (unused here).
    :param gam_model: object
        The trained Generalized Additive Model
    :param terms: list
        List of feature names.
    :param y_min: float
        Minimum y-axis value
    :param y_max: float
        Maximum y-axis value
    :return: None
    """
    columns = st.columns(2)
    for i, term_name in enumerate(terms):
        # assumes GAM term i corresponds to feature column i — as in the
        # original indexing; TODO confirm against how the GAM was fitted.
        XX = gam_model.generate_X_grid(term=i)
        pdep, confi = gam_model.partial_dependence(term=i, X=XX, width=0.95)
        grid = XX[:, i]

        fig, ax = plt.subplots(figsize=(6, 6))

        # Plot partial dependence
        ax.plot(grid, pdep, label='Partial Dependence')
        # Label only the first interval line so the legend shows a single
        # 'Confidence Interval' entry rather than one per bound.
        interval_lines = ax.plot(grid, confi, c='r', ls='--')
        if interval_lines:
            interval_lines[0].set_label('Confidence Interval')

        # Find y-value corresponding to user input
        user_x = input_data[i]
        user_y = np.interp(user_x, grid, pdep)

        # Set consistent y-axis limits
        ax.set_ylim(y_min, y_max)

        # Vertical guide from the bottom of the axes up to the curve. ymin and
        # ymax are axes fractions, so they are derived from the fixed y-limits.
        ax.axvline(x=user_x, color='b', linestyle='--', label='Model Input',
                   ymin=0, ymax=(user_y - y_min) / (y_max - y_min))

        # Horizontal guide from the left edge to the input. xmin and xmax are
        # axes fractions too, so compute them from the actual x-limits (which
        # include autoscale margins), then freeze those limits.
        x_lo, x_hi = ax.get_xlim()
        ax.set_xlim(x_lo, x_hi)
        ax.axhline(y=user_y, color='b', linestyle='--',
                   xmin=0, xmax=(user_x - x_lo) / (x_hi - x_lo))

        ax.annotate(f'{user_y:.2f}', (user_x, user_y),
                    textcoords="offset points", xytext=(0, 20), ha='center')
        ax.set_title(f'{term_name}')
        ax.set_xlabel(term_name)
        ax.set_ylabel('Partial Dependence')
        ax.legend(loc='best', fontsize='x-small')
        fig.tight_layout(pad=1.5)

        # Alternate between columns
        with columns[i % 2]:
            st.write(f"Spline function: {term_name}:")
            st.pyplot(fig)
        plt.close(fig)
def prediction_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max, scale=1):
    """
    Generates the main component for the spline functions and LIME based upon the GAM and
    the neural network.

    Shows both models' predictions for ``input_data`` (multiplied by
    ``scale``), followed by the spline plots and the LIME explanation.

    :param X: pandas.DataFrame
        The feature matrix used to train the explainer.
    :param input_data: list or numpy.array
        The input data point to explain.
    :param dnn_model: torch.nn.Module
        The trained neural network model.
    :param gam_model: object
        The trained Generalized Additive Model
    :param terms: list
        List of feature names.
    :param y_min: float
        Minimum y-axis value
    :param y_max: float
        Maximum y-axis value
    :param scale: int
        The scale of the model's dataset's price
    :return: None
    """
    input_array = np.array(input_data).reshape(1, -1)

    # Inference only: skip autograd bookkeeping and pull out a plain Python
    # float so the value formats without depending on tensor formatting.
    dnn_model.eval()
    with torch.no_grad():
        dnn_output = dnn_model(torch.as_tensor(input_array, dtype=torch.float32))
    dnn_prediction = dnn_output[0][0].item()
    gam_prediction = gam_model.predict(input_array)[0]

    st.markdown("# Predictions")
    st.write(f"- _Generalized Additive Model Prediction:_ ${gam_prediction*scale:.2f}")
    st.write(f"- _Neural Network Prediction:_ ${dnn_prediction*scale:.2f}")
    st.markdown("----------------")
    st.markdown("## Spline Functions")
    spline_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max)
    st.markdown("----------------")
    st.markdown("## LIME Explanation for Neural Network")
    lime_component(X, input_data, dnn_model, gam_model, terms, y_min, y_max)
def _load_models(dnn_path, gam_path, input_size=8):
    """Load the neural network weights and the pickled GAM from disk."""
    dnn_model = HousePriceModel(input_size)
    # map_location lets a checkpoint saved on a GPU load on a CPU-only host.
    dnn_model.load_state_dict(torch.load(dnn_path, map_location='cpu'))
    dnn_model.eval()
    # NOTE(security): pickle.load can execute arbitrary code; only ever point
    # this at model files shipped with the app, never at user-supplied files.
    with open(gam_path, 'rb') as file:
        gam_model = pickle.load(file)
    return dnn_model, gam_model


def _render_results(X, input_data, dnn_model, gam_model, terms,
                    dnn_mse, gam_mse, y_min, y_max, scale=1):
    """Show the Predict button and, once pressed, the losses and explanations."""
    if st.button("Predict"):
        st.markdown("# Model Loss")
        st.write(f"- _Mean Squared Error (Generalized Additive Model):_ {gam_mse:.3f}")
        st.write(f"- _Mean Squared Error (Neural Network Model):_ {dnn_mse:.3f}")
        st.markdown("----------------")
        prediction_component(X, input_data, dnn_model, gam_model, terms,
                             y_min=y_min, y_max=y_max, scale=scale)


def show():
    """
    Render the home page: dataset choice, sidebar inputs and model output.

    The user picks a model version (California or HUD). The matching neural
    network and GAM are loaded from ``models/``, the eight model inputs are
    collected in the sidebar, and pressing "Predict" shows the stored losses,
    both predictions, the spline plots and the LIME explanation.

    :return: None
    """
    st.markdown("# _Explainable AI for Housing Costs_")
    st.markdown("""
## _Models_
### California Housing Dataset:
> This dataset is more concentrated and is far less sparse as the dataset includes a smaller population from 1990 and only in California
### HUD Housing Dataset:
> This dataset is far more sparse and is calculated over the entire country with data from all the different regions of the country
""")
    st.markdown("----------------")
    model_versions = ['California Housing Dataset', 'HUD Housing Dataset']
    model_version = st.selectbox("Select model version", model_versions)
    st.markdown("----------------")
    st.write('Generate model predictions:')

    if model_version == 'HUD Housing Dataset':
        dnn_model, gam_model = _load_models(
            'models/dnn_model_hud.pth', 'models/gam_model_hud.pkl'
        )
        # Losses are hard-coded here, not recomputed at runtime.
        dnn_mse = 77202161664.0000
        gam_mse = 57308274833.9465
        terms = [
            'City/Suburban Status',
            'Census Region',
            'Area median income (average)',
            '# of bedrooms in unit',
            'Age of the house (years)',
            '# of rooms in unit',
            '# of Persons in Household',
            'Monthly utility cost',
        ]
        region_code = {
            'Northeast': 1,
            'Midwest': 2,
            'South': 3,
            'West': 4,
        }
        metro_code = {
            'Central cities of metropolitan areas': 1,
            'Inside metropolitan area, but not in central city': 2,
            'Inside metropolitan area, but not in central city - rural': 3,
            'Outside metropolitan areas, urbanized': 4,
            'Outside metropolitan areas, rural': 5,
        }
        with st.sidebar:
            st.title("Model Inputs")
            metro = st.selectbox(terms[0], list(metro_code))
            # Labelled with terms[1]; this widget previously reused terms[0],
            # so the region picker was shown as 'City/Suburban Status'.
            region = st.selectbox(terms[1], list(region_code))
            # Widgets render in list order, which matches the order of `terms`.
            input_data = [
                metro_code[metro],
                region_code[region],
                st.number_input(terms[2], value=84200),
                st.number_input(terms[3], value=4),
                st.number_input(terms[4], value=9),
                st.number_input(terms[5], value=8),
                st.number_input(terms[6], value=3),
                st.number_input(terms[7], value=300),
            ]
        df = pd.read_csv('data/hud_dataset.csv', index_col=False)
        X = df.drop(columns=['VALUE'])
        _render_results(X, input_data, dnn_model, gam_model, terms,
                        dnn_mse, gam_mse, y_min=-500000, y_max=500000)
    else:
        dnn_model, gam_model = _load_models(
            'models/dnn_model_california.pth', 'models/gam_model_california.pkl'
        )
        # Losses are hard-coded here, not recomputed at runtime.
        dnn_mse = 0.9678
        gam_mse = 0.3081
        terms = [
            'Median Income',
            'House Age (years)',
            'Average Rooms',
            'Average Bedrooms',
            'Population (average of census block group per county)',
            'Average Occupancy',
            'Latitude',
            'Longitude',
        ]
        counties = pd.read_csv('data/california_counties.csv')
        distinct_counties = list(counties['County'].unique())
        with st.sidebar:
            st.title("Model Inputs")
            # Own label; this picker was previously shown as 'Median Income'.
            county = st.selectbox("County", distinct_counties)
            # Columns are read by position: 2, 3 and 4 feed latitude,
            # longitude and population respectively.
            county_vals = counties[counties['County'] == county].values[0]
            # NOTE(review): 442 presumably converts a county total into a
            # per-block-group average — confirm where the constant comes from.
            population = county_vals[4] / 442
            lat = county_vals[2]
            long = county_vals[3]
            median_income = st.number_input(terms[0], value=84200)
            house_age = st.number_input(terms[1], value=4)
            avg_rooms = st.number_input(terms[2], value=9)
            avg_bedrooms = st.number_input(terms[3], value=8)
            avg_occupancy = st.number_input(terms[5], value=3)
        input_data = [
            median_income / 10000,  # entered value is divided by 10,000 before use
            house_age,
            avg_rooms,
            avg_bedrooms,
            population,
            avg_occupancy,
            lat,
            long,
        ]
        housing = fetch_california_housing()
        X = pd.DataFrame(housing.data, columns=housing.feature_names)
        _render_results(X, input_data, dnn_model, gam_model, terms,
                        dnn_mse, gam_mse, y_min=-10, y_max=10, scale=100000)
# Script entry point: configure the page, draw the top navigation menu and
# dispatch to the selected page.
if __name__ == '__main__':
    st.set_page_config(page_title="Generalized Additive Models", page_icon="🚀")

    # Menu entry -> icon name, kept together so the two lists stay in step.
    menu_icons = {"Home": "house", "About": "book"}
    page = option_menu(
        menu_title=None,
        options=list(menu_icons),
        icons=list(menu_icons.values()),
        menu_icon="cast",
        default_index=0,
        orientation="horizontal",
    )

    if page == "About":
        st.switch_page("pages/about.py")
    elif page == "Home":
        show()

    st.info("This app is for explaining the problem domain using Generalized Additive Models")