# src/streamlit_app.py -- California Housing Dashboard (Streamlit app)
# Origin: Hugging Face Space "great" by gaetanbrison (commit b6646e5).
# Core data / plotting stack used across all pages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
# MLflow and DagsHub initialization
import mlflow
import mlflow.sklearn
import dagshub
import shap
# Initialize DagsHub with MLflow integration (points MLflow tracking at the
# DagsHub-hosted server so every run logged below lands there).
# NOTE(review): repo_owner/repo_name are hard-coded and this runs at import
# time for every visitor session -- confirm this is intended for a public app.
dagshub.init(repo_owner='G.Brison', repo_name='test', mlflow=True)
# Profiling imports are disabled; the "Automated Report" page depends on them.
#from ydata_profiling import ProfileReport
#from streamlit_pandas_profiling import st_profile_report
# scikit-learn / XGBoost pieces for the Prediction page.
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn import metrics
# Streamlit page config -- must be the first Streamlit call in the script.
st.set_page_config(
    page_title="California Housing Dashboard 🏑",
    layout="centered",
    page_icon="🏑",
)

# Sidebar: app title plus the page selector that drives the if/elif router below.
st.sidebar.title("California - Real Estate Agency 🏑")
_PAGE_NAMES = [
    "Introduction πŸ“˜",
    "Visualization πŸ“Š",
    "Automated Report πŸ“‘",
    "Prediction πŸ€–",
    "Explainability πŸ”",
    "MLflow Runs πŸ“ˆ",
]
page = st.sidebar.selectbox("Select Page", _PAGE_NAMES)

# Header image and the housing dataset, both served from the Hugging Face Space.
st.image("https://huggingface.co/spaces/NYU-DS-4-Everyone/great/resolve/main/src/house2.png")
df = pd.read_csv("https://huggingface.co/spaces/NYU-DS-4-Everyone/great/resolve/main/src/housing.csv")
# Introduction Page: data preview, missing-value report, summary statistics.
if page == "Introduction πŸ“˜":
    st.subheader("01 Introduction πŸ“˜")

    st.markdown("##### Data Preview")
    n_rows = st.slider("Select a number of rows to display", 5, 20, 5)
    st.dataframe(df.head(n_rows))

    st.markdown("##### Missing values")
    null_counts = df.isnull().sum()
    st.write(null_counts)
    if null_counts.sum() == 0:
        st.success("βœ… No missing values found")
    else:
        st.warning("⚠️ You have missing values")

    st.markdown("##### πŸ“ˆ Summary Statistics")
    if st.button("Show Describe Table"):
        st.dataframe(df.describe())
# Visualization Page
elif page == "Visualization πŸ“Š":
st.subheader("02 Data Viz πŸ“Š")
col_x = st.selectbox("Select X-axis variable", df.columns, index=0)
col_y = st.selectbox("Select Y-axis variable", df.columns, index=1)
tab1, tab2, tab3 = st.tabs(["Bar Chart πŸ“Š", "Line Chart πŸ“ˆ", "Correlation Heatmap πŸ”₯"])
with tab1:
st.subheader("Bar Chart")
st.bar_chart(df[[col_x, col_y]].sort_values(by=col_x), use_container_width=True)
with tab2:
st.subheader("Line Chart")
st.line_chart(df[[col_x, col_y]].sort_values(by=col_x), use_container_width=True)
with tab3:
st.subheader("Correlation Matrix")
df_numeric = df.select_dtypes(include=np.number)
fig_corr, ax_corr = plt.subplots(figsize=(18, 14))
sns.heatmap(df_numeric.corr(), annot=True, fmt=".2f", cmap='coolwarm', ax=ax_corr)
st.pyplot(fig_corr)
# Automated Report Page
#elif page == "Automated Report πŸ“‘":
# st.subheader("03 Automated Report πŸ“‘")
# if st.button("Generate Report"):
# with st.spinner("Generating report..."):
# profile = ProfileReport(df, title="California Housing Report", explorative=True, minimal=True)
# st_profile_report(profile)
# export = profile.to_html()
# st.download_button(
# label="πŸ“₯ Download full Report",
# data=export,
# file_name="california_housing_report.html",
# mime='text/html',
# )
# Prediction Page
elif page == "Prediction πŸ€–":
st.subheader("04 Prediction with MLflow Tracking πŸ€–")
# Data preprocessing
df2 = df.dropna().copy()
le = LabelEncoder()
df2["ocean_proximity"] = le.fit_transform(df2["ocean_proximity"])
# Feature/Target selection
list_var = df2.columns.tolist()
features_selection = st.sidebar.multiselect("Select Features (X)", list_var, default=list_var)
target_selection = st.sidebar.selectbox("Select Target Variable (Y)", list_var, index=list_var.index('median_house_value') if 'median_house_value' in list_var else 0)
# Model choice
model_name = st.sidebar.selectbox(
"Choose Model",
["Linear Regression", "Decision Tree", "Random Forest", "XGBoost"],
)
# Hyperparameters
params = {}
if model_name == "Decision Tree":
params['max_depth'] = st.sidebar.slider("Max Depth", 1, 20, 5)
elif model_name == "Random Forest":
params['n_estimators'] = st.sidebar.slider("Number of Estimators", 10, 500, 100)
params['max_depth'] = st.sidebar.slider("Max Depth", 1, 20, 5)
elif model_name == "XGBoost":
params['n_estimators'] = st.sidebar.slider("Number of Estimators", 10, 500, 100)
params['learning_rate'] = st.sidebar.slider("Learning Rate", 0.01, 0.5, 0.1, step=0.01)
selected_metrics = st.sidebar.multiselect(
"Metrics to display",
["Mean Squared Error (MSE)", "Mean Absolute Error (MAE)", "RΒ² Score"],
default=["Mean Absolute Error (MAE)"],
)
# Prepare data
X = df2[features_selection]
y = df2[target_selection]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Instantiate model
if model_name == "Linear Regression":
model = LinearRegression()
elif model_name == "Decision Tree":
model = DecisionTreeRegressor(**params, random_state=42)
elif model_name == "Random Forest":
model = RandomForestRegressor(**params, random_state=42)
elif model_name == "XGBoost":
model = XGBRegressor(objective='reg:squarederror', **params, random_state=42)
# Train, predict and log with MLflow
with mlflow.start_run(run_name=model_name):
mlflow.log_param("model", model_name)
for k, v in params.items():
mlflow.log_param(k, v)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
# Log metrics
mse = metrics.mean_squared_error(y_test, predictions)
mae = metrics.mean_absolute_error(y_test, predictions)
r2 = metrics.r2_score(y_test, predictions)
mlflow.log_metric("mse", mse)
mlflow.log_metric("mae", mae)
mlflow.log_metric("r2", r2)
# Display metrics
st.write(f"**MSE:** {mse:,.2f}")
st.write(f"**MAE:** {mae:,.2f}")
st.write(f"**RΒ² Score:** {r2:.3f}")
# Plot Actual vs Predicted
fig, ax = plt.subplots()
ax.scatter(y_test, predictions, alpha=0.5)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "--r", linewidth=2)
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.set_title("Actual vs Predicted")
st.pyplot(fig)
# Explainability Page
elif page == "Explainability πŸ”":
st.subheader("06 Explainability πŸ”")
# Load built-in California dataset for SHAP
X_shap, y_shap = shap.datasets.california()
# Train default XGBoost model for explainability
model_exp = XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
model_exp.fit(X_shap, y_shap)
# Create SHAP explainer and values
explainer = shap.Explainer(model_exp)
shap_values = explainer(X_shap)
# SHAP Waterfall Plot for first prediction
st.markdown("### SHAP Waterfall Plot for First Prediction")
shap.plots.waterfall(shap_values[0], show=False)
st.pyplot(plt.gcf())
# SHAP Scatter Plot for 'Latitude'
st.markdown("### SHAP Scatter Plot for 'Latitude'")
shap.plots.scatter(shap_values[:, "Latitude"], color=shap_values, show=False)
st.pyplot(plt.gcf())
# MLflow Runs Page
elif page == "MLflow Runs πŸ“ˆ":
st.subheader("05 MLflow Runs πŸ“ˆ")
# Fetch runs
runs = mlflow.search_runs(order_by=["start_time desc"])
st.dataframe(runs)
st.markdown(
"View detailed runs on DagsHub: [G.Brison/test MLflow](https://dagshub.com/G.Brison/test.mlflow)"
)