# src/streamlit_app.py -- California Housing Dashboard (Streamlit app)
# Origin: Hugging Face Space "great" by gaetanbrison (commit b6646e5).
# Core data / plotting stack used across all pages.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
# MLflow and DagsHub initialization
import mlflow
import mlflow.sklearn
import dagshub
import shap
# Initialize DagsHub with MLflow integration (points MLflow tracking at the
# DagsHub-hosted server so every run logged below lands there).
# NOTE(review): repo_owner/repo_name are hard-coded and this runs at import
# time for every visitor session -- confirm this is intended for a public app.
dagshub.init(repo_owner='G.Brison', repo_name='test', mlflow=True)
# Profiling imports are disabled; the "Automated Report" page depends on them.
#from ydata_profiling import ProfileReport
#from streamlit_pandas_profiling import st_profile_report
# scikit-learn / XGBoost pieces for the Prediction page.
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn import metrics
# Streamlit page config -- must be the first Streamlit call in the script.
st.set_page_config(
    page_title="California Housing Dashboard 🏑",
    layout="centered",
    page_icon="🏑",
)

# Sidebar: app title plus the page selector that drives the if/elif router below.
st.sidebar.title("California - Real Estate Agency 🏑")
_PAGE_NAMES = [
    "Introduction πŸ“˜",
    "Visualization πŸ“Š",
    "Automated Report πŸ“‘",
    "Prediction πŸ€–",
    "Explainability πŸ”",
    "MLflow Runs πŸ“ˆ",
]
page = st.sidebar.selectbox("Select Page", _PAGE_NAMES)

# Header image and the housing dataset, both served from the Hugging Face Space.
st.image("https://huggingface.co/spaces/NYU-DS-4-Everyone/great/resolve/main/src/house2.png")
df = pd.read_csv("https://huggingface.co/spaces/NYU-DS-4-Everyone/great/resolve/main/src/housing.csv")
# Introduction Page: data preview, missing-value report, summary statistics.
if page == "Introduction πŸ“˜":
    st.subheader("01 Introduction πŸ“˜")

    st.markdown("##### Data Preview")
    n_rows = st.slider("Select a number of rows to display", 5, 20, 5)
    st.dataframe(df.head(n_rows))

    st.markdown("##### Missing values")
    null_counts = df.isnull().sum()
    st.write(null_counts)
    if null_counts.sum() == 0:
        st.success("βœ… No missing values found")
    else:
        st.warning("⚠️ You have missing values")

    st.markdown("##### πŸ“ˆ Summary Statistics")
    if st.button("Show Describe Table"):
        st.dataframe(df.describe())
# Visualization Page
elif page == "Visualization πŸ“Š":
st.subheader("02 Data Viz πŸ“Š")
col_x = st.selectbox("Select X-axis variable", df.columns, index=0)
col_y = st.selectbox("Select Y-axis variable", df.columns, index=1)
tab1, tab2, tab3 = st.tabs(["Bar Chart πŸ“Š", "Line Chart πŸ“ˆ", "Correlation Heatmap πŸ”₯"])
with tab1:
st.subheader("Bar Chart")
st.bar_chart(df[[col_x, col_y]].sort_values(by=col_x), use_container_width=True)
with tab2:
st.subheader("Line Chart")
st.line_chart(df[[col_x, col_y]].sort_values(by=col_x), use_container_width=True)
with tab3:
st.subheader("Correlation Matrix")
df_numeric = df.select_dtypes(include=np.number)
fig_corr, ax_corr = plt.subplots(figsize=(18, 14))
sns.heatmap(df_numeric.corr(), annot=True, fmt=".2f", cmap='coolwarm', ax=ax_corr)
st.pyplot(fig_corr)
# Automated Report Page
#elif page == "Automated Report πŸ“‘":
# st.subheader("03 Automated Report πŸ“‘")
# if st.button("Generate Report"):
# with st.spinner("Generating report..."):
# profile = ProfileReport(df, title="California Housing Report", explorative=True, minimal=True)
# st_profile_report(profile)
# export = profile.to_html()
# st.download_button(
# label="πŸ“₯ Download full Report",
# data=export,
# file_name="california_housing_report.html",
# mime='text/html',
# )
# Prediction Page
elif page == "Prediction πŸ€–":
st.subheader("04 Prediction with MLflow Tracking πŸ€–")
# Data preprocessing
df2 = df.dropna().copy()
le = LabelEncoder()
df2["ocean_proximity"] = le.fit_transform(df2["ocean_proximity"])
# Feature/Target selection
list_var = df2.columns.tolist()
features_selection = st.sidebar.multiselect("Select Features (X)", list_var, default=list_var)
target_selection = st.sidebar.selectbox("Select Target Variable (Y)", list_var, index=list_var.index('median_house_value') if 'median_house_value' in list_var else 0)
# Model choice
model_name = st.sidebar.selectbox(
"Choose Model",
["Linear Regression", "Decision Tree", "Random Forest", "XGBoost"],
)
# Hyperparameters
params = {}
if model_name == "Decision Tree":
params['max_depth'] = st.sidebar.slider("Max Depth", 1, 20, 5)
elif model_name == "Random Forest":
params['n_estimators'] = st.sidebar.slider("Number of Estimators", 10, 500, 100)
params['max_depth'] = st.sidebar.slider("Max Depth", 1, 20, 5)
elif model_name == "XGBoost":
params['n_estimators'] = st.sidebar.slider("Number of Estimators", 10, 500, 100)
params['learning_rate'] = st.sidebar.slider("Learning Rate", 0.01, 0.5, 0.1, step=0.01)
selected_metrics = st.sidebar.multiselect(
"Metrics to display",
["Mean Squared Error (MSE)", "Mean Absolute Error (MAE)", "RΒ² Score"],
default=["Mean Absolute Error (MAE)"],
)
# Prepare data
X = df2[features_selection]
y = df2[target_selection]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Instantiate model
if model_name == "Linear Regression":
model = LinearRegression()
elif model_name == "Decision Tree":
model = DecisionTreeRegressor(**params, random_state=42)
elif model_name == "Random Forest":
model = RandomForestRegressor(**params, random_state=42)
elif model_name == "XGBoost":
model = XGBRegressor(objective='reg:squarederror', **params, random_state=42)
# Train, predict and log with MLflow
with mlflow.start_run(run_name=model_name):
mlflow.log_param("model", model_name)
for k, v in params.items():
mlflow.log_param(k, v)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
# Log metrics
mse = metrics.mean_squared_error(y_test, predictions)
mae = metrics.mean_absolute_error(y_test, predictions)
r2 = metrics.r2_score(y_test, predictions)
mlflow.log_metric("mse", mse)
mlflow.log_metric("mae", mae)
mlflow.log_metric("r2", r2)
# Display metrics
st.write(f"**MSE:** {mse:,.2f}")
st.write(f"**MAE:** {mae:,.2f}")
st.write(f"**RΒ² Score:** {r2:.3f}")
# Plot Actual vs Predicted
fig, ax = plt.subplots()
ax.scatter(y_test, predictions, alpha=0.5)
ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "--r", linewidth=2)
ax.set_xlabel("Actual")
ax.set_ylabel("Predicted")
ax.set_title("Actual vs Predicted")
st.pyplot(fig)
# Explainability Page
elif page == "Explainability πŸ”":
st.subheader("06 Explainability πŸ”")
# Load built-in California dataset for SHAP
X_shap, y_shap = shap.datasets.california()
# Train default XGBoost model for explainability
model_exp = XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42)
model_exp.fit(X_shap, y_shap)
# Create SHAP explainer and values
explainer = shap.Explainer(model_exp)
shap_values = explainer(X_shap)
# SHAP Waterfall Plot for first prediction
st.markdown("### SHAP Waterfall Plot for First Prediction")
shap.plots.waterfall(shap_values[0], show=False)
st.pyplot(plt.gcf())
# SHAP Scatter Plot for 'Latitude'
st.markdown("### SHAP Scatter Plot for 'Latitude'")
shap.plots.scatter(shap_values[:, "Latitude"], color=shap_values, show=False)
st.pyplot(plt.gcf())
# MLflow Runs Page
elif page == "MLflow Runs πŸ“ˆ":
st.subheader("05 MLflow Runs πŸ“ˆ")
# Fetch runs
runs = mlflow.search_runs(order_by=["start_time desc"])
st.dataframe(runs)
st.markdown(
"View detailed runs on DagsHub: [G.Brison/test MLflow](https://dagshub.com/G.Brison/test.mlflow)"
)