Spaces:
Runtime error
Runtime error
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| import streamlit as st | |
| # MLflow and DagsHub initialization | |
| import mlflow | |
| import mlflow.sklearn | |
| import dagshub | |
| import shap | |
# Initialize the DagsHub client with its MLflow integration enabled.
dagshub.init(
    repo_owner="G.Brison",
    repo_name="test",
    mlflow=True,
)
| #from ydata_profiling import ProfileReport | |
| #from streamlit_pandas_profiling import st_profile_report | |
| from sklearn.preprocessing import LabelEncoder | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.linear_model import LinearRegression | |
| from sklearn.tree import DecisionTreeRegressor | |
| from sklearn.ensemble import RandomForestRegressor | |
| from xgboost import XGBRegressor | |
| from sklearn import metrics | |
# Streamlit page config
st.set_page_config(
    page_title="California Housing Dashboard π‘",
    layout="centered",
    page_icon="π‘",
)

# Sidebar setup: the selected label is compared verbatim by the page
# dispatch below, so every entry here must match its branch string exactly.
st.sidebar.title("California - Real Estate Agency π‘")
page = st.sidebar.selectbox(
    "Select Page",
    [
        "Introduction π",
        "Visualization π",
        "Automated Report π",
        "Prediction π€",
        "Explainability π",
        "MLflow Runs π",
    ],
)

# Display header image
st.image("https://huggingface.co/spaces/NYU-DS-4-Everyone/great/resolve/main/src/house2.png")


@st.cache_data
def load_housing_data(url):
    """Read the housing CSV at *url* into a DataFrame, cached across reruns.

    Streamlit re-executes the whole script on every widget interaction;
    without caching, the CSV would be downloaded again on each rerun.
    """
    return pd.read_csv(url)


df = load_housing_data("https://huggingface.co/spaces/NYU-DS-4-Everyone/great/resolve/main/src/housing.csv")
# Introduction Page
if page == "Introduction π":
    st.subheader("01 Introduction π")

    # Preview: the slider chooses how many leading rows to show (5 to 20).
    st.markdown("##### Data Preview")
    preview_rows = st.slider("Select a number of rows to display", 5, 20, 5)
    preview = df.head(preview_rows)
    st.dataframe(preview)

    # Per-column null counts, followed by a one-line verdict.
    st.markdown("##### Missing values")
    null_counts = df.isna().sum()
    st.write(null_counts)
    has_missing = null_counts.sum() != 0
    if has_missing:
        st.warning("β οΈ You have missing values")
    else:
        st.success("β No missing values found")

    # Descriptive statistics are only rendered on request.
    st.markdown("##### π Summary Statistics")
    if st.button("Show Describe Table"):
        summary = df.describe()
        st.dataframe(summary)
| # Visualization Page | |
| elif page == "Visualization π": | |
| st.subheader("02 Data Viz π") | |
| col_x = st.selectbox("Select X-axis variable", df.columns, index=0) | |
| col_y = st.selectbox("Select Y-axis variable", df.columns, index=1) | |
| tab1, tab2, tab3 = st.tabs(["Bar Chart π", "Line Chart π", "Correlation Heatmap π₯"]) | |
| with tab1: | |
| st.subheader("Bar Chart") | |
| st.bar_chart(df[[col_x, col_y]].sort_values(by=col_x), use_container_width=True) | |
| with tab2: | |
| st.subheader("Line Chart") | |
| st.line_chart(df[[col_x, col_y]].sort_values(by=col_x), use_container_width=True) | |
| with tab3: | |
| st.subheader("Correlation Matrix") | |
| df_numeric = df.select_dtypes(include=np.number) | |
| fig_corr, ax_corr = plt.subplots(figsize=(18, 14)) | |
| sns.heatmap(df_numeric.corr(), annot=True, fmt=".2f", cmap='coolwarm', ax=ax_corr) | |
| st.pyplot(fig_corr) | |
| # Automated Report Page | |
| #elif page == "Automated Report π": | |
| # st.subheader("03 Automated Report π") | |
| # if st.button("Generate Report"): | |
| # with st.spinner("Generating report..."): | |
| # profile = ProfileReport(df, title="California Housing Report", explorative=True, minimal=True) | |
| # st_profile_report(profile) | |
| # export = profile.to_html() | |
| # st.download_button( | |
| # label="π₯ Download full Report", | |
| # data=export, | |
| # file_name="california_housing_report.html", | |
| # mime='text/html', | |
| # ) | |
| # Prediction Page | |
| elif page == "Prediction π€": | |
| st.subheader("04 Prediction with MLflow Tracking π€") | |
| # Data preprocessing | |
| df2 = df.dropna().copy() | |
| le = LabelEncoder() | |
| df2["ocean_proximity"] = le.fit_transform(df2["ocean_proximity"]) | |
| # Feature/Target selection | |
| list_var = df2.columns.tolist() | |
| features_selection = st.sidebar.multiselect("Select Features (X)", list_var, default=list_var) | |
| target_selection = st.sidebar.selectbox("Select Target Variable (Y)", list_var, index=list_var.index('median_house_value') if 'median_house_value' in list_var else 0) | |
| # Model choice | |
| model_name = st.sidebar.selectbox( | |
| "Choose Model", | |
| ["Linear Regression", "Decision Tree", "Random Forest", "XGBoost"], | |
| ) | |
| # Hyperparameters | |
| params = {} | |
| if model_name == "Decision Tree": | |
| params['max_depth'] = st.sidebar.slider("Max Depth", 1, 20, 5) | |
| elif model_name == "Random Forest": | |
| params['n_estimators'] = st.sidebar.slider("Number of Estimators", 10, 500, 100) | |
| params['max_depth'] = st.sidebar.slider("Max Depth", 1, 20, 5) | |
| elif model_name == "XGBoost": | |
| params['n_estimators'] = st.sidebar.slider("Number of Estimators", 10, 500, 100) | |
| params['learning_rate'] = st.sidebar.slider("Learning Rate", 0.01, 0.5, 0.1, step=0.01) | |
| selected_metrics = st.sidebar.multiselect( | |
| "Metrics to display", | |
| ["Mean Squared Error (MSE)", "Mean Absolute Error (MAE)", "RΒ² Score"], | |
| default=["Mean Absolute Error (MAE)"], | |
| ) | |
| # Prepare data | |
| X = df2[features_selection] | |
| y = df2[target_selection] | |
| X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) | |
| # Instantiate model | |
| if model_name == "Linear Regression": | |
| model = LinearRegression() | |
| elif model_name == "Decision Tree": | |
| model = DecisionTreeRegressor(**params, random_state=42) | |
| elif model_name == "Random Forest": | |
| model = RandomForestRegressor(**params, random_state=42) | |
| elif model_name == "XGBoost": | |
| model = XGBRegressor(objective='reg:squarederror', **params, random_state=42) | |
| # Train, predict and log with MLflow | |
| with mlflow.start_run(run_name=model_name): | |
| mlflow.log_param("model", model_name) | |
| for k, v in params.items(): | |
| mlflow.log_param(k, v) | |
| model.fit(X_train, y_train) | |
| predictions = model.predict(X_test) | |
| # Log metrics | |
| mse = metrics.mean_squared_error(y_test, predictions) | |
| mae = metrics.mean_absolute_error(y_test, predictions) | |
| r2 = metrics.r2_score(y_test, predictions) | |
| mlflow.log_metric("mse", mse) | |
| mlflow.log_metric("mae", mae) | |
| mlflow.log_metric("r2", r2) | |
| # Display metrics | |
| st.write(f"**MSE:** {mse:,.2f}") | |
| st.write(f"**MAE:** {mae:,.2f}") | |
| st.write(f"**RΒ² Score:** {r2:.3f}") | |
| # Plot Actual vs Predicted | |
| fig, ax = plt.subplots() | |
| ax.scatter(y_test, predictions, alpha=0.5) | |
| ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], "--r", linewidth=2) | |
| ax.set_xlabel("Actual") | |
| ax.set_ylabel("Predicted") | |
| ax.set_title("Actual vs Predicted") | |
| st.pyplot(fig) | |
| # Explainability Page | |
| elif page == "Explainability π": | |
| st.subheader("06 Explainability π") | |
| # Load built-in California dataset for SHAP | |
| X_shap, y_shap = shap.datasets.california() | |
| # Train default XGBoost model for explainability | |
| model_exp = XGBRegressor(objective='reg:squarederror', n_estimators=100, random_state=42) | |
| model_exp.fit(X_shap, y_shap) | |
| # Create SHAP explainer and values | |
| explainer = shap.Explainer(model_exp) | |
| shap_values = explainer(X_shap) | |
| # SHAP Waterfall Plot for first prediction | |
| st.markdown("### SHAP Waterfall Plot for First Prediction") | |
| shap.plots.waterfall(shap_values[0], show=False) | |
| st.pyplot(plt.gcf()) | |
| # SHAP Scatter Plot for 'Latitude' | |
| st.markdown("### SHAP Scatter Plot for 'Latitude'") | |
| shap.plots.scatter(shap_values[:, "Latitude"], color=shap_values, show=False) | |
| st.pyplot(plt.gcf()) | |
| # MLflow Runs Page | |
| elif page == "MLflow Runs π": | |
| st.subheader("05 MLflow Runs π") | |
| # Fetch runs | |
| runs = mlflow.search_runs(order_by=["start_time desc"]) | |
| st.dataframe(runs) | |
| st.markdown( | |
| "View detailed runs on DagsHub: [G.Brison/test MLflow](https://dagshub.com/G.Brison/test.mlflow)" | |
| ) | |