# Page-status text captured along with this listing ("Spaces: Sleeping"); not part of the program.
| import streamlit as st | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import seaborn as sns | |
| from sklearn.datasets import fetch_california_housing | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.linear_model import LinearRegression | |
| from sklearn.ensemble import RandomForestRegressor | |
| from sklearn.metrics import mean_squared_error, r2_score | |
| import plotly.express as px | |
| import plotly.graph_objects as go | |
# Page-level settings: browser-tab title and icon, wide layout, sidebar open.
_PAGE_SETTINGS = {
    "page_title": "California Housing Data Explorer",
    "page_icon": "π ",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_SETTINGS)

# Stylesheet injected into the page: centred blue header, grey "metric card"
# with a blue left border, and a light sidebar background.
_CUSTOM_CSS = """
<style>
.main-header {
font-size: 2.5rem;
color: #1f77b4;
text-align: center;
margin-bottom: 2rem;
}
.metric-card {
background-color: #f0f2f6;
padding: 1rem;
border-radius: 0.5rem;
border-left: 5px solid #1f77b4;
}
.sidebar .sidebar-content {
background-color: #f8f9fa;
}
</style>
"""
# unsafe_allow_html is required for the <style> element to be emitted as HTML.
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)
# Load and cache data
@st.cache_data
def load_housing_data():
    """Load the California housing dataset into a DataFrame.

    The result is memoised with ``st.cache_data`` so the dataset is fetched
    and the frame is built once instead of on every Streamlit rerun (each
    widget interaction re-executes the whole script).

    Returns:
        tuple: ``(df, housing)`` where

        * ``df`` holds one column per entry of ``housing.feature_names``,
          a ``'target'`` column copied from ``housing.target``, and a
          categorical ``'price_category'`` column that bins ``'target'``
          into ``(0, 1.5]``, ``(1.5, 3.0]``, ``(3.0, 5.0]`` and
          ``(5.0, inf]`` labelled ``'Low'``, ``'Medium'``, ``'High'`` and
          ``'Very High'``.
        * ``housing`` is the object returned by
          ``fetch_california_housing()``.
    """
    housing = fetch_california_housing()

    df = pd.DataFrame(housing.data, columns=housing.feature_names)
    df['target'] = housing.target

    # pd.cut uses right-closed intervals by default, so a value of exactly
    # 1.5 lands in 'Low', 3.0 in 'Medium' and 5.0 in 'High'.
    price_bins = [0, 1.5, 3.0, 5.0, float('inf')]
    price_labels = ['Low', 'Medium', 'High', 'Very High']
    df['price_category'] = pd.cut(df['target'], bins=price_bins, labels=price_labels)

    return df, housing
# Train models
@st.cache_resource
def train_models(X, y, test_size=0.2, random_state=42):
    """Fit a linear regression and a random forest and score both on a hold-out split.

    Decorated with ``st.cache_resource``: for identical arguments Streamlit
    hands back the already-fitted result instead of refitting the 100-tree
    forest on every rerun. The cached object is shared between calls, so
    callers must treat the returned dict (and the models inside it) as
    read-only.

    Args:
        X: Feature table passed to ``train_test_split`` and the estimators.
        y: Target values aligned with ``X``.
        test_size: Fraction of rows held out for evaluation (default 0.2).
        random_state: Seed for the split and for the random forest
            (default 42).

    Returns:
        dict with four entries, each model-keyed entry using the names
        ``'Linear Regression'`` and ``'Random Forest'`` in that order:

        * ``'models'``: name -> fitted estimator
        * ``'predictions'``: name -> predictions for the hold-out rows
        * ``'metrics'``: name -> ``{'MSE': ..., 'RΒ²': ...}``
        * ``'test_data'``: ``(X_test, y_test)``
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Insertion order defines the order of every model-keyed dict returned.
    estimators = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=random_state),
    }

    predictions = {}
    metrics = {}
    for name, estimator in estimators.items():
        estimator.fit(X_train, y_train)
        predicted = estimator.predict(X_test)
        predictions[name] = predicted
        # NOTE(review): the 'RΒ²' key looks like a mis-encoded 'R²'. It is kept
        # verbatim because main() reads the metric under this exact key.
        metrics[name] = {
            'MSE': mean_squared_error(y_test, predicted),
            'RΒ²': r2_score(y_test, predicted),
        }

    return {
        'models': estimators,
        'predictions': predictions,
        'metrics': metrics,
        'test_data': (X_test, y_test),
    }
# Key under which train_models() stores the R-squared score. The spelling is
# the (apparently mis-encoded) one used by train_models and must match it.
_R2_KEY = 'RΒ²'

# NOTE(review): several labels below contain 'π…' sequences that look like
# mis-decoded emoji. The intended glyphs cannot be recovered from this file,
# so those literals are reproduced unchanged. Only 'Γ' -> '×' and 'Β²' -> '²'
# (unambiguous two-byte sequences) were restored, in display-only text.


def _render_overview(df: pd.DataFrame, feature_cols: list) -> None:
    """Draw the Overview tab: headline metrics, first rows, histogram, describe()."""
    st.subheader("Dataset Overview")

    house_values = df['target']
    headline = (
        ("Total Houses", f"{len(df):,}"),
        ("Avg House Value", f"${house_values.mean():.2f}×100k"),
        ("Max House Value", f"${house_values.max():.2f}×100k"),
        ("Features", len(feature_cols)),
    )
    for column, (label, value) in zip(st.columns(4), headline):
        with column:
            st.metric(label, value)
    st.markdown("---")

    sample_col, hist_col = st.columns([2, 1])
    with sample_col:
        st.subheader("π Data Sample")
        st.dataframe(df.head(10), use_container_width=True)
    with hist_col:
        st.subheader("π Price Distribution")
        fig = px.histogram(
            df, x='target', nbins=50,
            title="House Value Distribution",
            labels={'target': 'House Value (×$100k)', 'count': 'Frequency'},
        )
        fig.update_layout(height=400)
        st.plotly_chart(fig, use_container_width=True)

    st.subheader("π Statistical Summary")
    st.dataframe(df[feature_cols + ['target']].describe(), use_container_width=True)


def _render_exploration(df: pd.DataFrame, selected_features: list) -> None:
    """Draw the Exploratory Analysis tab: heatmap, scatter, box plot and map."""
    st.subheader("Exploratory Data Analysis")
    if not selected_features:
        # Returning here only leaves this tab; the other tabs still render.
        st.warning("Please select at least one feature from the sidebar.")
        return

    # Correlation heatmap
    st.subheader("π₯ Feature Correlation Matrix")
    corr_matrix = df[selected_features + ['target']].corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
    # Work on the explicit figure/axes rather than pyplot's "current figure".
    ax.set_title("Feature Correlation Matrix")
    st.pyplot(fig)
    plt.close(fig)

    # Feature relationships
    st.subheader("π Feature Relationships")
    scatter_col, box_col = st.columns(2)
    with scatter_col:
        # index=1 below needs a second option, hence the length check.
        if len(selected_features) >= 2:
            feature_x = st.selectbox("X-axis Feature", selected_features, key="x_axis")
            feature_y = st.selectbox("Y-axis Feature", selected_features, index=1, key="y_axis")
            fig = px.scatter(
                df, x=feature_x, y=feature_y, color='target',
                title=f"{feature_x} vs {feature_y}",
                color_continuous_scale='viridis',
            )
            st.plotly_chart(fig, use_container_width=True)
    with box_col:
        selected_feature = st.selectbox("Feature for Distribution", selected_features)
        fig = px.box(
            df, y=selected_feature, x='price_category',
            title=f"{selected_feature} by Price Category",
        )
        st.plotly_chart(fig, use_container_width=True)

    # Geographic analysis (only when coordinates are available)
    if {'Longitude', 'Latitude'} <= set(df.columns):
        st.subheader("πΊοΈ Geographic Distribution")
        # Seeded so the same points are drawn on every rerun, and clamped so
        # a frame with fewer than 5000 rows does not make sample() raise.
        map_points = df.sample(n=min(5000, len(df)), random_state=42)
        fig = px.scatter_mapbox(
            map_points, lat='Latitude', lon='Longitude',
            color='target', size='target',
            hover_data=['AveRooms', 'AveBedrms', 'Population'],
            color_continuous_scale='viridis',
            mapbox_style='open-street-map',
            title='California Housing Prices by Location',
            height=600,
        )
        st.plotly_chart(fig, use_container_width=True)


def _render_models(selected_features: list, results: dict) -> None:
    """Draw the model comparison, feature importances and predicted-vs-actual plots."""
    st.subheader("π Model Performance")
    table_col, chart_col = st.columns(2)
    with table_col:
        metrics_df = pd.DataFrame(results['metrics']).T
        # Rename for display only; the lookup key itself is left untouched.
        st.dataframe(metrics_df.rename(columns={_R2_KEY: 'R²'}), use_container_width=True)
    with chart_col:
        models = list(results['metrics'])
        r2_scores = [results['metrics'][model][_R2_KEY] for model in models]
        fig = px.bar(
            x=models, y=r2_scores,
            title="Model Performance (R² Score)",
            labels={'x': 'Model', 'y': 'R² Score'},
        )
        st.plotly_chart(fig, use_container_width=True)

    # Feature importance (Random Forest)
    st.subheader("π― Feature Importance (Random Forest)")
    importance_df = pd.DataFrame({
        'Feature': selected_features,
        'Importance': results['models']['Random Forest'].feature_importances_,
    }).sort_values('Importance', ascending=False)
    fig = px.bar(
        importance_df, x='Importance', y='Feature', orientation='h',
        title="Feature Importance in Random Forest Model",
    )
    st.plotly_chart(fig, use_container_width=True)

    # Prediction vs Actual, one panel per model
    st.subheader("π― Predictions vs Actual Values")
    _, y_test = results['test_data']
    panels = zip(st.columns(2), ('Linear Regression', 'Random Forest'))
    for column, model_name in panels:
        with column:
            fig = px.scatter(
                x=y_test, y=results['predictions'][model_name],
                title=f"{model_name}: Predicted vs Actual",
                labels={'x': 'Actual', 'y': 'Predicted'},
            )
            # Dashed y = x reference line from (0, 0) to (5, 5).
            fig.add_shape(type="line", x0=0, y0=0, x1=5, y1=5,
                          line=dict(dash="dash", color="red"))
            st.plotly_chart(fig, use_container_width=True)


def _render_predictions(df: pd.DataFrame, selected_features: list, results: dict) -> None:
    """Draw one slider per selected feature and, on click, both models' predictions."""
    st.write("Adjust the feature values below to predict house prices:")

    # One slider per feature, laid out round-robin over at most three columns;
    # each slider spans the feature's observed range and starts at its mean.
    input_data = {}
    cols = st.columns(min(3, len(selected_features)))
    for i, feature in enumerate(selected_features):
        values = df[feature]
        with cols[i % len(cols)]:
            input_data[feature] = st.slider(
                f"{feature}",
                min_value=float(values.min()),
                max_value=float(values.max()),
                value=float(values.mean()),
                key=f"pred_{feature}",
            )

    if not st.button("π― Predict House Price", type="primary"):
        return

    # Dict insertion order follows selected_features, so the single-row frame
    # has the same column order the models were fitted on.
    input_df = pd.DataFrame([input_data])
    panels = zip(st.columns(2), ('Linear Regression', 'Random Forest'))
    for column, model_name in panels:
        with column:
            pred = results['models'][model_name].predict(input_df)[0]
            # '$' is escaped because st.success renders Markdown, where a pair
            # of '$' can be read as an inline LaTeX span.
            st.success(
                f"**{model_name} Prediction:** \n"
                f"\\${pred:.2f} × 100k = \\${pred*100:.0f}k"
            )

    # Show input summary.
    # NOTE(review): placed inside the button branch; the original indentation
    # is not recoverable from this file, so confirm this is the intended level.
    st.subheader("π Input Summary")
    input_summary = pd.DataFrame([input_data]).T
    input_summary.columns = ['Value']
    st.dataframe(input_summary, use_container_width=True)


def main():
    """Build the whole page: header, sidebar controls and the four tabs.

    Loads the data through ``load_housing_data()`` (showing an error and
    stopping if that raises), lets the user pick features in the sidebar,
    then fills the Overview, Exploratory Analysis, ML Models and Predictions
    tabs. Models are trained at most once per run and shared by the last two
    tabs. Each tab shows its own warning when no feature is selected.
    """
    # Header
    st.markdown(
        '<h1 class="main-header">π California Housing Data Explorer</h1>',
        unsafe_allow_html=True,
    )

    # Load data; this is the app's top-level boundary, so any failure is
    # reported in the page instead of surfacing as a traceback.
    try:
        df, _ = load_housing_data()
    except Exception as e:
        st.error(f"Error loading data: {e}")
        return

    feature_cols = [col for col in df.columns if col not in ['target', 'price_category']]

    # Sidebar
    st.sidebar.title("π§ Controls")
    st.sidebar.markdown("---")
    st.sidebar.subheader("π Dataset Info")
    st.sidebar.info(
        f"\n**Samples:** {len(df):,}"
        f"\n**Features:** {len(feature_cols)}"
        "\n**Target:** House Value (×$100k)\n"
    )
    selected_features = st.sidebar.multiselect(
        "Select Features for Analysis",
        feature_cols,
        default=feature_cols[:4],
    )

    # Main content tabs
    overview_tab, explore_tab, models_tab, predict_tab = st.tabs(
        ["π Overview", "π Exploratory Analysis", "π€ ML Models", "π― Predictions"]
    )

    with overview_tab:
        _render_overview(df, feature_cols)

    with explore_tab:
        _render_exploration(df, selected_features)

    # Every tab body runs on each rerun, so train once here (the spinner shows
    # inside this tab) and hand the same result to the Predictions tab rather
    # than fitting the 100-tree forest a second time.
    results = None
    with models_tab:
        st.subheader("Machine Learning Models")
        if selected_features:
            with st.spinner("Training models..."):
                results = train_models(df[selected_features], df['target'])
            _render_models(selected_features, results)
        else:
            st.warning("Please select features for model training.")

    with predict_tab:
        st.subheader("Make House Price Predictions")
        if results is None:
            st.warning("Please select features to make predictions.")
        else:
            _render_predictions(df, selected_features, results)
# Call main() only when this file is executed as the entry script, not when it is imported.
if __name__ == "__main__":
    main()