# Source: Hugging Face Space eagle0504/explore_housing_data
# Space status when captured: Sleeping
# File size: 12,309 bytes

import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
import plotly.graph_objects as go

# Page-level settings: browser tab title and icon, wide layout, sidebar open.
_PAGE_OPTIONS = {
    "page_title": "California Housing Data Explorer",
    "page_icon": "🏠",
    "layout": "wide",
    "initial_sidebar_state": "expanded",
}
st.set_page_config(**_PAGE_OPTIONS)

# Stylesheet injected as raw HTML; it defines the `main-header` class used by
# the page title, plus `metric-card` and sidebar background rules.
_CUSTOM_CSS = """
<style>
    .main-header {
        font-size: 2.5rem;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .metric-card {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 0.5rem;
        border-left: 5px solid #1f77b4;
    }
    .sidebar .sidebar-content {
        background-color: #f8f9fa;
    }
</style>
"""
st.markdown(_CUSTOM_CSS, unsafe_allow_html=True)

# Load and cache data
@st.cache_data
def load_housing_data():
    """Fetch the California housing dataset and shape it into a DataFrame.

    Returns:
        A ``(df, housing)`` tuple. ``df`` holds one column per dataset
        feature, a ``target`` column, and a ``price_category`` column that
        buckets ``target`` into 'Low', 'Medium', 'High' and 'Very High'.
        ``housing`` is the object returned by ``fetch_california_housing``.
    """
    housing = fetch_california_housing()

    frame = pd.DataFrame(housing.data, columns=housing.feature_names)
    frame['target'] = housing.target

    # Bucket edges are expressed in the same units as ``target``.
    price_edges = [0, 1.5, 3.0, 5.0, float('inf')]
    price_labels = ['Low', 'Medium', 'High', 'Very High']
    frame['price_category'] = pd.cut(
        frame['target'], bins=price_edges, labels=price_labels
    )

    return frame, housing

# Train models
@st.cache_data
def train_models(X, y):
    """Fit a linear regression and a random forest and score both.

    The data is split 80/20 with a fixed seed; each model is fitted on the
    training part and evaluated on the held-out part.

    Args:
        X: Feature table used as model input.
        y: Target values aligned with ``X``.

    Returns:
        A dict with four entries, the first three keyed by model name
        ('Linear Regression', 'Random Forest'):
            'models'      -> fitted estimator
            'predictions' -> predictions on the held-out split
            'metrics'     -> {'MSE': ..., 'R²': ...} on the held-out split
            'test_data'   -> the ``(X_test, y_test)`` tuple
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    estimators = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    }

    fitted, held_out_preds, scores = {}, {}, {}
    for label, estimator in estimators.items():
        estimator.fit(X_train, y_train)
        preds = estimator.predict(X_test)

        fitted[label] = estimator
        held_out_preds[label] = preds
        scores[label] = {
            'MSE': mean_squared_error(y_test, preds),
            'R²': r2_score(y_test, preds),
        }

    return {
        'models': fitted,
        'predictions': held_out_preds,
        'metrics': scores,
        'test_data': (X_test, y_test),
    }

def _render_sidebar(df, feature_cols):
    """Draw the sidebar and return the list of features the user selected."""
    st.sidebar.title("🔧 Controls")
    st.sidebar.markdown("---")

    # Dataset info. Line breaks are spelled as explicit "  \n" so they do not
    # depend on trailing whitespace surviving inside a multi-line literal.
    st.sidebar.subheader("📊 Dataset Info")
    st.sidebar.info(
        f"**Samples:** {len(df):,}  \n"
        f"**Features:** {len(feature_cols)}  \n"
        "**Target:** House Value (×$100k)"
    )

    # Feature selection
    return st.sidebar.multiselect(
        "Select Features for Analysis",
        feature_cols,
        default=feature_cols[:4]
    )


def _render_overview_tab(df, feature_cols):
    """Render headline metrics, a data sample, the target histogram and summary stats."""
    st.subheader("Dataset Overview")

    # Key metrics
    headline_metrics = [
        ("Total Houses", f"{len(df):,}"),
        ("Avg House Value", f"${df['target'].mean():.2f}×100k"),
        ("Max House Value", f"${df['target'].max():.2f}×100k"),
        ("Features", len(feature_cols)),
    ]
    for column, (label, value) in zip(st.columns(4), headline_metrics):
        with column:
            st.metric(label, value)

    st.markdown("---")

    # Data preview
    sample_col, hist_col = st.columns([2, 1])

    with sample_col:
        st.subheader("📋 Data Sample")
        st.dataframe(df.head(10), use_container_width=True)

    with hist_col:
        st.subheader("📊 Price Distribution")
        fig = px.histogram(df, x='target', nbins=50,
                           title="House Value Distribution",
                           labels={'target': 'House Value (×$100k)', 'count': 'Frequency'})
        fig.update_layout(height=400)
        st.plotly_chart(fig, use_container_width=True)

    # Statistical summary
    st.subheader("📈 Statistical Summary")
    st.dataframe(df[feature_cols + ['target']].describe(), use_container_width=True)


def _render_exploration_tab(df, selected_features):
    """Render the correlation heatmap, feature plots and map for the selected features."""
    st.subheader("Exploratory Data Analysis")

    if not selected_features:
        st.warning("Please select at least one feature from the sidebar.")
        # Leaves only this tab; main() still renders the remaining tabs.
        return

    # Correlation heatmap
    st.subheader("🔥 Feature Correlation Matrix")
    corr_matrix = df[list(selected_features) + ['target']].corr()

    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
    # Title and close the figure we created instead of pyplot's implicit
    # "current" figure.
    ax.set_title("Feature Correlation Matrix")
    st.pyplot(fig)
    plt.close(fig)

    # Feature relationships
    st.subheader("📊 Feature Relationships")

    scatter_col, box_col = st.columns(2)

    with scatter_col:
        if len(selected_features) >= 2:
            feature_x = st.selectbox("X-axis Feature", selected_features, key="x_axis")
            feature_y = st.selectbox("Y-axis Feature", selected_features, index=1, key="y_axis")

            fig = px.scatter(df, x=feature_x, y=feature_y, color='target',
                             title=f"{feature_x} vs {feature_y}",
                             color_continuous_scale='viridis')
            st.plotly_chart(fig, use_container_width=True)
        else:
            # Explain the otherwise empty column.
            st.info("Select at least two features to compare them in a scatter plot.")

    with box_col:
        selected_feature = st.selectbox("Feature for Distribution", selected_features)

        fig = px.box(df, y=selected_feature, x='price_category',
                     title=f"{selected_feature} by Price Category")
        st.plotly_chart(fig, use_container_width=True)

    # Geographic analysis (if coordinates available)
    if 'Longitude' in df.columns and 'Latitude' in df.columns:
        st.subheader("🗺️ Geographic Distribution")

        # A fixed seed keeps the plotted subset stable across reruns; the
        # min() guard avoids an error if the frame has fewer than 5000 rows.
        map_sample = df.sample(n=min(5000, len(df)), random_state=42)
        fig = px.scatter_mapbox(map_sample, lat='Latitude', lon='Longitude',
                                color='target', size='target',
                                hover_data=['AveRooms', 'AveBedrms', 'Population'],
                                color_continuous_scale='viridis',
                                mapbox_style='open-street-map',
                                title='California Housing Prices by Location',
                                height=600)
        st.plotly_chart(fig, use_container_width=True)


def _render_models_tab(df, selected_features):
    """Train both models on the selected features and render their evaluation."""
    st.subheader("Machine Learning Models")

    if not selected_features:
        st.warning("Please select features for model training.")
        return

    # Train models
    X = df[selected_features]
    y = df['target']

    with st.spinner("Training models..."):
        results = train_models(X, y)

    # Model comparison
    st.subheader("📊 Model Performance")

    table_col, chart_col = st.columns(2)

    with table_col:
        # Metrics table: one row per model, one column per metric.
        metrics_df = pd.DataFrame(results['metrics']).T
        st.dataframe(metrics_df, use_container_width=True)

    with chart_col:
        # Performance visualization
        model_names = list(results['metrics'])
        r2_scores = [results['metrics'][name]['R²'] for name in model_names]

        fig = px.bar(x=model_names, y=r2_scores,
                     title="Model Performance (R² Score)",
                     labels={'x': 'Model', 'y': 'R² Score'})
        st.plotly_chart(fig, use_container_width=True)

    # Feature importance (Random Forest)
    st.subheader("🎯 Feature Importance (Random Forest)")

    rf_model = results['models']['Random Forest']
    importance_df = pd.DataFrame({
        'Feature': selected_features,
        'Importance': rf_model.feature_importances_
    }).sort_values('Importance', ascending=False)

    fig = px.bar(importance_df, x='Importance', y='Feature', orientation='h',
                 title="Feature Importance in Random Forest Model")
    st.plotly_chart(fig, use_container_width=True)

    # Prediction vs Actual
    st.subheader("🎯 Predictions vs Actual Values")

    _, y_test = results['test_data']

    # One column per model, in the order train_models() returned them.
    prediction_items = list(results['predictions'].items())
    for column, (model_name, predicted) in zip(st.columns(len(prediction_items)), prediction_items):
        with column:
            fig = px.scatter(x=y_test, y=predicted,
                             title=f"{model_name}: Predicted vs Actual",
                             labels={'x': 'Actual', 'y': 'Predicted'})
            # Dashed y = x reference line from 0 to 5.
            fig.add_shape(type="line", x0=0, y0=0, x1=5, y1=5,
                          line=dict(dash="dash", color="red"))
            st.plotly_chart(fig, use_container_width=True)


def _render_predictions_tab(df, selected_features):
    """Render one slider per selected feature and predict a price from their values."""
    st.subheader("Make House Price Predictions")

    if not selected_features:
        st.warning("Please select features to make predictions.")
        return

    # Train models for prediction
    results = train_models(df[selected_features], df['target'])

    st.write("Adjust the feature values below to predict house prices:")

    # Create input widgets, laid out round-robin over at most three columns.
    input_data = {}
    cols = st.columns(min(3, len(selected_features)))

    for i, feature in enumerate(selected_features):
        with cols[i % len(cols)]:
            input_data[feature] = st.slider(
                f"{feature}",
                min_value=float(df[feature].min()),
                max_value=float(df[feature].max()),
                value=float(df[feature].mean()),
                key=f"pred_{feature}"
            )

    # Make predictions
    if st.button("🎯 Predict House Price", type="primary"):
        # Pin the column order to the one the models were trained with.
        input_df = pd.DataFrame([input_data], columns=list(selected_features))

        model_items = list(results['models'].items())
        for column, (model_name, model) in zip(st.columns(len(model_items)), model_items):
            with column:
                predicted = model.predict(input_df)[0]
                # "$" is escaped because Streamlit Markdown renders text
                # between two dollar signs as LaTeX.
                st.success(
                    f"**{model_name} Prediction:**  \n"
                    f"\\${predicted:.2f} × 100k = \\${predicted * 100:.0f}k"
                )

        # Show input summary
        st.subheader("📋 Input Summary")
        input_summary = pd.DataFrame([input_data]).T
        input_summary.columns = ['Value']
        st.dataframe(input_summary, use_container_width=True)


def main():
    """Build the page: header, sidebar controls and the four content tabs.

    If the dataset cannot be loaded, the error is shown on the page and
    nothing else is rendered. Each tab is drawn by its own helper so that a
    tab which has nothing to show (no features selected) returns early
    without stopping the tabs that follow it.
    """
    # Header
    st.markdown('<h1 class="main-header">🏠 California Housing Data Explorer</h1>', unsafe_allow_html=True)

    # Load data
    try:
        df, _housing = load_housing_data()
    except Exception as e:
        # UI boundary: report the failure on the page rather than crash.
        st.error(f"Error loading data: {e}")
        return

    # Every column except the target and its derived category is a feature.
    feature_cols = [col for col in df.columns if col not in ('target', 'price_category')]
    selected_features = _render_sidebar(df, feature_cols)

    # Main content tabs
    tab1, tab2, tab3, tab4 = st.tabs(["📈 Overview", "🔍 Exploratory Analysis", "🤖 ML Models", "🎯 Predictions"])

    with tab1:
        _render_overview_tab(df, feature_cols)

    with tab2:
        _render_exploration_tab(df, selected_features)

    with tab3:
        _render_models_tab(df, selected_features)

    with tab4:
        _render_predictions_tab(df, selected_features)

# Build the page only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()