import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
import plotly.graph_objects as go
# Page-level Streamlit configuration; must be the first st.* call in the script.
st.set_page_config(
    page_title="California Housing Data Explorer",
    page_icon="🏠",  # original file had a mojibake glyph here; restored to the house emoji
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS hook. The stylesheet body was lost upstream (the triple-quoted
# string is empty), so this call currently injects nothing; kept as the
# single place to add app-wide CSS later.
st.markdown("""
""", unsafe_allow_html=True)
# Load and cache data
@st.cache_data
def load_housing_data():
    """Fetch the California housing dataset and return (DataFrame, raw bunch).

    The frame carries the eight numeric features, a 'target' column
    (median house value in units of $100k), and a 'price_category'
    categorical derived by binning the target into four labelled ranges.
    Cached by Streamlit so the download/parse happens once per session.
    """
    bunch = fetch_california_housing()
    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame['target'] = bunch.target
    frame['price_category'] = pd.cut(
        frame['target'],
        bins=[0, 1.5, 3.0, 5.0, float('inf')],
        labels=['Low', 'Medium', 'High', 'Very High'],
    )
    return frame, bunch
# Train models
@st.cache_data
def train_models(X, y):
    """Fit a LinearRegression and a RandomForestRegressor on an 80/20 split.

    Returns a dict with the fitted models, their held-out predictions,
    per-model MSE/R² metrics, and the (X_test, y_test) pair. Cached by
    Streamlit, so repeated calls with identical inputs are free.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    estimators = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    }

    fitted, preds, metrics = {}, {}, {}
    for label, estimator in estimators.items():
        estimator.fit(X_train, y_train)
        y_hat = estimator.predict(X_test)
        fitted[label] = estimator
        preds[label] = y_hat
        # NOTE: the 'RΒ²' key is mojibake of 'R²' inherited from the original
        # file; downstream code indexes it verbatim, so it is kept unchanged.
        metrics[label] = {
            'MSE': mean_squared_error(y_test, y_hat),
            'RΒ²': r2_score(y_test, y_hat),
        }

    return {
        'models': fitted,
        'predictions': preds,
        'metrics': metrics,
        'test_data': (X_test, y_test),
    }
def main():
    """Top-level Streamlit entry point: sidebar controls plus four content tabs.

    Fixes vs. the original:
    - The header `st.markdown('...')` spanned multiple lines inside a
      single-quoted string (a SyntaxError); reconstructed as valid HTML.
    - `return` statements inside `with tabN:` bodies aborted main() entirely,
      leaving later tabs blank whenever no features were selected; each tab
      is now a helper, so `return` skips only that tab.
    - Mojibake in display-only strings (emoji, ×, ²) restored.
    """
    # Header (display-only; the CSS class the original referenced is gone).
    st.markdown(
        '<h1 style="text-align: center;">🏠 California Housing Data Explorer</h1>',
        unsafe_allow_html=True,
    )

    # Load data; the app cannot do anything useful without it.
    try:
        df, housing_info = load_housing_data()
    except Exception as e:
        st.error(f"Error loading data: {e}")
        return

    # --- Sidebar: dataset summary + feature picker --------------------------
    st.sidebar.title("🔧 Controls")
    st.sidebar.markdown("---")
    st.sidebar.subheader("📊 Dataset Info")
    st.sidebar.info(f"""
    **Samples:** {len(df):,}
    **Features:** {len(df.columns) - 2}
    **Target:** House Value (×$100k)
    """)

    # Everything except the target and its derived category is a feature.
    feature_cols = [col for col in df.columns if col not in ['target', 'price_category']]
    selected_features = st.sidebar.multiselect(
        "Select Features for Analysis",
        feature_cols,
        default=feature_cols[:4],
    )

    tab1, tab2, tab3, tab4 = st.tabs(
        ["📊 Overview", "📈 Exploratory Analysis", "🤖 ML Models", "🎯 Predictions"]
    )
    with tab1:
        _render_overview(df, feature_cols)
    with tab2:
        _render_eda(df, selected_features)
    with tab3:
        _render_models(df, selected_features)
    with tab4:
        _render_predictions(df, selected_features)


def _render_overview(df, feature_cols):
    """Tab 1: headline metrics, a raw-data sample, and summary statistics."""
    st.subheader("Dataset Overview")

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Houses", f"{len(df):,}")
    with col2:
        st.metric("Avg House Value", f"${df['target'].mean():.2f}×100k")
    with col3:
        st.metric("Max House Value", f"${df['target'].max():.2f}×100k")
    with col4:
        st.metric("Features", len(feature_cols))

    st.markdown("---")

    col1, col2 = st.columns([2, 1])
    with col1:
        st.subheader("📋 Data Sample")
        st.dataframe(df.head(10), use_container_width=True)
    with col2:
        st.subheader("📊 Price Distribution")
        fig = px.histogram(
            df, x='target', nbins=50,
            title="House Value Distribution",
            labels={'target': 'House Value (×$100k)', 'count': 'Frequency'},
        )
        fig.update_layout(height=400)
        st.plotly_chart(fig, use_container_width=True)

    st.subheader("📈 Statistical Summary")
    st.dataframe(df[feature_cols + ['target']].describe(), use_container_width=True)


def _render_eda(df, selected_features):
    """Tab 2: correlation heatmap, pairwise plots, and the geographic map."""
    st.subheader("Exploratory Data Analysis")

    if not selected_features:
        st.warning("Please select at least one feature from the sidebar.")
        return  # skips only this tab; the others still render

    st.subheader("🔥 Feature Correlation Matrix")
    corr_matrix = df[selected_features + ['target']].corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
    plt.title("Feature Correlation Matrix")
    st.pyplot(fig)
    plt.close(fig)  # close this exact figure so Streamlit reruns don't leak memory

    st.subheader("📊 Feature Relationships")
    col1, col2 = st.columns(2)
    with col1:
        if len(selected_features) >= 2:
            feature_x = st.selectbox("X-axis Feature", selected_features, key="x_axis")
            feature_y = st.selectbox("Y-axis Feature", selected_features, index=1, key="y_axis")
            fig = px.scatter(
                df, x=feature_x, y=feature_y, color='target',
                title=f"{feature_x} vs {feature_y}",
                color_continuous_scale='viridis',
            )
            st.plotly_chart(fig, use_container_width=True)
    with col2:
        if selected_features:
            selected_feature = st.selectbox("Feature for Distribution", selected_features)
            fig = px.box(
                df, y=selected_feature, x='price_category',
                title=f"{selected_feature} by Price Category",
            )
            st.plotly_chart(fig, use_container_width=True)

    # Geographic analysis (only if coordinate columns are present).
    if 'Longitude' in df.columns and 'Latitude' in df.columns:
        st.subheader("🗺️ Geographic Distribution")
        # Fixed seed so the 5000-point subsample (and hence the map)
        # is stable across Streamlit reruns instead of reshuffling.
        fig = px.scatter_mapbox(
            df.sample(5000, random_state=42),
            lat='Latitude', lon='Longitude',
            color='target', size='target',
            hover_data=['AveRooms', 'AveBedrms', 'Population'],
            color_continuous_scale='viridis',
            mapbox_style='open-street-map',
            title='California Housing Prices by Location',
            height=600,
        )
        st.plotly_chart(fig, use_container_width=True)


def _render_models(df, selected_features):
    """Tab 3: train both models, compare metrics, importances, and fit plots."""
    st.subheader("Machine Learning Models")

    if not selected_features:
        st.warning("Please select features for model training.")
        return  # skips only this tab; the others still render

    with st.spinner("Training models..."):
        results = train_models(df[selected_features], df['target'])

    st.subheader("📊 Model Performance")
    col1, col2 = st.columns(2)
    with col1:
        # Rows = model names, columns = metric names.
        st.dataframe(pd.DataFrame(results['metrics']).T, use_container_width=True)
    with col2:
        models = list(results['metrics'].keys())
        # NOTE: 'RΒ²' is mojibake of 'R²' inherited from the metric keys
        # produced by train_models; kept verbatim so the lookup matches.
        r2_scores = [results['metrics'][model]['RΒ²'] for model in models]
        fig = px.bar(
            x=models, y=r2_scores,
            title="Model Performance (R² Score)",
            labels={'x': 'Model', 'y': 'R² Score'},
        )
        st.plotly_chart(fig, use_container_width=True)

    st.subheader("🎯 Feature Importance (Random Forest)")
    rf_model = results['models']['Random Forest']
    importance_df = pd.DataFrame({
        'Feature': selected_features,
        'Importance': rf_model.feature_importances_,
    }).sort_values('Importance', ascending=False)
    fig = px.bar(
        importance_df, x='Importance', y='Feature', orientation='h',
        title="Feature Importance in Random Forest Model",
    )
    st.plotly_chart(fig, use_container_width=True)

    st.subheader("🎯 Predictions vs Actual Values")
    X_test, y_test = results['test_data']
    col1, col2 = st.columns(2)
    for col, name in zip((col1, col2), ('Linear Regression', 'Random Forest')):
        with col:
            preds = results['predictions'][name]
            fig = px.scatter(
                x=y_test, y=preds,
                title=f"{name}: Predicted vs Actual",
                labels={'x': 'Actual', 'y': 'Predicted'},
            )
            # Dashed identity line: points on it are perfect predictions.
            fig.add_shape(type="line", x0=0, y0=0, x1=5, y1=5,
                          line=dict(dash="dash", color="red"))
            st.plotly_chart(fig, use_container_width=True)


def _render_predictions(df, selected_features):
    """Tab 4: per-feature sliders plus on-demand price predictions."""
    st.subheader("Make House Price Predictions")

    if not selected_features:
        st.warning("Please select features to make predictions.")
        return  # skips only this tab; the others still render

    # train_models is cached, so this reuses the models from the ML tab.
    results = train_models(df[selected_features], df['target'])

    st.write("Adjust the feature values below to predict house prices:")

    input_data = {}
    cols = st.columns(min(3, len(selected_features)))
    for i, feature in enumerate(selected_features):
        with cols[i % len(cols)]:
            # Slider spans the observed range, starting at the mean.
            input_data[feature] = st.slider(
                feature,
                min_value=float(df[feature].min()),
                max_value=float(df[feature].max()),
                value=float(df[feature].mean()),
                key=f"pred_{feature}",
            )

    if st.button("🎯 Predict House Price", type="primary"):
        input_df = pd.DataFrame([input_data])
        col1, col2 = st.columns(2)
        with col1:
            lr_pred = results['models']['Linear Regression'].predict(input_df)[0]
            st.success(f"**Linear Regression Prediction:** \n${lr_pred:.2f} × 100k = ${lr_pred*100:.0f}k")
        with col2:
            rf_pred = results['models']['Random Forest'].predict(input_df)[0]
            st.success(f"**Random Forest Prediction:** \n${rf_pred:.2f} × 100k = ${rf_pred*100:.0f}k")

        # Indentation was stripped from the original source; this summary is
        # assumed to belong inside the button branch — verify against intent.
        st.subheader("📋 Input Summary")
        input_summary = pd.DataFrame([input_data]).T
        input_summary.columns = ['Value']
        st.dataframe(input_summary, use_container_width=True)
# Run the app only when this file is executed directly
# (e.g. via `streamlit run <file>`), not when imported.
if __name__ == "__main__":
    main()