import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import plotly.express as px
import plotly.graph_objects as go
# Page-level Streamlit configuration; must be the first st.* call in the script.
st.set_page_config(
    page_title="California Housing Data Explorer",
    page_icon="🏠",  # original file had a mojibake glyph here; restored to the house emoji
    layout="wide",
    initial_sidebar_state="expanded",
)

# Custom CSS hook. The stylesheet body was lost upstream (the triple-quoted
# string is empty), so this call currently injects nothing; kept as the
# single place to add app-wide CSS later.
st.markdown("""
""", unsafe_allow_html=True)
# Load and cache data
@st.cache_data
def load_housing_data():
    """Fetch the California housing dataset and return (DataFrame, raw bunch).

    The frame carries the eight numeric features, a 'target' column
    (median house value in units of $100k), and a 'price_category'
    categorical derived by binning the target into four labelled ranges.
    Cached by Streamlit so the download/parse happens once per session.
    """
    bunch = fetch_california_housing()
    frame = pd.DataFrame(bunch.data, columns=bunch.feature_names)
    frame['target'] = bunch.target
    frame['price_category'] = pd.cut(
        frame['target'],
        bins=[0, 1.5, 3.0, 5.0, float('inf')],
        labels=['Low', 'Medium', 'High', 'Very High'],
    )
    return frame, bunch
# Train models
@st.cache_data
def train_models(X, y):
    """Fit a LinearRegression and a RandomForestRegressor on an 80/20 split.

    Returns a dict with the fitted models, their held-out predictions,
    per-model MSE/R² metrics, and the (X_test, y_test) pair. Cached by
    Streamlit, so repeated calls with identical inputs are free.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    estimators = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    }

    fitted, preds, metrics = {}, {}, {}
    for label, estimator in estimators.items():
        estimator.fit(X_train, y_train)
        y_hat = estimator.predict(X_test)
        fitted[label] = estimator
        preds[label] = y_hat
        # NOTE: the 'RΒ²' key is mojibake of 'R²' inherited from the original
        # file; downstream code indexes it verbatim, so it is kept unchanged.
        metrics[label] = {
            'MSE': mean_squared_error(y_test, y_hat),
            'RΒ²': r2_score(y_test, y_hat),
        }

    return {
        'models': fitted,
        'predictions': preds,
        'metrics': metrics,
        'test_data': (X_test, y_test),
    }
def main():
    """Top-level Streamlit entry point: sidebar controls plus four content tabs.

    Fixes vs. the original:
    - The header `st.markdown('...')` spanned multiple lines inside a
      single-quoted string (a SyntaxError); reconstructed as valid HTML.
    - `return` statements inside `with tabN:` bodies aborted main() entirely,
      leaving later tabs blank whenever no features were selected; each tab
      is now a helper, so `return` skips only that tab.
    - Mojibake in display-only strings (emoji, ×, ²) restored.
    """
    # Header (display-only; the CSS class the original referenced is gone).
    st.markdown(
        '<h1 style="text-align: center;">🏠 California Housing Data Explorer</h1>',
        unsafe_allow_html=True,
    )

    # Load data; the app cannot do anything useful without it.
    try:
        df, housing_info = load_housing_data()
    except Exception as e:
        st.error(f"Error loading data: {e}")
        return

    # --- Sidebar: dataset summary + feature picker --------------------------
    st.sidebar.title("🔧 Controls")
    st.sidebar.markdown("---")
    st.sidebar.subheader("📊 Dataset Info")
    st.sidebar.info(f"""
    **Samples:** {len(df):,}
    **Features:** {len(df.columns) - 2}
    **Target:** House Value (×$100k)
    """)

    # Everything except the target and its derived category is a feature.
    feature_cols = [col for col in df.columns if col not in ['target', 'price_category']]
    selected_features = st.sidebar.multiselect(
        "Select Features for Analysis",
        feature_cols,
        default=feature_cols[:4],
    )

    tab1, tab2, tab3, tab4 = st.tabs(
        ["📊 Overview", "📈 Exploratory Analysis", "🤖 ML Models", "🎯 Predictions"]
    )
    with tab1:
        _render_overview(df, feature_cols)
    with tab2:
        _render_eda(df, selected_features)
    with tab3:
        _render_models(df, selected_features)
    with tab4:
        _render_predictions(df, selected_features)


def _render_overview(df, feature_cols):
    """Tab 1: headline metrics, a raw-data sample, and summary statistics."""
    st.subheader("Dataset Overview")

    col1, col2, col3, col4 = st.columns(4)
    with col1:
        st.metric("Total Houses", f"{len(df):,}")
    with col2:
        st.metric("Avg House Value", f"${df['target'].mean():.2f}×100k")
    with col3:
        st.metric("Max House Value", f"${df['target'].max():.2f}×100k")
    with col4:
        st.metric("Features", len(feature_cols))

    st.markdown("---")

    col1, col2 = st.columns([2, 1])
    with col1:
        st.subheader("📋 Data Sample")
        st.dataframe(df.head(10), use_container_width=True)
    with col2:
        st.subheader("📊 Price Distribution")
        fig = px.histogram(
            df, x='target', nbins=50,
            title="House Value Distribution",
            labels={'target': 'House Value (×$100k)', 'count': 'Frequency'},
        )
        fig.update_layout(height=400)
        st.plotly_chart(fig, use_container_width=True)

    st.subheader("📈 Statistical Summary")
    st.dataframe(df[feature_cols + ['target']].describe(), use_container_width=True)


def _render_eda(df, selected_features):
    """Tab 2: correlation heatmap, pairwise plots, and the geographic map."""
    st.subheader("Exploratory Data Analysis")

    if not selected_features:
        st.warning("Please select at least one feature from the sidebar.")
        return  # skips only this tab; the others still render

    st.subheader("🔥 Feature Correlation Matrix")
    corr_matrix = df[selected_features + ['target']].corr()
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, ax=ax)
    plt.title("Feature Correlation Matrix")
    st.pyplot(fig)
    plt.close(fig)  # close this exact figure so Streamlit reruns don't leak memory

    st.subheader("📊 Feature Relationships")
    col1, col2 = st.columns(2)
    with col1:
        if len(selected_features) >= 2:
            feature_x = st.selectbox("X-axis Feature", selected_features, key="x_axis")
            feature_y = st.selectbox("Y-axis Feature", selected_features, index=1, key="y_axis")
            fig = px.scatter(
                df, x=feature_x, y=feature_y, color='target',
                title=f"{feature_x} vs {feature_y}",
                color_continuous_scale='viridis',
            )
            st.plotly_chart(fig, use_container_width=True)
    with col2:
        if selected_features:
            selected_feature = st.selectbox("Feature for Distribution", selected_features)
            fig = px.box(
                df, y=selected_feature, x='price_category',
                title=f"{selected_feature} by Price Category",
            )
            st.plotly_chart(fig, use_container_width=True)

    # Geographic analysis (only if coordinate columns are present).
    if 'Longitude' in df.columns and 'Latitude' in df.columns:
        st.subheader("🗺️ Geographic Distribution")
        # Fixed seed so the 5000-point subsample (and hence the map)
        # is stable across Streamlit reruns instead of reshuffling.
        fig = px.scatter_mapbox(
            df.sample(5000, random_state=42),
            lat='Latitude', lon='Longitude',
            color='target', size='target',
            hover_data=['AveRooms', 'AveBedrms', 'Population'],
            color_continuous_scale='viridis',
            mapbox_style='open-street-map',
            title='California Housing Prices by Location',
            height=600,
        )
        st.plotly_chart(fig, use_container_width=True)


def _render_models(df, selected_features):
    """Tab 3: train both models, compare metrics, importances, and fit plots."""
    st.subheader("Machine Learning Models")

    if not selected_features:
        st.warning("Please select features for model training.")
        return  # skips only this tab; the others still render

    with st.spinner("Training models..."):
        results = train_models(df[selected_features], df['target'])

    st.subheader("📊 Model Performance")
    col1, col2 = st.columns(2)
    with col1:
        # Rows = model names, columns = metric names.
        st.dataframe(pd.DataFrame(results['metrics']).T, use_container_width=True)
    with col2:
        models = list(results['metrics'].keys())
        # NOTE: 'RΒ²' is mojibake of 'R²' inherited from the metric keys
        # produced by train_models; kept verbatim so the lookup matches.
        r2_scores = [results['metrics'][model]['RΒ²'] for model in models]
        fig = px.bar(
            x=models, y=r2_scores,
            title="Model Performance (R² Score)",
            labels={'x': 'Model', 'y': 'R² Score'},
        )
        st.plotly_chart(fig, use_container_width=True)

    st.subheader("🎯 Feature Importance (Random Forest)")
    rf_model = results['models']['Random Forest']
    importance_df = pd.DataFrame({
        'Feature': selected_features,
        'Importance': rf_model.feature_importances_,
    }).sort_values('Importance', ascending=False)
    fig = px.bar(
        importance_df, x='Importance', y='Feature', orientation='h',
        title="Feature Importance in Random Forest Model",
    )
    st.plotly_chart(fig, use_container_width=True)

    st.subheader("🎯 Predictions vs Actual Values")
    X_test, y_test = results['test_data']
    col1, col2 = st.columns(2)
    for col, name in zip((col1, col2), ('Linear Regression', 'Random Forest')):
        with col:
            preds = results['predictions'][name]
            fig = px.scatter(
                x=y_test, y=preds,
                title=f"{name}: Predicted vs Actual",
                labels={'x': 'Actual', 'y': 'Predicted'},
            )
            # Dashed identity line: points on it are perfect predictions.
            fig.add_shape(type="line", x0=0, y0=0, x1=5, y1=5,
                          line=dict(dash="dash", color="red"))
            st.plotly_chart(fig, use_container_width=True)


def _render_predictions(df, selected_features):
    """Tab 4: per-feature sliders plus on-demand price predictions."""
    st.subheader("Make House Price Predictions")

    if not selected_features:
        st.warning("Please select features to make predictions.")
        return  # skips only this tab; the others still render

    # train_models is cached, so this reuses the models from the ML tab.
    results = train_models(df[selected_features], df['target'])

    st.write("Adjust the feature values below to predict house prices:")

    input_data = {}
    cols = st.columns(min(3, len(selected_features)))
    for i, feature in enumerate(selected_features):
        with cols[i % len(cols)]:
            # Slider spans the observed range, starting at the mean.
            input_data[feature] = st.slider(
                feature,
                min_value=float(df[feature].min()),
                max_value=float(df[feature].max()),
                value=float(df[feature].mean()),
                key=f"pred_{feature}",
            )

    if st.button("🎯 Predict House Price", type="primary"):
        input_df = pd.DataFrame([input_data])
        col1, col2 = st.columns(2)
        with col1:
            lr_pred = results['models']['Linear Regression'].predict(input_df)[0]
            st.success(f"**Linear Regression Prediction:** \n${lr_pred:.2f} × 100k = ${lr_pred*100:.0f}k")
        with col2:
            rf_pred = results['models']['Random Forest'].predict(input_df)[0]
            st.success(f"**Random Forest Prediction:** \n${rf_pred:.2f} × 100k = ${rf_pred*100:.0f}k")

        # Indentation was stripped from the original source; this summary is
        # assumed to belong inside the button branch — verify against intent.
        st.subheader("📋 Input Summary")
        input_summary = pd.DataFrame([input_data]).T
        input_summary.columns = ['Value']
        st.dataframe(input_summary, use_container_width=True)
# Run the app only when this file is executed directly
# (e.g. via `streamlit run <file>`), not when imported.
if __name__ == "__main__":
    main()