# app.py — Streamlit linear-regression demo (upstream commit 988d6d6, verified)
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns
# Page setup: wide layout, browser-tab title.
st.set_page_config(page_title="Explore Linear Regression", layout="wide")
st.title("Linear Regression")

# Intro.
# NOTE: st.markdown renders math via KaTeX only inside $ / $$ delimiters;
# the previous \[ ... \] block was shown as raw text. A raw string is used
# so the \hat macro needs no double-escaping.
st.markdown(r"""
## πŸ“˜ What is Linear Regression?
Linear Regression models the relationship between a continuous outcome and one or more input variables (features).
**Equation:**
$$
\hat{y} = w_1x_1 + w_2x_2 + ... + w_nx_n + b
$$
It tries to find the line (or hyperplane) that best fits the data.
---
""")
# Load dataset from OpenML (Boston housing)
@st.cache_data
def load_data():
    """Fetch the Boston housing dataset from OpenML as a pandas DataFrame.

    Returns:
        DataFrame with 13 feature columns plus the ``MEDV`` target.

    Notes:
        fetch_openml(as_frame=True) delivers ``CHAS`` and ``RAD`` as pandas
        ``category`` columns, which would make StandardScaler.fit_transform
        fail downstream — so all categorical columns are coerced to numeric
        here. Cached by Streamlit to avoid re-downloading on every rerun.
    """
    boston = fetch_openml(name="boston", version=1, as_frame=True)
    df = boston.frame
    # Coerce categorical columns (CHAS, RAD) to plain numeric dtype so the
    # whole frame is scaler-friendly.
    for col in df.select_dtypes(include="category").columns:
        df[col] = pd.to_numeric(df[col])
    return df
# Load (cached) data and show a quick preview of the raw table.
df = load_data()
st.subheader("🏠 Dataset: Boston Housing Prices")
st.markdown("This dataset contains information about houses in Boston suburbs and aims to predict the **median value of owner-occupied homes**.")
preview = df.head()
st.dataframe(preview, use_container_width=True)
# ---- Feature/target split -------------------------------------------------
target_col = "MEDV"
y = df[target_col]
X = df.drop(columns=target_col)

# ---- Standardize features (zero mean, unit variance) ----------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ---- Hold out 20% of the rows for evaluation (fixed seed for stability) ---
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# ---- Fit ordinary least squares and predict on the held-out set -----------
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# ---- Report fit quality ---------------------------------------------------
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
st.success(f"πŸ“ Model Performance: RΒ² = {r2:.2f}, MSE = {mse:.2f}")
# Show learned weights, largest magnitude first (sign indicates direction
# of the feature's effect on MEDV in standardized units).
st.markdown("### πŸ” Coefficients (Feature Importance)")
coef_df = (
    pd.DataFrame({"Feature": X.columns, "Coefficient": model.coef_})
    .sort_values(by="Coefficient", key=abs, ascending=False)
)
st.dataframe(coef_df, use_container_width=True)
# Scatter of held-out predictions against ground truth; points on the
# dashed identity line are perfect predictions.
st.markdown("### πŸ“ˆ Actual vs Predicted Home Prices")
fig1, ax1 = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=y_test, y=y_pred, alpha=0.7, ax=ax1)
lo, hi = y_test.min(), y_test.max()
ax1.plot([lo, hi], [lo, hi], '--r')  # identity (y = x) reference line
ax1.set(
    xlabel="Actual MEDV",
    ylabel="Predicted MEDV",
    title="Actual vs Predicted Home Values",
)
st.pyplot(fig1)
# Histogram (with KDE) of prediction errors; a roughly symmetric,
# zero-centered shape suggests the linear fit is unbiased.
st.markdown("### πŸ”§ Residual Plot (Errors)")
residuals = y_test - y_pred
fig2, ax2 = plt.subplots(figsize=(8, 5))
sns.histplot(residuals, kde=True, color="purple", ax=ax2)
ax2.set(
    title="Distribution of Residuals",
    xlabel="Error (Actual - Predicted)",
)
st.pyplot(fig2)
# Closing notes rendered as one markdown section.
summary_md = """
---
## πŸ“š Key Takeaways
- **Linear Regression** is great for understanding relationships and making simple predictions.
- **Coefficients** show how each feature affects the target.
- **Residuals** help assess how well the model fits the data.
### βœ… Use Linear Regression when:
- The outcome is **continuous**
- There’s a **linear trend**
- You need **interpretability** over complexity
🎯 *Pro Tip:* Try removing or combining features and observe how it affects accuracy and residuals!
"""
st.markdown(summary_md)