import streamlit as st
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Page setup
st.set_page_config(page_title="Explore Linear Regression", layout="wide")
st.title("Linear Regression")

# Intro.
# Use $$ ... $$ delimiters (rendered via KaTeX by st.markdown) instead of
# \[ ... \], which Streamlit displays as raw text; the raw string keeps
# the backslash in \hat intact.
st.markdown(r"""
## 📘 What is Linear Regression?

Linear Regression models the relationship between a continuous outcome and one or more input variables (features).

**Equation:**

$$
\hat{y} = w_1x_1 + w_2x_2 + ... + w_nx_n + b
$$

It tries to find the line (or hyperplane) that best fits the data.

---
""")


# Load dataset from OpenML (Boston housing)
@st.cache_data
def load_data():
    """Fetch the Boston housing dataset from OpenML as an all-numeric DataFrame.

    Returns:
        pd.DataFrame: features plus the MEDV target column, all numeric.
    """
    boston = fetch_openml(name="boston", version=1, as_frame=True)
    # NOTE(review): OpenML serves CHAS and RAD with a categorical dtype;
    # coerce every column to numeric so StandardScaler below receives a
    # purely numeric matrix.
    return boston.frame.apply(pd.to_numeric)


df = load_data()

st.subheader("🏠 Dataset: Boston Housing Prices")
st.markdown("This dataset contains information about houses in Boston suburbs and aims to predict the **median value of owner-occupied homes**.")
st.dataframe(df.head(), use_container_width=True)

# Feature selection: MEDV is the regression target, every other column a feature.
target_col = "MEDV"
X = df.drop(columns=target_col)
y = df[target_col]

# Feature scaling (zero mean / unit variance) so coefficient magnitudes are comparable.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split (fixed seed keeps metrics stable across Streamlit reruns)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Model training
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation on the held-out test set
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
st.success(f"📏 Model Performance: R² = {r2:.2f}, MSE = {mse:.2f}")

# Feature coefficients, sorted by absolute effect size (largest first)
st.markdown("### 🔍 Coefficients (Feature Importance)")
coef_df = pd.DataFrame({
    "Feature": X.columns,
    "Coefficient": model.coef_,
}).sort_values(by="Coefficient", key=abs, ascending=False)
st.dataframe(coef_df, use_container_width=True)

# Actual vs Predicted Plot — points on the dashed y = x line are perfect predictions
st.markdown("### 📈 Actual vs Predicted Home Prices")
fig1, ax1 = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=y_test, y=y_pred, ax=ax1, alpha=0.7)
ax1.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--r')
ax1.set_xlabel("Actual MEDV")
ax1.set_ylabel("Predicted MEDV")
ax1.set_title("Actual vs Predicted Home Values")
st.pyplot(fig1)

# Residuals Plot — a roughly symmetric, zero-centered histogram suggests a good linear fit
st.markdown("### 🔧 Residual Plot (Errors)")
residuals = y_test - y_pred
fig2, ax2 = plt.subplots(figsize=(8, 5))
sns.histplot(residuals, kde=True, ax=ax2, color="purple")
ax2.set_title("Distribution of Residuals")
ax2.set_xlabel("Error (Actual - Predicted)")
st.pyplot(fig2)

# Summary
st.markdown("""
---
## 📚 Key Takeaways

- **Linear Regression** is great for understanding relationships and making simple predictions.
- **Coefficients** show how each feature affects the target.
- **Residuals** help assess how well the model fits the data.

### ✅ Use Linear Regression when:
- The outcome is **continuous**
- There’s a **linear trend**
- You need **interpretability** over complexity

🎯 *Pro Tip:* Try removing or combining features and observe how it affects accuracy and residuals!
""")