import streamlit as st
import pandas as pd
import numpy as np
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Page setup
st.set_page_config(page_title="Explore Linear Regression", layout="wide")
st.title("Linear Regression")

# Intro.
# Use $$ ... $$ delimiters (rendered via KaTeX by st.markdown) instead of
# \[ ... \], which Streamlit displays as raw text; the raw string keeps
# the backslash in \hat intact.
st.markdown(r"""
## 📘 What is Linear Regression?

Linear Regression models the relationship between a continuous outcome and one or more input variables (features).

**Equation:**

$$
\hat{y} = w_1x_1 + w_2x_2 + ... + w_nx_n + b
$$

It tries to find the line (or hyperplane) that best fits the data.

---
""")


# Load dataset from OpenML (Boston housing)
@st.cache_data
def load_data():
    """Fetch the Boston housing dataset from OpenML as an all-numeric DataFrame.

    Returns:
        pd.DataFrame: features plus the MEDV target column, all numeric.
    """
    boston = fetch_openml(name="boston", version=1, as_frame=True)
    # NOTE(review): OpenML serves CHAS and RAD with a categorical dtype;
    # coerce every column to numeric so StandardScaler below receives a
    # purely numeric matrix.
    return boston.frame.apply(pd.to_numeric)


df = load_data()

st.subheader("🏠 Dataset: Boston Housing Prices")
st.markdown("This dataset contains information about houses in Boston suburbs and aims to predict the **median value of owner-occupied homes**.")
st.dataframe(df.head(), use_container_width=True)

# Feature selection: MEDV is the regression target, every other column a feature.
target_col = "MEDV"
X = df.drop(columns=target_col)
y = df[target_col]

# Feature scaling (zero mean / unit variance) so coefficient magnitudes are comparable.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Train-test split (fixed seed keeps metrics stable across Streamlit reruns)
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

# Model training
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Evaluation on the held-out test set
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
st.success(f"📏 Model Performance: R² = {r2:.2f}, MSE = {mse:.2f}")

# Feature coefficients, sorted by absolute effect size (largest first)
st.markdown("### 🔍 Coefficients (Feature Importance)")
coef_df = pd.DataFrame({
    "Feature": X.columns,
    "Coefficient": model.coef_,
}).sort_values(by="Coefficient", key=abs, ascending=False)
st.dataframe(coef_df, use_container_width=True)

# Actual vs Predicted Plot — points on the dashed y = x line are perfect predictions
st.markdown("### 📈 Actual vs Predicted Home Prices")
fig1, ax1 = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=y_test, y=y_pred, ax=ax1, alpha=0.7)
ax1.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], '--r')
ax1.set_xlabel("Actual MEDV")
ax1.set_ylabel("Predicted MEDV")
ax1.set_title("Actual vs Predicted Home Values")
st.pyplot(fig1)

# Residuals Plot — a roughly symmetric, zero-centered histogram suggests a good linear fit
st.markdown("### 🔧 Residual Plot (Errors)")
residuals = y_test - y_pred
fig2, ax2 = plt.subplots(figsize=(8, 5))
sns.histplot(residuals, kde=True, ax=ax2, color="purple")
ax2.set_title("Distribution of Residuals")
ax2.set_xlabel("Error (Actual - Predicted)")
st.pyplot(fig2)

# Summary
st.markdown("""
---
## 📚 Key Takeaways

- **Linear Regression** is great for understanding relationships and making simple predictions.
- **Coefficients** show how each feature affects the target.
- **Residuals** help assess how well the model fits the data.

### ✅ Use Linear Regression when:
- The outcome is **continuous**
- There’s a **linear trend**
- You need **interpretability** over complexity

🎯 *Pro Tip:* Try removing or combining features and observe how it affects accuracy and residuals!
""")