|
|
import streamlit as st |
|
|
import pandas as pd |
|
|
import numpy as np |
|
|
from sklearn.model_selection import train_test_split |
|
|
from sklearn.preprocessing import StandardScaler |
|
|
from sklearn.linear_model import LinearRegression |
|
|
from sklearn.ensemble import RandomForestRegressor |
|
|
from sklearn.metrics import mean_squared_error, r2_score |
|
|
import matplotlib.pyplot as plt |
|
|
import seaborn as sns |
|
|
|
|
|
|
|
|
def analyze_data(data):
    """Render an exploratory analysis of *data* in the Streamlit app.

    Shows per-column missing-value counts, a statistical summary, and —
    when numeric columns exist — a correlation heatmap.

    Args:
        data: pandas DataFrame (the uploaded dataset).
    """
    st.write("### Data Analysis")
    st.write("**Missing Values:**")
    st.write(data.isnull().sum())
    st.write("**Statistical Summary:**")
    st.write(data.describe())

    numeric_data = data.select_dtypes(include=['number'])
    if not numeric_data.empty:
        st.write("**Correlation Matrix:**")
        # Use an explicit Figure: passing the global pyplot state to
        # st.pyplot() is deprecated, and closing the figure afterwards
        # prevents figures from accumulating across Streamlit reruns.
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', center=0, ax=ax)
        st.pyplot(fig)
        plt.close(fig)
|
|
|
|
|
|
|
|
def prepare_data(data):
    """Split *data* into a feature matrix and a target column.

    Every numeric column except the last (in frame order) becomes a
    feature; the last numeric column is the prediction target.

    Args:
        data: pandas DataFrame.

    Returns:
        Tuple ``(X, y)`` where ``X`` is a DataFrame of feature columns
        and ``y`` is the target Series.

    Raises:
        ValueError: if *data* has fewer than two numeric columns, so no
            feature/target split is possible.
    """
    # select_dtypes('number') also matches int32/float32 etc., which the
    # previous ['int64', 'float64'] filter silently dropped; it matches the
    # filter analyze_data() already uses.
    numeric_columns = data.select_dtypes(include=['number']).columns
    if len(numeric_columns) < 2:
        raise ValueError(
            "Need at least two numeric columns (features + target); "
            f"found {len(numeric_columns)}."
        )
    X = data[numeric_columns[:-1]]
    y = data[numeric_columns[-1]]
    return X, y
|
|
|
|
|
|
|
|
def preprocess_data(X_train, X_test):
    """Standardize features, fitting the scaler on the training split only.

    Fitting on the training data alone keeps test-split statistics from
    leaking into the transformation.

    Args:
        X_train: training feature matrix.
        X_test: test feature matrix.

    Returns:
        Tuple of (scaled training features, scaled test features,
        fitted StandardScaler).
    """
    feature_scaler = StandardScaler()
    train_scaled = feature_scaler.fit_transform(X_train)
    test_scaled = feature_scaler.transform(X_test)
    return train_scaled, test_scaled, feature_scaler
|
|
|
|
|
|
|
|
def train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test, feature_names):
    """Train baseline regressors and report their metrics in the app.

    Fits a Linear Regression and a Random Forest on the scaled training
    data, writes train/test RMSE and R² for each, and for the Random
    Forest additionally shows a feature-importance table and bar chart.

    Args:
        X_train_scaled: scaled training feature matrix.
        X_test_scaled: scaled test feature matrix.
        y_train: training target values.
        y_test: test target values.
        feature_names: iterable of feature column names (used for the
            Random Forest importance display).

    Returns:
        Dict mapping model name to a dict with keys ``'model'``,
        ``'train_rmse'``, ``'test_rmse'``, ``'train_r2'``, ``'test_r2'``.
    """
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        train_pred = model.predict(X_train_scaled)
        test_pred = model.predict(X_test_scaled)

        results[name] = {
            'model': model,
            'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
            'test_rmse': np.sqrt(mean_squared_error(y_test, test_pred)),
            'train_r2': r2_score(y_train, train_pred),
            'test_r2': r2_score(y_test, test_pred),
        }

        st.write(f"### {name} Results:")
        st.write(f"**Training RMSE:** {results[name]['train_rmse']:.2f}")
        st.write(f"**Test RMSE:** {results[name]['test_rmse']:.2f}")
        st.write(f"**Training R²:** {results[name]['train_r2']:.3f}")
        st.write(f"**Test R²:** {results[name]['test_r2']:.3f}")

        if name == 'Random Forest':
            feature_importance = pd.DataFrame({
                'Feature': feature_names,
                'Importance': model.feature_importances_
            }).sort_values('Importance', ascending=False)
            st.write("**Feature Importance:**")
            st.write(feature_importance)

            # Use an explicit Figure: passing the global pyplot state to
            # st.pyplot() is deprecated, and closing the figure afterwards
            # prevents figures from accumulating across Streamlit reruns.
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.barplot(x='Importance', y='Feature', data=feature_importance, ax=ax)
            ax.set_title('Feature Importance')
            st.pyplot(fig)
            plt.close(fig)

    return results
|
|
|
|
|
|
|
|
def main():
    """Streamlit entry point: upload a CSV, explore it, and train models.

    Flow: file upload → dataset preview → exploratory analysis →
    feature/target split → train/test split (user-chosen test size) →
    scaling → model training and evaluation. Bad uploads and datasets
    without enough numeric columns are reported via st.error instead of
    surfacing a raw traceback in the UI.
    """
    st.title("Housing Price Prediction")

    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if not uploaded_file:
        # Nothing to do until the user provides a file.
        return

    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.ParserError, pd.errors.EmptyDataError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded file: {exc}")
        return

    st.write("## Dataset Overview")
    st.write(data.head())

    analyze_data(data)

    try:
        X, y = prepare_data(data)
        if X.shape[1] == 0:
            # One numeric column yields a target but no features.
            raise ValueError("no feature columns")
    except (IndexError, ValueError):
        # IndexError: no numeric columns at all; ValueError: too few.
        st.error("The dataset needs at least two numeric columns (features and a target).")
        return

    test_size = st.slider("Test data size:", 0.1, 0.5, 0.2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    X_train_scaled, X_test_scaled, scaler = preprocess_data(X_train, X_test)

    st.write("## Model Training and Evaluation")
    train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test, X_train.columns)
|
|
|
|
|
|
|
|
|
|
|
# Run the app only when executed as a script (e.g. `streamlit run <file>`),
# not when this module is imported.
if __name__ == "__main__":


    main()
|
|
|
|
|
|