"""Streamlit app: upload a housing CSV, explore it, and compare regression models."""

import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns


def analyze_data(data):
    """Render missing-value counts, summary statistics, and a correlation heatmap.

    Args:
        data: pandas DataFrame loaded from the uploaded CSV.
    """
    st.write("### Data Analysis")
    st.write("**Missing Values:**")
    st.write(data.isnull().sum())
    st.write("**Statistical Summary:**")
    st.write(data.describe())

    # Correlation matrix (numeric columns only; skip if there are none).
    numeric_data = data.select_dtypes(include=['number'])
    if not numeric_data.empty:
        st.write("**Correlation Matrix:**")
        # Use an explicit Figure: passing the global pyplot state to
        # st.pyplot is deprecated and leaks figures across Streamlit reruns.
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', center=0, ax=ax)
        st.pyplot(fig)
        plt.close(fig)


def prepare_data(data):
    """Split *data* into features X (all numeric columns but the last) and target y.

    Args:
        data: pandas DataFrame; the LAST numeric column is treated as the target.

    Returns:
        (X, y) tuple of DataFrame and Series.

    Raises:
        ValueError: if fewer than two numeric columns exist, since no
            feature/target split is possible then.
    """
    # 'number' also matches int32/float32 etc., which the previous
    # ['int64', 'float64'] filter silently dropped; it also keeps this
    # function consistent with analyze_data's column selection.
    numeric_columns = data.select_dtypes(include=['number']).columns
    if len(numeric_columns) < 2:
        raise ValueError(
            "Dataset needs at least two numeric columns (features + target)."
        )
    X = data[numeric_columns[:-1]]
    y = data[numeric_columns[-1]]
    return X, y


def preprocess_data(X_train, X_test):
    """Standardize features, fitting the scaler on training data only (no leakage).

    Returns:
        (X_train_scaled, X_test_scaled, fitted_scaler).
    """
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)
    return X_train_scaled, X_test_scaled, scaler


def train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test, feature_names):
    """Fit baseline regressors, report RMSE/R², and plot RF feature importance.

    Args:
        X_train_scaled, X_test_scaled: scaled feature arrays.
        y_train, y_test: target values.
        feature_names: column labels matching the feature arrays.

    Returns:
        Dict keyed by model name, each entry holding the fitted model and
        its train/test RMSE and R² scores.
    """
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    }
    results = {}
    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        train_pred = model.predict(X_train_scaled)
        test_pred = model.predict(X_test_scaled)
        results[name] = {
            'model': model,
            'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
            'test_rmse': np.sqrt(mean_squared_error(y_test, test_pred)),
            'train_r2': r2_score(y_train, train_pred),
            'test_r2': r2_score(y_test, test_pred),
        }
        st.write(f"### {name} Results:")
        st.write(f"**Training RMSE:** {results[name]['train_rmse']:.2f}")
        st.write(f"**Test RMSE:** {results[name]['test_rmse']:.2f}")
        st.write(f"**Training R²:** {results[name]['train_r2']:.3f}")
        st.write(f"**Test R²:** {results[name]['test_r2']:.3f}")

        # Only tree ensembles expose feature_importances_.
        if name == 'Random Forest':
            feature_importance = pd.DataFrame({
                'Feature': feature_names,
                'Importance': model.feature_importances_,
            }).sort_values('Importance', ascending=False)
            st.write("**Feature Importance:**")
            st.write(feature_importance)
            # Explicit Figure here too (same deprecation/leak fix as above).
            fig, ax = plt.subplots(figsize=(10, 6))
            sns.barplot(x='Importance', y='Feature', data=feature_importance, ax=ax)
            ax.set_title('Feature Importance')
            st.pyplot(fig)
            plt.close(fig)
    return results


def main():
    """App entry point: upload CSV, explore, split, scale, train, evaluate."""
    st.title("Housing Price Prediction")

    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file:
        data = pd.read_csv(uploaded_file)
        st.write("## Dataset Overview")
        st.write(data.head())

        # Analyze the data
        analyze_data(data)

        # Prepare the data; show a friendly message instead of a traceback
        # when the CSV cannot supply a feature/target split.
        try:
            X, y = prepare_data(data)
        except ValueError as exc:
            st.error(str(exc))
            return

        # Train-test split (user-controlled holdout fraction).
        test_size = st.slider("Test data size:", 0.1, 0.5, 0.2)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        # Preprocess the data
        X_train_scaled, X_test_scaled, scaler = preprocess_data(X_train, X_test)

        # Train and evaluate models
        st.write("## Model Training and Evaluation")
        train_and_evaluate_models(
            X_train_scaled, X_test_scaled, y_train, y_test, X_train.columns
        )


# Run the app
if __name__ == "__main__":
    main()