File size: 3,873 Bytes
3e13577
 
 
 
26507b3
3e13577
 
 
 
 
 
26507b3
ea5ef0b
26507b3
 
ea5ef0b
26507b3
ea5ef0b
26507b3
 
ea5ef0b
 
26507b3
d99f5b5
 
ea5ef0b
 
 
26507b3
ea5ef0b
26507b3
 
 
3e13577
ea5ef0b
26507b3
3e13577
26507b3
 
 
ea5ef0b
 
26507b3
ea5ef0b
 
 
 
 
 
26507b3
ea5ef0b
26507b3
 
 
ea5ef0b
 
 
 
 
 
 
 
 
26507b3
 
 
 
 
ea5ef0b
 
 
26507b3
 
 
 
ea5ef0b
 
 
26507b3
 
ea5ef0b
 
 
 
 
 
26507b3
ea5ef0b
 
 
 
26507b3
 
ea5ef0b
 
 
 
26507b3
 
ea5ef0b
26507b3
 
 
ea5ef0b
26507b3
 
ea5ef0b
26507b3
 
 
ea5ef0b
 
26507b3
ea5ef0b
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
import matplotlib.pyplot as plt
import seaborn as sns

# Function Definitions
def analyze_data(data):
    """Render an exploratory summary of ``data`` on the Streamlit page.

    Writes the per-column missing-value counts and the ``describe()``
    summary, then, when the frame has at least one numeric column, a
    correlation heatmap of the numeric columns.

    Args:
        data: The pandas DataFrame to summarise.
    """
    st.write("### Data Analysis")
    st.write("**Missing Values:**")
    st.write(data.isnull().sum())
    st.write("**Statistical Summary:**")
    st.write(data.describe())

    # Correlation matrix
    numeric_data = data.select_dtypes(include=['number'])
    if numeric_data.empty:
        return

    st.write("**Correlation Matrix:**")
    # Draw on an explicit figure instead of pyplot's global state: passing
    # the ``plt`` module to st.pyplot leaves the figure open, so every
    # script rerun would accumulate another one.
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', center=0, ax=ax)
    st.pyplot(fig)
    plt.close(fig)


def prepare_data(data):
    """Split ``data`` into a feature frame and a target series.

    Only numeric columns are used. The last numeric column (in frame order)
    becomes the target and every earlier numeric column becomes a feature.

    Args:
        data: The pandas DataFrame to split.

    Returns:
        A ``(X, y)`` tuple: ``X`` is a DataFrame of the feature columns and
        ``y`` is the Series of the target column.

    Raises:
        ValueError: If ``data`` has fewer than two numeric columns, i.e.
            there is no target or no feature to learn from.
    """
    # 'number' matches the selection analyze_data uses for its correlation
    # matrix and also covers narrower dtypes such as int32 / float32.
    numeric_columns = data.select_dtypes(include='number').columns
    if len(numeric_columns) < 2:
        raise ValueError(
            "Need at least two numeric columns (one or more features plus a "
            f"target); found {len(numeric_columns)}."
        )
    X = data[numeric_columns[:-1]]
    y = data[numeric_columns[-1]]
    return X, y


def preprocess_data(X_train, X_test):
    """Standardise the train and test features with one shared scaler.

    The scaler is fitted on ``X_train`` only and then applied to both
    splits, so the test split is transformed with the training statistics.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.

    Returns:
        A ``(X_train_scaled, X_test_scaled, scaler)`` tuple, where
        ``scaler`` is the fitted ``StandardScaler``.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    train_scaled = fitted_scaler.transform(X_train)
    test_scaled = fitted_scaler.transform(X_test)
    return train_scaled, test_scaled, fitted_scaler


def train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test, feature_names):
    """Fit each regression model, report its metrics and return them.

    A linear regression and a 100-tree random forest are fitted on the
    training split. For each one the train/test RMSE and R² are written to
    the Streamlit page; models that expose ``feature_importances_`` also get
    a feature-importance table and bar chart.

    Args:
        X_train_scaled: Training feature matrix.
        X_test_scaled: Test feature matrix.
        y_train: Training target values.
        y_test: Test target values.
        feature_names: Names of the feature columns, in the column order of
            the feature matrices.

    Returns:
        A dict keyed by model display name. Each value is a dict with the
        keys ``'model'`` (the fitted estimator), ``'train_rmse'``,
        ``'test_rmse'``, ``'train_r2'`` and ``'test_r2'``.
    """
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        train_pred = model.predict(X_train_scaled)
        test_pred = model.predict(X_test_scaled)

        metrics = {
            'model': model,
            'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
            'test_rmse': np.sqrt(mean_squared_error(y_test, test_pred)),
            'train_r2': r2_score(y_train, train_pred),
            'test_r2': r2_score(y_test, test_pred)
        }
        results[name] = metrics

        st.write(f"### {name} Results:")
        st.write(f"**Training RMSE:** {metrics['train_rmse']:.2f}")
        st.write(f"**Test RMSE:** {metrics['test_rmse']:.2f}")
        st.write(f"**Training R²:** {metrics['train_r2']:.3f}")
        st.write(f"**Test R²:** {metrics['test_r2']:.3f}")

        # Key on the capability rather than the display name, so renaming a
        # model or adding another tree-based one keeps the chart working.
        if hasattr(model, 'feature_importances_'):
            _show_feature_importance(model, feature_names)

    return results


def _show_feature_importance(model, feature_names):
    """Render a fitted model's feature importances as a table and bar chart."""
    feature_importance = pd.DataFrame({
        'Feature': feature_names,
        'Importance': model.feature_importances_
    }).sort_values('Importance', ascending=False)
    st.write("**Feature Importance:**")
    st.write(feature_importance)

    # Use an explicit figure and close it after rendering; passing the
    # ``plt`` module to st.pyplot leaves the figure open on every rerun.
    fig, ax = plt.subplots(figsize=(10, 6))
    sns.barplot(x='Importance', y='Feature', data=feature_importance, ax=ax)
    ax.set_title('Feature Importance')
    st.pyplot(fig)
    plt.close(fig)


def main():
    """Build the Streamlit page: upload a CSV, explore it, train the models.

    Nothing beyond the title and the uploader is rendered until a file is
    uploaded. Once one is, the page shows a preview and summary of the data,
    lets the user pick the test-split size, and reports the metrics of each
    trained model. Problems with the uploaded data are reported with
    ``st.error`` instead of an unhandled traceback.
    """
    st.title("Housing Price Prediction")

    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file is None:
        return

    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError, UnicodeDecodeError) as exc:
        st.error(f"Could not read the uploaded file as CSV: {exc}")
        return

    st.write("## Dataset Overview")
    st.write(data.head())

    # Analyze the data
    analyze_data(data)

    # Prepare the data
    try:
        X, y = prepare_data(data)
    except (IndexError, ValueError) as exc:
        st.error(f"Could not derive features and a target from the data: {exc}")
        return
    if X.shape[1] == 0:
        st.error("The data needs at least two numeric columns (features plus a target).")
        return

    # LinearRegression rejects NaN inputs, so train only on complete rows.
    complete_rows = X.notna().all(axis=1) & y.notna()
    if not complete_rows.all():
        dropped = int((~complete_rows).sum())
        st.warning(f"Dropping {dropped} row(s) with missing values before training.")
        X, y = X[complete_rows], y[complete_rows]

    # A train/test split needs at least one row on each side.
    if len(X) < 2:
        st.error("Not enough complete rows to split into training and test sets.")
        return

    # Train-test split
    test_size = st.slider("Test data size:", 0.1, 0.5, 0.2)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)

    # Preprocess the data (the fitted scaler itself is not needed here)
    X_train_scaled, X_test_scaled, _ = preprocess_data(X_train, X_test)

    # Train and evaluate models
    st.write("## Model Training and Evaluation")
    train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test, X_train.columns)


# Run the app: build the page only when this file is executed as the main
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()