V8055 committed on
Commit
3e13577
·
verified ·
1 Parent(s): b1a1739

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -0
app.py ADDED
@@ -0,0 +1,152 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import streamlit as st
2
+ import pandas as pd
3
+ import numpy as np
4
+ from sklearn.model_selection import train_test_split
5
+ from sklearn.preprocessing import StandardScaler
6
+ from sklearn.linear_model import LinearRegression
7
+ from sklearn.ensemble import RandomForestRegressor
8
+ from sklearn.metrics import mean_squared_error, r2_score
9
+ import matplotlib.pyplot as plt
10
+ import seaborn as sns
11
+
12
+ def analyze_data(data):
13
+ """
14
+ Perform initial data analysis
15
+ """
16
+ # Check for missing values
17
+ print("\nMissing values:")
18
+ print(data.isnull().sum())
19
+
20
+ # Display statistical summary
21
+ print("\nStatistical summary:")
22
+ print(data.describe())
23
+
24
+ # Visualize distribution of target variable
25
+ numeric_data = data.select_dtypes(include=['number'])
26
+
27
+ # Create correlation matrix
28
+ plt.figure(figsize=(10, 8))
29
+ sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', center=0)
30
+ plt.title('Correlation Matrix')
31
+ plt.tight_layout()
32
+ plt.show()
33
+
34
def prepare_data(data, target_column=None):
    """
    Split a DataFrame into a feature matrix X and a target vector y.

    Parameters
    ----------
    data : pandas.DataFrame
        Input dataset; only numeric columns are used.
    target_column : str, optional
        Name of the target column. When omitted, the last numeric column is
        used as the target (the original behavior).

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.Series)
        ``(X, y)`` — features and target.

    Raises
    ------
    ValueError
        If fewer than two numeric columns are available.
    """
    # 'number' matches every numeric dtype (int32, float32, ...), not just
    # int64/float64, and is consistent with the selection in analyze_data.
    numeric_columns = data.select_dtypes(include='number').columns

    if len(numeric_columns) < 2:
        # Fail loudly instead of producing an empty X or a KeyError later.
        raise ValueError(
            "Need at least two numeric columns (features + target), "
            f"got {len(numeric_columns)}"
        )

    if target_column is None:
        # Assume the last numeric column is the price/target variable.
        target_column = numeric_columns[-1]

    feature_columns = [col for col in numeric_columns if col != target_column]
    X = data[feature_columns]
    y = data[target_column]

    return X, y
47
+
48
+
49
+
50
def preprocess_data(X_train, X_test):
    """
    Standardize features to zero mean / unit variance.

    The scaler is fitted on the training split only and then applied to the
    test split, so no information leaks from test into training.

    Returns
    -------
    tuple
        ``(X_train_scaled, X_test_scaled, scaler)`` — the fitted scaler is
        returned so callers can transform future data consistently.
    """
    feature_scaler = StandardScaler()
    train_scaled = feature_scaler.fit_transform(X_train)
    test_scaled = feature_scaler.transform(X_test)
    return train_scaled, test_scaled, feature_scaler
59
+
60
def train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test, X_train):
    """
    Fit a Linear Regression and a Random Forest, then score both splits.

    Parameters
    ----------
    X_train_scaled, X_test_scaled : array-like
        Scaled feature matrices for the training and test splits.
    y_train, y_test : array-like
        Target values for the two splits.
    X_train : pandas.DataFrame
        Unscaled training features; only its column names are read, for the
        Random Forest feature-importance report.

    Returns
    -------
    dict
        Maps model name to ``{'model', 'train_rmse', 'test_rmse',
        'train_r2', 'test_r2'}``.
    """
    def _split_metrics(y_true, y_pred):
        # RMSE and R² for a single split.
        return np.sqrt(mean_squared_error(y_true, y_pred)), r2_score(y_true, y_pred)

    candidates = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    }

    results = {}
    for model_name, estimator in candidates.items():
        estimator.fit(X_train_scaled, y_train)

        train_pred = estimator.predict(X_train_scaled)
        test_pred = estimator.predict(X_test_scaled)

        train_rmse, train_r2 = _split_metrics(y_train, train_pred)
        test_rmse, test_r2 = _split_metrics(y_test, test_pred)
        results[model_name] = {
            'model': estimator,
            'train_rmse': train_rmse,
            'test_rmse': test_rmse,
            'train_r2': train_r2,
            'test_r2': test_r2,
        }

        # Only the forest exposes feature_importances_; report and plot them.
        if model_name == 'Random Forest':
            feature_importance = pd.DataFrame({
                'feature': X_train.columns,
                'importance': estimator.feature_importances_,
            }).sort_values('importance', ascending=False)
            print(f"\nFeature Importance:")
            print(feature_importance)

            plt.figure(figsize=(10, 6))
            sns.barplot(x='importance', y='feature', data=feature_importance)
            plt.title('Feature Importance (Random Forest)')
            plt.tight_layout()
            plt.show()

    return results
105
+
106
+
107
def plot_predictions(model, X_test_scaled, y_test, title):
    """
    Scatter actual vs. predicted target values for *model* on the test set.

    A dashed red y = x reference line marks where perfect predictions
    would fall.
    """
    y_pred = model.predict(X_test_scaled)

    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, y_pred, alpha=0.5)
    lo, hi = y_test.min(), y_test.max()
    plt.plot([lo, hi], [lo, hi], 'r--', lw=2)
    plt.xlabel('Actual Prices')
    plt.ylabel('Predicted Prices')
    plt.title(title)
    plt.tight_layout()
    plt.show()
121
+
122
def main(data):
    """
    Run the full pipeline on *data*: explore, split, scale, train, report.

    Prints RMSE/R² for each model on both splits (RMSE formatted as a dollar
    amount) and plots predictions vs. actuals, then returns the results dict
    produced by train_and_evaluate_models.
    """
    analyze_data(data)

    features, target = prepare_data(data)

    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    X_train_scaled, X_test_scaled, scaler = preprocess_data(X_train, X_test)

    results = train_and_evaluate_models(
        X_train_scaled, X_test_scaled, y_train, y_test, X_train
    )

    for name, metrics in results.items():
        print(f"\n{name} Results:")
        print(f"Training RMSE: ${metrics['train_rmse']:.2f}")
        print(f"Test RMSE: ${metrics['test_rmse']:.2f}")
        print(f"Training R²: {metrics['train_r2']:.3f}")
        print(f"Test R²: {metrics['test_r2']:.3f}")

        plot_predictions(metrics['model'], X_test_scaled, y_test, f"{name} Predictions vs Actual Values")

    return results
150
+
151
# Run the analysis and modeling.
# NOTE(review): the original line was `results = main(data)` with `data`
# never defined anywhere in the file, so importing or running app.py raised
# NameError immediately. Streamlit is imported at the top but was unused;
# load the dataset explicitly through a Streamlit upload widget and guard
# the entry point so the module stays importable.
if __name__ == "__main__":
    st.title("Regression Analysis")
    uploaded_file = st.file_uploader("Upload a CSV dataset", type="csv")
    if uploaded_file is not None:
        data = pd.read_csv(uploaded_file)
        results = main(data)