V8055 committed on
Commit
26507b3
·
verified ·
1 Parent(s): ea5ef0b

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +46 -96
app.py CHANGED
@@ -2,82 +2,57 @@ import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  from sklearn.model_selection import train_test_split
5
- from sklearn.preprocessing import StandardScaler, LabelEncoder
6
  from sklearn.linear_model import LinearRegression
7
  from sklearn.ensemble import RandomForestRegressor
8
  from sklearn.metrics import mean_squared_error, r2_score
9
  import matplotlib.pyplot as plt
10
  import seaborn as sns
11
 
12
-
13
  def analyze_data(data):
14
- """
15
- Perform initial data analysis
16
- """
17
- # Check for missing values
18
- st.write("Missing values:")
19
  st.write(data.isnull().sum())
20
-
21
- # Display statistical summary
22
- st.write("Statistical summary:")
23
  st.write(data.describe())
24
-
25
- # Visualize correlation matrix for numeric columns
26
  numeric_data = data.select_dtypes(include=['number'])
27
  if not numeric_data.empty:
28
- st.write("Correlation Matrix:")
29
  plt.figure(figsize=(10, 8))
30
  sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', center=0)
31
  st.pyplot(plt)
32
 
33
 
34
- def preprocess_data(data, target_column):
35
- """
36
- Preprocess the data: Handle categorical variables, missing values, and scale numeric features
37
- """
38
- # Fill missing values
39
- data.fillna(data.mean(), inplace=True)
40
-
41
- # Separate numeric and categorical columns
42
  numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns
43
- categorical_columns = data.select_dtypes(include=['object']).columns
44
-
45
- # Encode categorical columns
46
- for col in categorical_columns:
47
- label_encoder = LabelEncoder()
48
- data[col] = label_encoder.fit_transform(data[col])
49
 
50
- # Separate features and target
51
- X = data.drop(columns=[target_column])
52
- y = data[target_column]
53
 
54
- # Scale numeric features
55
  scaler = StandardScaler()
56
- X[numeric_columns] = scaler.fit_transform(X[numeric_columns])
 
 
57
 
58
- return X, y
59
 
60
-
61
- def train_and_evaluate_models(X_train, X_test, y_train, y_test, feature_names):
62
- """
63
- Train and evaluate multiple models
64
- """
65
  models = {
66
  'Linear Regression': LinearRegression(),
67
  'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
68
  }
69
 
70
  results = {}
71
-
72
  for name, model in models.items():
73
- # Train the model
74
- model.fit(X_train, y_train)
75
-
76
- # Make predictions
77
- train_pred = model.predict(X_train)
78
- test_pred = model.predict(X_test)
79
 
80
- # Calculate metrics
81
  results[name] = {
82
  'model': model,
83
  'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
@@ -86,81 +61,56 @@ def train_and_evaluate_models(X_train, X_test, y_train, y_test, feature_names):
86
  'test_r2': r2_score(y_test, test_pred)
87
  }
88
 
89
- # Display results
90
- st.write(f"{name} Results:")
91
- st.write(f"Training RMSE: {results[name]['train_rmse']:.2f}")
92
- st.write(f"Test RMSE: {results[name]['test_rmse']:.2f}")
93
- st.write(f"Training R²: {results[name]['train_r2']:.3f}")
94
- st.write(f"Test R²: {results[name]['test_r2']:.3f}")
95
-
96
- # Plot predictions
97
- plot_predictions(model, X_test, y_test, f"{name} Predictions vs Actual Values")
98
 
99
- # Feature importance for Random Forest
100
  if name == 'Random Forest':
101
  feature_importance = pd.DataFrame({
102
- 'feature': feature_names,
103
- 'importance': model.feature_importances_
104
- }).sort_values('importance', ascending=False)
105
- st.write("Feature Importance (Random Forest):")
106
  st.write(feature_importance)
107
 
108
- # Plot feature importance
109
  plt.figure(figsize=(10, 6))
110
- sns.barplot(x='importance', y='feature', data=feature_importance)
111
- plt.title('Feature Importance (Random Forest)')
112
  st.pyplot(plt)
113
 
114
  return results
115
 
116
 
117
- def plot_predictions(model, X_test, y_test, title):
118
- """
119
- Plot actual vs predicted values
120
- """
121
- predictions = model.predict(X_test)
122
-
123
- plt.figure(figsize=(10, 6))
124
- plt.scatter(y_test, predictions, alpha=0.5)
125
- plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
126
- plt.xlabel('Actual Values')
127
- plt.ylabel('Predicted Values')
128
- plt.title(title)
129
- st.pyplot(plt)
130
-
131
-
132
  def main():
133
- st.title("Machine Learning Model Training and Evaluation")
134
 
135
  uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
136
-
137
  if uploaded_file:
138
- # Load the dataset
139
  data = pd.read_csv(uploaded_file)
 
 
140
 
141
  # Analyze the data
142
- st.subheader("Data Analysis")
143
  analyze_data(data)
144
 
145
- # Select target column
146
- target_column = st.selectbox("Select the target column:", data.columns)
147
 
148
- if target_column:
149
- # Preprocess the data
150
- X, y = preprocess_data(data, target_column)
151
 
152
- # Split the data
153
- test_size = st.slider("Select test data size:", 0.1, 0.5, 0.2)
154
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=42)
155
 
156
- # Train and evaluate models
157
- st.subheader("Model Training and Evaluation")
158
- results = train_and_evaluate_models(X_train, X_test, y_train, y_test, X.columns)
159
-
160
- st.write("Training and evaluation completed!")
161
 
162
 
 
163
  if __name__ == "__main__":
164
  main()
165
 
166
-
 
2
  import pandas as pd
3
  import numpy as np
4
  from sklearn.model_selection import train_test_split
5
+ from sklearn.preprocessing import StandardScaler
6
  from sklearn.linear_model import LinearRegression
7
  from sklearn.ensemble import RandomForestRegressor
8
  from sklearn.metrics import mean_squared_error, r2_score
9
  import matplotlib.pyplot as plt
10
  import seaborn as sns
11
 
12
+ # Function Definitions
13
def analyze_data(data):
    """Display an exploratory summary of *data* in the Streamlit app.

    Shows missing-value counts, descriptive statistics, and — when the
    frame has at least one numeric column — a correlation heatmap.

    Args:
        data: pandas.DataFrame uploaded by the user.
    """
    st.write("### Data Analysis")
    st.write("**Missing Values:**")
    st.write(data.isnull().sum())
    st.write("**Statistical Summary:**")
    st.write(data.describe())

    # Correlation matrix — numeric columns only (corr() is undefined for text)
    numeric_data = data.select_dtypes(include=['number'])
    if not numeric_data.empty:
        st.write("**Correlation Matrix:**")
        # Use an explicit Figure instead of the implicit pyplot state:
        # st.pyplot(plt) renders matplotlib's global figure, and each
        # Streamlit rerun would create (and leak) a fresh one.
        fig = plt.figure(figsize=(10, 8))
        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', center=0)
        st.pyplot(fig)
        plt.close(fig)  # release the figure; the app script reruns often
27
 
28
 
29
def prepare_data(data, target_column=None):
    """Split a DataFrame into features ``X`` and target ``y``.

    Only numeric (int64/float64) columns are used. By default the LAST
    numeric column is treated as the target — the original behavior —
    but ``target_column`` may name it explicitly.

    Args:
        data: pandas.DataFrame with at least two numeric columns.
        target_column: optional name of the (numeric) target column.

    Returns:
        Tuple ``(X, y)``: feature DataFrame and target Series.

    Raises:
        ValueError: if fewer than two numeric columns exist, or the
            requested target column is not numeric.
    """
    numeric_columns = list(data.select_dtypes(include=['int64', 'float64']).columns)
    if len(numeric_columns) < 2:
        # Original code silently produced an empty X (or crashed on an
        # all-text frame); fail loudly with an actionable message instead.
        raise ValueError("Need at least two numeric columns (features + target).")
    if target_column is None:
        target_column = numeric_columns[-1]  # original default: last numeric column
    elif target_column not in numeric_columns:
        raise ValueError(f"Target column {target_column!r} is not numeric.")
    feature_columns = [col for col in numeric_columns if col != target_column]
    X = data[feature_columns]
    y = data[target_column]
    return X, y
 
 
 
34
 
 
 
 
35
 
36
def preprocess_data(X_train, X_test):
    """Standardize both feature matrices with one shared scaler.

    The scaler's statistics come from the training split only, so no
    information from the test split leaks into the transformation.

    Returns:
        Tuple ``(X_train_scaled, X_test_scaled, scaler)`` where the last
        element is the fitted StandardScaler.
    """
    fitted_scaler = StandardScaler().fit(X_train)
    return (
        fitted_scaler.transform(X_train),
        fitted_scaler.transform(X_test),
        fitted_scaler,
    )
41
 
 
42
 
43
def train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test, feature_names):
    """Train regression models and report their metrics in the app.

    Fits a Linear Regression and a Random Forest, writes RMSE and R² for
    both splits, and shows feature importances for the forest.

    Args:
        X_train_scaled, X_test_scaled: scaled feature matrices.
        y_train, y_test: target vectors for each split.
        feature_names: iterable of feature column names (importance table).

    Returns:
        dict mapping model name -> {'model', 'train_rmse', 'test_rmse',
        'train_r2', 'test_r2'}.
    """
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
    }

    def _rmse(y_true, y_pred):
        # Root mean squared error; sqrt(MSE) keeps compatibility with
        # sklearn versions lacking a root_mean_squared_error helper.
        return np.sqrt(mean_squared_error(y_true, y_pred))

    results = {}
    for name, model in models.items():
        model.fit(X_train_scaled, y_train)
        train_pred = model.predict(X_train_scaled)
        test_pred = model.predict(X_test_scaled)

        # All four keys are read by the st.write calls below.
        results[name] = {
            'model': model,
            'train_rmse': _rmse(y_train, train_pred),
            'test_rmse': _rmse(y_test, test_pred),
            'train_r2': r2_score(y_train, train_pred),
            'test_r2': r2_score(y_test, test_pred)
        }

        st.write(f"### {name} Results:")
        st.write(f"**Training RMSE:** {results[name]['train_rmse']:.2f}")
        st.write(f"**Test RMSE:** {results[name]['test_rmse']:.2f}")
        st.write(f"**Training R²:** {results[name]['train_r2']:.3f}")
        st.write(f"**Test R²:** {results[name]['test_r2']:.3f}")

        if name == 'Random Forest':
            feature_importance = pd.DataFrame({
                'Feature': feature_names,
                'Importance': model.feature_importances_
            }).sort_values('Importance', ascending=False)
            st.write("**Feature Importance:**")
            st.write(feature_importance)

            # Explicit Figure instead of st.pyplot(plt): the implicit
            # global figure leaks across Streamlit reruns.
            fig = plt.figure(figsize=(10, 6))
            sns.barplot(x='Importance', y='Feature', data=feature_importance)
            plt.title('Feature Importance')
            st.pyplot(fig)
            plt.close(fig)

    return results
84
 
85
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
86
def main():
    """Streamlit entry point: upload a CSV, explore it, and train models."""
    st.title("Housing Price Prediction")

    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if uploaded_file:
        data = pd.read_csv(uploaded_file)
        st.write("## Dataset Overview")
        st.write(data.head())

        # Analyze the data
        analyze_data(data)

        # Prepare the data (numeric features; last numeric column as target)
        X, y = prepare_data(data)

        # Train-test split — slider default is a 20% test set
        test_size = st.slider("Test data size:", 0.1, 0.5, 0.2)
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42
        )

        # Scale the features; the fitted scaler is not needed afterwards,
        # so discard it instead of binding an unused name.
        X_train_scaled, X_test_scaled, _ = preprocess_data(X_train, X_test)

        # Train and evaluate models
        st.write("## Model Training and Evaluation")
        train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test, X_train.columns)
 
 
111
 
112
 
113
# Run the app
if __name__ == "__main__":
    main()  # executed only when run as a script, not on import
116