V8055 committed on
Commit
d99f5b5
·
verified ·
1 Parent(s): cc93377

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +71 -126
app.py CHANGED
@@ -9,144 +9,89 @@ from sklearn.metrics import mean_squared_error, r2_score
9
  import matplotlib.pyplot as plt
10
  import seaborn as sns
11
 
12
def analyze_data(data):
    """Run a quick exploratory pass over *data*.

    Prints missing-value counts and descriptive statistics, then shows a
    correlation heatmap of the numeric columns.
    """
    print("\nMissing values:")
    print(data.isnull().sum())

    print("\nStatistical summary:")
    print(data.describe())

    # Correlations are only meaningful for numeric columns.
    numeric_only = data.select_dtypes(include=['number'])

    plt.figure(figsize=(10, 8))
    sns.heatmap(numeric_only.corr(), annot=True, cmap='coolwarm', center=0)
    plt.title('Correlation Matrix')
    plt.tight_layout()
    plt.show()
 
 
33
 
34
def prepare_data(data):
    """Split *data* into a feature matrix and a target vector.

    The last numeric column is assumed to be the target (e.g. price);
    every other numeric column becomes a feature.

    Parameters
    ----------
    data : pandas.DataFrame
        Input dataset; may contain non-numeric columns (they are ignored).

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is a DataFrame of feature columns and
        ``y`` is a Series holding the target column.

    Raises
    ------
    ValueError
        If *data* has fewer than two numeric columns.
    """
    # FIX: include=['int64', 'float64'] silently dropped other numeric
    # dtypes (int32, float32, ...). 'number' matches them all and is
    # consistent with the numeric selection used in analyze_data.
    numeric_columns = data.select_dtypes(include=['number']).columns
    if len(numeric_columns) < 2:
        raise ValueError("need at least two numeric columns (features + target)")

    X = data[numeric_columns[:-1]]
    y = data[numeric_columns[-1]]
    return X, y
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
 
 
 
 
49
 
50
def preprocess_data(X_train, X_test):
    """Standardize features with a StandardScaler fitted on the training split.

    Returns the scaled train matrix, the scaled test matrix, and the
    fitted scaler (so callers can transform future data consistently).
    """
    feature_scaler = StandardScaler()
    train_scaled = feature_scaler.fit_transform(X_train)
    # The test split is transformed with the *training* statistics only,
    # never refit, to avoid leaking test information.
    test_scaled = feature_scaler.transform(X_test)
    return train_scaled, test_scaled, feature_scaler
59
 
60
def train_and_evaluate_models(X_train_scaled, X_test_scaled, y_train, y_test, X_train):
    """Fit a fixed set of regressors and collect train/test RMSE and R² for each.

    Parameters
    ----------
    X_train_scaled, X_test_scaled : array-like
        Standardized feature matrices for the train and test splits.
    y_train, y_test : array-like
        Target values for each split.
    X_train : pandas.DataFrame
        Unscaled training features; used only for its column names when
        reporting Random Forest feature importances.

    Returns
    -------
    dict
        Mapping of model name to a dict with keys ``'model'``,
        ``'train_rmse'``, ``'test_rmse'``, ``'train_r2'``, ``'test_r2'``.
    """
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42)
    }

    results = {}

    for name, model in models.items():
        model.fit(X_train_scaled, y_train)

        train_pred = model.predict(X_train_scaled)
        test_pred = model.predict(X_test_scaled)

        results[name] = {
            'model': model,
            'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
            'test_rmse': np.sqrt(mean_squared_error(y_test, test_pred)),
            'train_r2': r2_score(y_train, train_pred),
            'test_r2': r2_score(y_test, test_pred)
        }

        # Feature importances only exist on the tree-based model.
        if name == 'Random Forest':
            feature_importance = pd.DataFrame({
                'feature': X_train.columns,
                'importance': model.feature_importances_
            }).sort_values('importance', ascending=False)
            # FIX: was an f-string with no placeholders.
            print("\nFeature Importance:")
            print(feature_importance)

            plt.figure(figsize=(10, 6))
            sns.barplot(x='importance', y='feature', data=feature_importance)
            plt.title('Feature Importance (Random Forest)')
            plt.tight_layout()
            plt.show()

    return results
105
-
106
 
107
def plot_predictions(model, X_test_scaled, y_test, title):
    """Scatter actual vs. predicted targets for *model* with a y=x reference line."""
    preds = model.predict(X_test_scaled)

    plt.figure(figsize=(10, 6))
    plt.scatter(y_test, preds, alpha=0.5)
    # Red dashed diagonal: points on this line are perfect predictions.
    lo, hi = y_test.min(), y_test.max()
    plt.plot([lo, hi], [lo, hi], 'r--', lw=2)
    plt.xlabel('Actual Prices')
    plt.ylabel('Predicted Prices')
    plt.title(title)
    plt.tight_layout()
    plt.show()
121
-
122
def main(data):
    """End-to-end pipeline: explore, split, scale, train, report, and plot.

    Returns the results dict produced by train_and_evaluate_models.
    """
    analyze_data(data)

    X, y = prepare_data(data)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    X_train_scaled, X_test_scaled, scaler = preprocess_data(X_train, X_test)

    results = train_and_evaluate_models(
        X_train_scaled, X_test_scaled, y_train, y_test, X_train
    )

    # Summarize each model's metrics and show its prediction scatter plot.
    for name, metrics in results.items():
        print(f"\n{name} Results:")
        print(f"Training RMSE: ${metrics['train_rmse']:.2f}")
        print(f"Test RMSE: ${metrics['test_rmse']:.2f}")
        print(f"Training R²: {metrics['train_r2']:.3f}")
        print(f"Test R²: {metrics['test_r2']:.3f}")
        plot_predictions(metrics['model'], X_test_scaled, y_test, f"{name} Predictions vs Actual Values")

    return results
150
 
151
# Kick off the full analysis-and-modeling pipeline.
# NOTE(review): `data` is not defined in this file's visible scope — presumably
# loaded earlier (e.g. in a notebook cell); confirm before running standalone.
results = main(data)
 
9
  import matplotlib.pyplot as plt
10
  import seaborn as sns
11
 
12
# Streamlit setup
st.title("ML Model Training and Evaluation App")
st.write("This app allows you to upload data, analyze it, train ML models, and visualize results.")

# Upload dataset
uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])

# Sidebar settings
test_size = st.sidebar.slider("Test Size (Train/Test Split)", 0.1, 0.5, 0.2)
random_state = st.sidebar.number_input("Random State", min_value=0, max_value=100, value=42)
models_to_train = st.sidebar.multiselect(
    "Select Models to Train",
    ["Linear Regression", "Random Forest"],
    ["Linear Regression", "Random Forest"]
)

if uploaded_file:
    # Load the dataset
    data = pd.read_csv(uploaded_file)
    st.write("Dataset Preview:")
    st.dataframe(data.head())

    # Optional exploratory analysis
    if st.checkbox("Show Data Analysis"):
        st.write("Missing Values:")
        st.write(data.isnull().sum())

        st.write("Statistical Summary:")
        st.write(data.describe())

        st.write("Correlation Matrix:")
        numeric_data = data.select_dtypes(include=['number'])
        # FIX: bare st.pyplot() uses the deprecated global-figure API;
        # build an explicit figure and pass it.
        fig, ax = plt.subplots(figsize=(10, 8))
        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', center=0, ax=ax)
        st.pyplot(fig)

    # Prepare the data.
    # FIX: the original used data.iloc[:, :-1] / data.iloc[:, -1], which feeds
    # non-numeric (string/categorical) columns into StandardScaler and crashes
    # on any CSV with text columns. Restrict to numeric columns, treating the
    # last numeric column as the target.
    numeric_cols = data.select_dtypes(include=['number']).columns
    if len(numeric_cols) < 2:
        st.error("The dataset needs at least two numeric columns (features + target).")
        st.stop()
    X, y = data[numeric_cols[:-1]], data[numeric_cols[-1]]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Scale the data (scaler fitted on train only; test is transformed, never refit)
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Train and evaluate models
    if st.button("Train Models"):
        results = {}

        def _evaluate(model):
            """Fit *model* on the scaled training split and return RMSE/R² metrics."""
            model.fit(X_train_scaled, y_train)
            y_pred_train = model.predict(X_train_scaled)
            y_pred_test = model.predict(X_test_scaled)
            return {
                "Train RMSE": np.sqrt(mean_squared_error(y_train, y_pred_train)),
                "Test RMSE": np.sqrt(mean_squared_error(y_test, y_pred_test)),
                "Train R²": r2_score(y_train, y_pred_train),
                "Test R²": r2_score(y_test, y_pred_test)
            }

        if "Linear Regression" in models_to_train:
            results["Linear Regression"] = _evaluate(LinearRegression())

        rf = None
        if "Random Forest" in models_to_train:
            rf = RandomForestRegressor(random_state=random_state, n_estimators=100)
            results["Random Forest"] = _evaluate(rf)

        st.write("Model Results:")
        st.json(results)

        # Optional: Plot actual vs predicted for Random Forest
        if rf is not None:
            fig, ax = plt.subplots(figsize=(8, 6))
            ax.scatter(y_test, rf.predict(X_test_scaled), alpha=0.5)
            ax.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
            ax.set_xlabel("Actual")
            ax.set_ylabel("Predicted")
            ax.set_title("Random Forest: Actual vs Predicted")
            st.pyplot(fig)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97