V8055 committed on
Commit
ea5ef0b
·
verified ·
1 Parent(s): d99f5b5

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +149 -80
app.py CHANGED
@@ -2,96 +2,165 @@ import streamlit as st
2
  import pandas as pd
3
  import numpy as np
4
  from sklearn.model_selection import train_test_split
5
- from sklearn.preprocessing import StandardScaler
6
  from sklearn.linear_model import LinearRegression
7
  from sklearn.ensemble import RandomForestRegressor
8
  from sklearn.metrics import mean_squared_error, r2_score
9
  import matplotlib.pyplot as plt
10
  import seaborn as sns
11
 
12
- # Streamlit setup
13
- st.title("ML Model Training and Evaluation App")
14
- st.write("This app allows you to upload data, analyze it, train ML models, and visualize results.")
15
-
16
- # Upload dataset
17
- uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
18
-
19
- # Sidebar settings
20
- test_size = st.sidebar.slider("Test Size (Train/Test Split)", 0.1, 0.5, 0.2)
21
- random_state = st.sidebar.number_input("Random State", min_value=0, max_value=100, value=42)
22
- models_to_train = st.sidebar.multiselect(
23
- "Select Models to Train",
24
- ["Linear Regression", "Random Forest"],
25
- ["Linear Regression", "Random Forest"]
26
- )
27
-
28
- if uploaded_file:
29
- # Load the dataset
30
- data = pd.read_csv(uploaded_file)
31
- st.write("Dataset Preview:")
32
- st.dataframe(data.head())
33
-
34
- # Analyze the data
35
- if st.checkbox("Show Data Analysis"):
36
- st.write("Missing Values:")
37
- st.write(data.isnull().sum())
38
-
39
- st.write("Statistical Summary:")
40
- st.write(data.describe())
41
-
42
  st.write("Correlation Matrix:")
43
- numeric_data = data.select_dtypes(include=['number'])
44
  plt.figure(figsize=(10, 8))
45
  sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', center=0)
46
- st.pyplot()
 
 
 
 
 
 
 
 
 
 
 
 
47
 
48
- # Prepare the data
49
- X, y = data.iloc[:, :-1], data.iloc[:, -1]
50
- X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=random_state)
 
51
 
52
- # Scale the data
 
 
 
 
53
  scaler = StandardScaler()
54
- X_train_scaled = scaler.fit_transform(X_train)
55
- X_test_scaled = scaler.transform(X_test)
56
-
57
- # Train and evaluate models
58
- if st.button("Train Models"):
59
- results = {}
60
-
61
- if "Linear Regression" in models_to_train:
62
- lr = LinearRegression()
63
- lr.fit(X_train_scaled, y_train)
64
- y_pred_train = lr.predict(X_train_scaled)
65
- y_pred_test = lr.predict(X_test_scaled)
66
- results["Linear Regression"] = {
67
- "Train RMSE": np.sqrt(mean_squared_error(y_train, y_pred_train)),
68
- "Test RMSE": np.sqrt(mean_squared_error(y_test, y_pred_test)),
69
- "Train R²": r2_score(y_train, y_pred_train),
70
- "Test R²": r2_score(y_test, y_pred_test)
71
- }
72
-
73
- if "Random Forest" in models_to_train:
74
- rf = RandomForestRegressor(random_state=random_state, n_estimators=100)
75
- rf.fit(X_train_scaled, y_train)
76
- y_pred_train = rf.predict(X_train_scaled)
77
- y_pred_test = rf.predict(X_test_scaled)
78
- results["Random Forest"] = {
79
- "Train RMSE": np.sqrt(mean_squared_error(y_train, y_pred_train)),
80
- "Test RMSE": np.sqrt(mean_squared_error(y_test, y_pred_test)),
81
- "Train R²": r2_score(y_train, y_pred_train),
82
- "Test R²": r2_score(y_test, y_pred_test)
83
- }
84
-
85
- st.write("Model Results:")
86
- st.json(results)
87
-
88
- # Optional: Plot actual vs predicted for Random Forest
89
- if "Random Forest" in results:
90
- plt.figure(figsize=(8, 6))
91
- plt.scatter(y_test, rf.predict(X_test_scaled), alpha=0.5)
92
- plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', lw=2)
93
- plt.xlabel("Actual")
94
- plt.ylabel("Predicted")
95
- plt.title("Random Forest: Actual vs Predicted")
96
- st.pyplot()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
97
 
 
2
  import pandas as pd
3
  import numpy as np
4
  from sklearn.model_selection import train_test_split
5
+ from sklearn.preprocessing import StandardScaler, LabelEncoder
6
  from sklearn.linear_model import LinearRegression
7
  from sklearn.ensemble import RandomForestRegressor
8
  from sklearn.metrics import mean_squared_error, r2_score
9
  import matplotlib.pyplot as plt
10
  import seaborn as sns
11
 
12
+
13
def analyze_data(data):
    """Render a quick exploratory summary of ``data`` on the Streamlit page.

    Writes, in order: the per-column count of missing values, the output of
    ``DataFrame.describe()``, and (when the frame has numeric data) a
    correlation heatmap of the numeric columns.

    Args:
        data: The pandas DataFrame to summarise. It is only read, never
            modified.

    Returns:
        None. All output goes to the Streamlit page.
    """
    # Check for missing values
    st.write("Missing values:")
    st.write(data.isnull().sum())

    # Display statistical summary
    st.write("Statistical summary:")
    st.write(data.describe())

    # Visualize correlation matrix for numeric columns
    numeric_data = data.select_dtypes(include=['number'])
    if numeric_data.empty:
        # Nothing numeric to correlate; skip the heatmap entirely.
        return

    st.write("Correlation Matrix:")
    # Draw on an explicit Figure instead of pyplot's implicit global figure,
    # and close it once rendered so figures do not pile up across reruns.
    fig, ax = plt.subplots(figsize=(10, 8))
    try:
        sns.heatmap(numeric_data.corr(), annot=True, cmap='coolwarm', center=0, ax=ax)
        st.pyplot(fig)
    finally:
        plt.close(fig)
32
+
33
+
34
def preprocess_data(data, target_column):
    """Prepare a raw DataFrame for model training.

    Steps, in order:
      1. Fill missing values in numeric columns with the column mean.
      2. Label-encode every ``object`` column (the target too, if it is one).
      3. Split into features ``X`` and target ``y``.
      4. Standardise the numeric feature columns with ``StandardScaler``.

    The input frame is not modified; all work happens on a copy.

    Args:
        data: Source pandas DataFrame containing features and the target.
        target_column: Name of the column in ``data`` to predict.

    Returns:
        A ``(X, y)`` tuple: ``X`` is a DataFrame of the processed feature
        columns and ``y`` is the target column as a Series.

    Raises:
        KeyError: If ``target_column`` is not a column of ``data``.
    """
    # Work on a copy so the caller's DataFrame is left untouched.
    data = data.copy()

    # Separate numeric and categorical columns
    numeric_columns = data.select_dtypes(include=['int64', 'float64']).columns
    categorical_columns = data.select_dtypes(include=['object']).columns

    # Fill missing values. The mean is taken over numeric columns only:
    # averaging a frame that still holds string columns raises on recent
    # pandas versions.
    if len(numeric_columns) > 0:
        numeric_part = data[numeric_columns]
        data[numeric_columns] = numeric_part.fillna(numeric_part.mean())

    # Encode categorical columns
    for col in categorical_columns:
        data[col] = LabelEncoder().fit_transform(data[col])

    # Separate features and target
    X = data.drop(columns=[target_column])
    y = data[target_column]

    # Scale numeric features. The target is excluded: it is no longer a
    # column of X (indexing X with it raises KeyError) and must not be scaled.
    numeric_features = [col for col in numeric_columns if col != target_column]
    if numeric_features:
        X[numeric_features] = StandardScaler().fit_transform(X[numeric_features])

    return X, y
59
+
60
+
61
def train_and_evaluate_models(X_train, X_test, y_train, y_test, feature_names, random_state=42):
    """Fit a linear regression and a random forest, then report on each.

    For every model this writes train/test RMSE and R² to the Streamlit page
    and draws an actual-vs-predicted plot via ``plot_predictions``. For the
    random forest it also shows a feature-importance table and bar chart.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training target values.
        y_test: Test target values.
        feature_names: Feature labels, in the same order as the columns of
            ``X_train``; used to label the feature-importance output.
        random_state: Seed passed to ``RandomForestRegressor``. Defaults to
            42, the value that was previously hard-coded.

    Returns:
        A dict keyed by model name. Each value is a dict with the fitted
        estimator under ``'model'`` and the metrics under ``'train_rmse'``,
        ``'test_rmse'``, ``'train_r2'`` and ``'test_r2'``.
    """
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=random_state),
    }

    results = {}

    for name, model in models.items():
        # Train the model
        model.fit(X_train, y_train)

        # Make predictions
        train_pred = model.predict(X_train)
        test_pred = model.predict(X_test)

        # Calculate metrics
        metrics = {
            'model': model,
            'train_rmse': np.sqrt(mean_squared_error(y_train, train_pred)),
            'test_rmse': np.sqrt(mean_squared_error(y_test, test_pred)),
            'train_r2': r2_score(y_train, train_pred),
            'test_r2': r2_score(y_test, test_pred),
        }
        results[name] = metrics

        # Display results
        st.write(f"{name} Results:")
        st.write(f"Training RMSE: {metrics['train_rmse']:.2f}")
        st.write(f"Test RMSE: {metrics['test_rmse']:.2f}")
        st.write(f"Training R²: {metrics['train_r2']:.3f}")
        st.write(f"Test R²: {metrics['test_r2']:.3f}")

        # Plot predictions
        plot_predictions(model, X_test, y_test, f"{name} Predictions vs Actual Values")

        # Feature importance for Random Forest
        if name == 'Random Forest':
            feature_importance = pd.DataFrame({
                'feature': feature_names,
                'importance': model.feature_importances_,
            }).sort_values('importance', ascending=False)
            st.write("Feature Importance (Random Forest):")
            st.write(feature_importance)

            # Plot feature importance on an explicit Figure and close it after
            # rendering, so figures do not accumulate across reruns.
            fig, ax = plt.subplots(figsize=(10, 6))
            try:
                sns.barplot(x='importance', y='feature', data=feature_importance, ax=ax)
                ax.set_title('Feature Importance (Random Forest)')
                st.pyplot(fig)
            finally:
                plt.close(fig)

    return results
115
+
116
+
117
def plot_predictions(model, X_test, y_test, title):
    """Render an actual-vs-predicted scatter plot on the Streamlit page.

    A dashed red diagonal from ``y_test.min()`` to ``y_test.max()`` marks
    where perfect predictions would lie.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X_test: Feature matrix to predict on.
        y_test: True target values; must provide ``min()`` and ``max()``.
        title: Title drawn above the plot.

    Returns:
        None. The figure is rendered to the Streamlit page.
    """
    predictions = model.predict(X_test)
    lowest, highest = y_test.min(), y_test.max()

    # Use an explicit Figure rather than pyplot's implicit global one, and
    # close it once rendered so figures do not accumulate across reruns.
    fig, ax = plt.subplots(figsize=(10, 6))
    try:
        ax.scatter(y_test, predictions, alpha=0.5)
        ax.plot([lowest, highest], [lowest, highest], 'r--', lw=2)
        ax.set_xlabel('Actual Values')
        ax.set_ylabel('Predicted Values')
        ax.set_title(title)
        st.pyplot(fig)
    finally:
        plt.close(fig)
130
+
131
+
132
def main():
    """Streamlit entry point: upload a CSV, explore it, then train models.

    Flow: file upload -> data analysis -> target selection -> preprocessing
    -> train/test split -> model training and evaluation. Each stage returns
    early, with a message on the page where relevant, when its input is
    missing or unusable.
    """
    st.title("Machine Learning Model Training and Evaluation")

    uploaded_file = st.file_uploader("Upload a CSV file", type=["csv"])
    if not uploaded_file:
        return

    # Load the dataset; report unreadable files on the page instead of
    # letting the parser exception surface as a raw traceback.
    try:
        data = pd.read_csv(uploaded_file)
    except (pd.errors.EmptyDataError, pd.errors.ParserError) as exc:
        st.error(f"Could not read the uploaded CSV file: {exc}")
        return

    # Splitting needs at least two rows, and training needs at least one
    # feature column besides the target.
    if data.shape[0] < 2 or data.shape[1] < 2:
        st.error("The dataset must contain at least two rows and two columns.")
        return

    # Analyze the data
    st.subheader("Data Analysis")
    analyze_data(data)

    # Select target column
    target_column = st.selectbox("Select the target column:", data.columns)
    if target_column is None:
        return

    # Preprocess the data
    X, y = preprocess_data(data, target_column)

    # Split the data
    test_size = st.slider("Select test data size:", 0.1, 0.5, 0.2)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=42
    )

    # Train and evaluate models
    st.subheader("Model Training and Evaluation")
    train_and_evaluate_models(X_train, X_test, y_train, y_test, X.columns)

    st.write("Training and evaluation completed!")
161
+
162
+
163
# Start the app only when this file is executed as a script, so importing
# the module (e.g. to reuse its helpers) does not launch the UI.
if __name__ == "__main__":
    main()
165
+
166