sathishleo committed on
Commit
3182f0c
·
1 Parent(s): 5f6769b

Add app.py, backend, and model for HF Space

Browse files
Files changed (2) hide show
  1. app.py +11 -20
  2. backend/train_model.py +142 -193
app.py CHANGED
@@ -4,7 +4,7 @@ import subprocess
4
  import joblib
5
  import pandas as pd
6
  import streamlit as st
7
-
8
  NONE = None
9
  # from backend.train_model import train_model
10
 
@@ -69,27 +69,18 @@ def predict_df(df: pd.DataFrame):
69
  return None
70
  return model.predict(df[FEATURES])
71
 
72
- # ---------- Pages ----------
73
- model = joblib.load(MODEL_PATH)
74
- if page == "Train":
75
- st.subheader("🔹 Train")
76
- @st.cache_resource
77
- def load_model(path):
78
- if os.path.exists(path):
79
- model = joblib.load(path)
80
- st.sidebar.success("✅ Best model loaded")
81
- return model
82
- else:
83
- result = subprocess.run(["python", "backend/train_model.py"], capture_output=True, text=True)
84
- st.text(result.stdout)
85
- st.text(result.stderr)
86
-
87
- # Reload the trained model
88
- model = load_model(MODEL_PATH)
89
- return model
90
 
 
 
91
 
92
- model = load_model(MODEL_PATH)
 
 
 
 
93
  elif page == "Predict":
94
  st.subheader("🔹 Single Prediction")
95
 
 
4
  import joblib
5
  import pandas as pd
6
  import streamlit as st
7
+ from backend.train_model import train_model # your function
8
  NONE = None
9
  # from backend.train_model import train_model
10
 
 
69
  return None
70
  return model.predict(df[FEATURES])
71
 
72
+ # # ---------- Pages ----------
73
+ # model = joblib.load(MODEL_PATH)
74
+ st.title("Train & Predict Diabetes Model")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
75
 
76
+ if not os.path.exists(MODEL_PATH):
77
+ st.warning("No trained model found. Please train the model first.")
78
 
79
+ if st.button("Train Model"):
80
+ st.info("Training started...")
81
+ model = train_model(MODEL_PATH, REPORTS_DIR, PLOTS_DIR)
82
+ joblib.dump(model, MODEL_PATH)
83
+ st.success(f"Model trained and saved to {MODEL_PATH}")
84
  elif page == "Predict":
85
  st.subheader("🔹 Single Prediction")
86
 
backend/train_model.py CHANGED
@@ -13,8 +13,8 @@ from sklearn.model_selection import train_test_split, GridSearchCV,StratifiedKFo
13
  from sklearn.pipeline import Pipeline
14
  from sklearn.preprocessing import StandardScaler
15
  from sklearn.metrics import (
16
- accuracy_score, f1_score, precision_score, recall_score,
17
- classification_report
18
  )
19
  from sklearn.linear_model import LogisticRegression
20
  from sklearn.tree import DecisionTreeClassifier
@@ -51,197 +51,146 @@ os.makedirs(PLOTS_DIR, exist_ok=True)
51
  # raise FileNotFoundError(f"Dataset not found at {DATA_PATH}")
52
 
53
  ### Load with hugging face dataset
54
- ds = load_dataset("jonathansuru/diabetes")
55
- df = ds['train'].to_pandas()
56
- # df = pd.read_csv(DATA_PATH)
57
- X = df.drop("Outcome", axis=1)
58
- Y = df["Outcome"].astype(int)
59
- print(f"[INFO] Loaded dataset: {df.shape[0]} rows, {df.shape[1]} cols")
60
-
61
-
62
- # ------------------------------
63
- # Outlier removal (z-score)
64
- # ------------------------------
65
- z = np.abs(stats.zscore(X))
66
- non_outlier_mask = (z < 3).all(axis=1)
67
- X_clean = X[non_outlier_mask]
68
- Y_clean = Y[non_outlier_mask]
69
- print(f"[INFO] Outliers removed: {len(X) - len(X_clean)} | Clean size:{len(X_clean)}")
70
- # Variance comparison
71
- var_df = pd.DataFrame({
72
- "Before": X.var(),
73
- "After": X_clean.var()
74
- })
75
- var_df.to_csv(os.path.join(REPORTS_DIR, "variance_before_after.csv"))
76
- plt.figure(figsize=(10,5))
77
- var_df.plot(kind='bar')
78
- plt.title("Feature Variance: Before vs After Outlier Removal")
79
- plt.ylabel("Variance")
80
- plt.xticks(rotation=45, ha='right')
81
- plt.tight_layout()
82
- plt.savefig(os.path.join(PLOTS_DIR, "variance_comparison.png"),
83
- bbox_inches='tight')
84
- plt.close()
85
-
86
- # ------------------------------
87
- # Split
88
- # ------------------------------
89
-
90
- X_train, X_test, y_train, y_test = train_test_split(
91
- X_clean, Y_clean, test_size=0.2, random_state=42, stratify=Y_clean
92
- )
93
-
94
- # ------------------------------
95
- # Models + grids
96
- # ------------------------------
97
-
98
-
99
- cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
100
- models = {
101
- "LogReg_L1": Pipeline([
102
- ("scaler", StandardScaler()),
103
- ("clf", LogisticRegression(penalty="l1", solver="liblinear",
104
- max_iter=2000))
105
- ]),
106
- "LogReg_L2": Pipeline([
107
- ("scaler", StandardScaler()),
108
- ("clf", LogisticRegression(penalty="l2", solver="lbfgs",
109
- max_iter=2000))
110
- ]),
111
- "DecisionTree": DecisionTreeClassifier(random_state=42),
112
- "RandomForest": RandomForestClassifier(random_state=42),
113
- "BaggedDecisionTree": BaggingClassifier(
114
- estimator=DecisionTreeClassifier(random_state=42),
115
- n_estimators=50,
116
- random_state=42
117
- )
118
- }
119
- param_grids = {
120
- "LogReg_L1": {"clf__C": [0.01, 0.1, 1, 10]},
121
- "LogReg_L2": {"clf__C": [0.01, 0.1, 1, 10]},
122
- "DecisionTree": {"max_depth": [3, 5, 7, None], "min_samples_split": [2,
123
- 5, 10]},
124
- "RandomForest": {"n_estimators": [100, 200], "max_depth": [None, 5, 10],
125
- "min_samples_split": [2, 5]},
126
- "BaggedDecisionTree": {"n_estimators": [30, 50, 100]},
127
- }
128
- # ------------------------------
129
- # Grid search + evaluation
130
- # ------------------------------
131
- rows = []
132
- best_name, best_estimator, best_f1 = None, None, -1
133
- for name, model in models.items():
134
- print(f"\n[GRID] Tuning {name} …")
135
- gs = GridSearchCV(model, param_grids[name], scoring="f1", cv=cv,
136
- n_jobs=-1)
137
- gs.fit(X_train, y_train)
138
- y_pred = gs.best_estimator_.predict(X_test)
139
- acc = accuracy_score(y_test, y_pred)
140
- f1 = f1_score(y_test, y_pred)
141
- prec = precision_score(y_test, y_pred)
142
- rec = recall_score(y_test, y_pred)
143
- print(f"[GRID] {name} | best_params={gs.best_params_} | ACC={acc:.4f} F1 = {f1: .4f} P = {prec: .4f} R = {rec: .4f}")
144
- print(classification_report(y_test, y_pred, digits=4))
145
- rows.append({
146
- "Model": name,
147
- "BestParams": gs.best_params_,
148
- "Accuracy": acc,
149
- "F1": f1,
150
- "Precision": prec,
151
- "Recall": rec
152
- })
153
- if f1 > best_f1:
154
- best_f1 = f1
155
- best_estimator = gs.best_estimator_
156
- best_name = name
157
- # Save table reports
158
- # results_df = pd.DataFrame(rows).sort_values(by="F1", ascending=False)
159
- # --- Save model comparison table ---
160
- results_df = pd.DataFrame(rows).sort_values(by="F1", ascending=False)
161
- results_df.to_csv(os.path.join(REPORTS_DIR, "model_comparison.csv"), index=False)
162
-
163
- with open(os.path.join(REPORTS_DIR, "model_comparison.json"), "w") as f:
164
- json.dump(rows, f, indent=4)
165
-
166
- # --- Save plots ---
167
- # barplot_metric(results_df,"Accuracy",os.path.join(PLOTS_DIR, "model_accuracy.png"),"Model Accuracy (tuned)")
168
- #
169
- # barplot_metric(results_df,"F1",os.path.join(PLOTS_DIR, "model_f1.png"),"Model F1 (tuned)")
170
-
171
- # Best model diagnostics
172
- y_best = best_estimator.predict(X_test)
173
- plot_cm(y_test, y_best, f"Confusion Matrix – {best_name}",os.path.join(PLOTS_DIR, "confusion_matrix.png"))
174
- # ROC (if proba available)
175
- if hasattr(best_estimator, "predict_proba"):
176
- y_prob = best_estimator.predict_proba(X_test)[:, 1]
177
- plot_roc(y_test, y_prob, f"ROC – {best_name}", os.path.join(PLOTS_DIR,"roc_curve.png"))
178
- # Save best model
179
- joblib.dump(best_estimator, os.path.join(MODEL_DIR, "best_model.pkl"))
180
- print(f"\n[OK] Saved best model: {best_name} (F1={best_f1:.4f}) -> backend/models / best_model.pkl")
181
-
182
- # ------------------------------
183
- # Gradient analysis (loss & accuracy vs iterations) using SAGA
184
- # ------------------------------
185
- from sklearn.preprocessing import StandardScaler
186
- from sklearn.linear_model import LogisticRegression
187
- from sklearn.metrics import log_loss, accuracy_score
188
- import numpy as np
189
- import os
190
-
191
- # Scale data
192
- scaler = StandardScaler()
193
- X_scaled = scaler.fit_transform(X_clean)
194
- X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(
195
- X_scaled, Y_clean, test_size=0.2, random_state=42, stratify=Y_clean
196
- )
197
-
198
-
199
- def track_training(penalty, max_iter=50):
200
- clf = LogisticRegression(
201
- penalty=penalty,
202
- solver="saga",
203
- warm_start=True, # allows continuing training
204
- max_iter=1, # train one step at a time
205
- random_state=42
206
  )
207
 
208
- losses, accs = [], []
209
- for i in range(max_iter):
210
- clf.fit(X_train_g, y_train_g) # trains 1 iteration per loop
211
- y_pred = clf.predict_proba(X_train_g)
212
- losses.append(log_loss(y_train_g, y_pred))
213
- accs.append(accuracy_score(y_train_g, np.argmax(y_pred, axis=1)))
214
-
215
- return losses, accs
216
-
217
-
218
- # Collect curves
219
- loss_curves, acc_curves = {}, {}
220
- loss_curves["L2"], acc_curves["L2"] = track_training("l2", max_iter=50)
221
- loss_curves["L1"], acc_curves["L1"] = track_training("l1", max_iter=50)
222
-
223
- # Plot curves
224
- lineplot_curves(
225
- loss_curves,
226
- ylabel="Log Loss",
227
- title="Logistic Regression – Loss vs Iterations",
228
- save_path=os.path.join(PLOTS_DIR, "logreg_loss_curves.png")
229
- )
230
-
231
- lineplot_curves(
232
- acc_curves,
233
- ylabel="Training Accuracy",
234
- title="Logistic Regression – Accuracy vs Iterations",
235
- save_path=os.path.join(PLOTS_DIR, "logreg_accuracy_curves.png")
236
- )
237
-
238
- print(f"[OK] Reports saved under: {REPORTS_DIR}")
239
- # Accuracy and F1 bar plots
240
- # barplot_metric(results_df, "Accuracy", os.path.join(PLOTS_DIR, "model_accuracy.png"), "Model Accuracy (tuned)")
241
- # barplot_metric(results_df, "F1", os.path.join(PLOTS_DIR, "model_f1.png"), "Model F1 (tuned)")
242
- # plt.savefig(os.path.join(PLOTS_DIR, "variance_comparison.png"), bbox_inches='tight')
243
- # plt.close()
244
- barplot_metric(results_df, "Accuracy", os.path.join(PLOTS_DIR, "model_accuracy.png"), "Model Accuracy (tuned)")
245
- barplot_metric(results_df, "F1", os.path.join(PLOTS_DIR, "model_f1.png"), "Model F1 (tuned)")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
246
 
247
- print(f"[OK] Plots saved -> {PLOTS_DIR}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
13
  from sklearn.pipeline import Pipeline
14
  from sklearn.preprocessing import StandardScaler
15
  from sklearn.metrics import (
16
+ accuracy_score, f1_score, precision_score, recall_score,
17
+ classification_report, log_loss
18
  )
19
  from sklearn.linear_model import LogisticRegression
20
  from sklearn.tree import DecisionTreeClassifier
 
51
  # raise FileNotFoundError(f"Dataset not found at {DATA_PATH}")
52
 
53
  ### Load with hugging face dataset
54
+ def train_model(MODEL_DIR, REPORTS_DIR, PLOTS_DIR):
55
+ ds = load_dataset("jonathansuru/diabetes")
56
+ df = ds['train'].to_pandas()
57
+ X = df.drop("Outcome", axis=1)
58
+ Y = df["Outcome"].astype(int)
59
+ print(f"[INFO] Loaded dataset: {df.shape[0]} rows, {df.shape[1]} cols")
60
+
61
+ # ------------------------------
62
+ # Outlier removal (z-score)
63
+ # ------------------------------
64
+ z = np.abs(stats.zscore(X))
65
+ non_outlier_mask = (z < 3).all(axis=1)
66
+ X_clean = X[non_outlier_mask]
67
+ Y_clean = Y[non_outlier_mask]
68
+ print(f"[INFO] Outliers removed: {len(X) - len(X_clean)} | Clean size:{len(X_clean)}")
69
+
70
+ # Variance comparison
71
+ var_df = pd.DataFrame({"Before": X.var(), "After": X_clean.var()})
72
+ var_df.to_csv(os.path.join(REPORTS_DIR, "variance_before_after.csv"))
73
+ plt.figure(figsize=(10,5))
74
+ var_df.plot(kind='bar')
75
+ plt.title("Feature Variance: Before vs After Outlier Removal")
76
+ plt.ylabel("Variance")
77
+ plt.xticks(rotation=45, ha='right')
78
+ plt.tight_layout()
79
+ plt.savefig(os.path.join(PLOTS_DIR, "variance_comparison.png"), bbox_inches='tight')
80
+ plt.close()
81
+
82
+ # ------------------------------
83
+ # Split
84
+ # ------------------------------
85
+ X_train, X_test, y_train, y_test = train_test_split(
86
+ X_clean, Y_clean, test_size=0.2, random_state=42, stratify=Y_clean
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  )
88
 
89
+ # ------------------------------
90
+ # Models + grids
91
+ # ------------------------------
92
+ cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
93
+ models = {
94
+ "LogReg_L1": Pipeline([
95
+ ("scaler", StandardScaler()),
96
+ ("clf", LogisticRegression(penalty="l1", solver="liblinear", max_iter=2000))
97
+ ]),
98
+ "LogReg_L2": Pipeline([
99
+ ("scaler", StandardScaler()),
100
+ ("clf", LogisticRegression(penalty="l2", solver="lbfgs", max_iter=2000))
101
+ ]),
102
+ "DecisionTree": DecisionTreeClassifier(random_state=42),
103
+ "RandomForest": RandomForestClassifier(random_state=42),
104
+ "BaggedDecisionTree": BaggingClassifier(
105
+ estimator=DecisionTreeClassifier(random_state=42),
106
+ n_estimators=50,
107
+ random_state=42
108
+ )
109
+ }
110
+ param_grids = {
111
+ "LogReg_L1": {"clf__C": [0.01, 0.1, 1, 10]},
112
+ "LogReg_L2": {"clf__C": [0.01, 0.1, 1, 10]},
113
+ "DecisionTree": {"max_depth": [3,5,7,None], "min_samples_split": [2,5,10]},
114
+ "RandomForest": {"n_estimators": [100,200], "max_depth": [None,5,10], "min_samples_split": [2,5]},
115
+ "BaggedDecisionTree": {"n_estimators": [30,50,100]},
116
+ }
117
+
118
+ # ------------------------------
119
+ # Grid search + evaluation
120
+ # ------------------------------
121
+ rows = []
122
+ best_name, best_estimator, best_f1 = None, None, -1
123
+
124
+ for name, model in models.items():
125
+ print(f"\n[GRID] Tuning {name} ")
126
+ gs = GridSearchCV(model, param_grids[name], scoring="f1", cv=cv, n_jobs=-1)
127
+ gs.fit(X_train, y_train)
128
+ y_pred = gs.best_estimator_.predict(X_test)
129
+ acc = accuracy_score(y_test, y_pred)
130
+ f1 = f1_score(y_test, y_pred)
131
+ prec = precision_score(y_test, y_pred)
132
+ rec = recall_score(y_test, y_pred)
133
+ print(f"[GRID] {name} | best_params={gs.best_params_} | ACC={acc:.4f} F1={f1:.4f} P={prec:.4f} R={rec:.4f}")
134
+ print(classification_report(y_test, y_pred, digits=4))
135
+ rows.append({
136
+ "Model": name,
137
+ "BestParams": gs.best_params_,
138
+ "Accuracy": acc,
139
+ "F1": f1,
140
+ "Precision": prec,
141
+ "Recall": rec
142
+ })
143
+ if f1 > best_f1:
144
+ best_f1 = f1
145
+ best_estimator = gs.best_estimator_
146
+ best_name = name
147
+
148
+ # --- Save model comparison ---
149
+ results_df = pd.DataFrame(rows).sort_values(by="F1", ascending=False)
150
+ results_df.to_csv(os.path.join(REPORTS_DIR, "model_comparison.csv"), index=False)
151
+ with open(os.path.join(REPORTS_DIR, "model_comparison.json"), "w") as f:
152
+ json.dump(rows, f, indent=4)
153
+
154
+ # --- Best model diagnostics ---
155
+ y_best = best_estimator.predict(X_test)
156
+ plot_cm(y_test, y_best, f"Confusion Matrix – {best_name}", os.path.join(PLOTS_DIR, "confusion_matrix.png"))
157
+ if hasattr(best_estimator, "predict_proba"):
158
+ y_prob = best_estimator.predict_proba(X_test)[:,1]
159
+ plot_roc(y_test, y_prob, f"ROC – {best_name}", os.path.join(PLOTS_DIR,"roc_curve.png"))
160
+
161
+ # Save best model
162
+ joblib.dump(best_estimator, os.path.join(MODEL_DIR, "best_model.pkl"))
163
+ print(f"\n[OK] Saved best model: {best_name} (F1={best_f1:.4f}) -> {MODEL_DIR}/best_model.pkl")
164
+
165
+ # ------------------------------
166
+ # Gradient analysis (loss & accuracy vs iterations) using SAGA
167
+ # ------------------------------
168
+ scaler = StandardScaler()
169
+ X_scaled = scaler.fit_transform(X_clean)
170
+ X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(
171
+ X_scaled, Y_clean, test_size=0.2, random_state=42, stratify=Y_clean
172
+ )
173
 
174
+ def track_training(penalty, max_iter=50):
175
+ clf = LogisticRegression(penalty=penalty, solver="saga", warm_start=True, max_iter=1, random_state=42)
176
+ losses, accs = [], []
177
+ for i in range(max_iter):
178
+ clf.fit(X_train_g, y_train_g)
179
+ y_pred = clf.predict_proba(X_train_g)
180
+ losses.append(log_loss(y_train_g, y_pred))
181
+ accs.append(accuracy_score(y_train_g, np.argmax(y_pred, axis=1)))
182
+ return losses, accs
183
+
184
+ loss_curves, acc_curves = {}, {}
185
+ loss_curves["L2"], acc_curves["L2"] = track_training("l2", max_iter=50)
186
+ loss_curves["L1"], acc_curves["L1"] = track_training("l1", max_iter=50)
187
+
188
+ lineplot_curves(loss_curves, ylabel="Log Loss", title="Logistic Regression – Loss vs Iterations",
189
+ save_path=os.path.join(PLOTS_DIR, "logreg_loss_curves.png"))
190
+ lineplot_curves(acc_curves, ylabel="Training Accuracy", title="Logistic Regression – Accuracy vs Iterations",
191
+ save_path=os.path.join(PLOTS_DIR, "logreg_accuracy_curves.png"))
192
+
193
+ print(f"[OK] Reports saved under: {REPORTS_DIR}")
194
+ barplot_metric(results_df, "Accuracy", os.path.join(PLOTS_DIR, "model_accuracy.png"), "Model Accuracy (tuned)")
195
+ barplot_metric(results_df, "F1", os.path.join(PLOTS_DIR, "model_f1.png"), "Model F1 (tuned)")
196
+ print(f"[OK] Plots saved -> {PLOTS_DIR}")