sathishleo committed on
Commit
2419e97
·
1 Parent(s): 59ebef0

Add app.py, backend, and model for HF Space

Browse files
Files changed (1) hide show
  1. backend/train_model.py +60 -49
backend/train_model.py CHANGED
@@ -1,17 +1,20 @@
1
  import os
2
  import json
3
  import warnings
 
4
  import numpy as np
5
  import pandas as pd
6
  import joblib
7
- from scipy import stats
8
  import matplotlib.pyplot as plt
9
  from datasets import load_dataset
 
10
 
11
  from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
12
  from sklearn.pipeline import Pipeline
13
  from sklearn.preprocessing import StandardScaler
14
- from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
 
 
15
  from sklearn.linear_model import LogisticRegression
16
  from sklearn.tree import DecisionTreeClassifier
17
  from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
@@ -35,24 +38,27 @@ os.makedirs(PLOTS_DIR, exist_ok=True)
35
 
36
 
37
  def train_model():
 
38
  # Load dataset
 
39
  ds = load_dataset("jonathansuru/diabetes")
40
  df = ds["train"].to_pandas()
41
  X = df.drop("Outcome", axis=1)
42
  Y = df["Outcome"].astype(int)
43
  print(f"[INFO] Loaded dataset: {df.shape[0]} rows, {df.shape[1]} cols")
44
 
 
45
  # Outlier removal
 
46
  z = np.abs(stats.zscore(X))
47
  mask = (z < 3).all(axis=1)
48
  X_clean, Y_clean = X[mask], Y[mask]
49
  print(f"[INFO] Outliers removed: {len(X) - len(X_clean)} | Clean size: {len(X_clean)}")
50
 
51
- # Save variance comparison
52
  var_df = pd.DataFrame({"Before": X.var(), "After": X_clean.var()})
53
  var_df.to_csv(os.path.join(REPORTS_DIR, "variance_before_after.csv"))
54
- plt.figure(figsize=(10,5))
55
- var_df.plot(kind="bar")
56
  plt.title("Feature Variance: Before vs After Outlier Removal")
57
  plt.ylabel("Variance")
58
  plt.xticks(rotation=45, ha="right")
@@ -60,33 +66,44 @@ def train_model():
60
  plt.savefig(os.path.join(PLOTS_DIR, "variance_comparison.png"), bbox_inches="tight")
61
  plt.close()
62
 
 
63
  # Train/test split
 
64
  X_train, X_test, y_train, y_test = train_test_split(
65
  X_clean, Y_clean, test_size=0.2, random_state=42, stratify=Y_clean
66
  )
67
 
68
- # Models and parameter grids
 
 
69
  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
70
  models = {
71
- "LogReg_L1": Pipeline([("scaler", StandardScaler()),
72
- ("clf", LogisticRegression(penalty="l1", solver="liblinear", max_iter=2000))]),
73
- "LogReg_L2": Pipeline([("scaler", StandardScaler()),
74
- ("clf", LogisticRegression(penalty="l2", solver="lbfgs", max_iter=2000))]),
 
 
 
 
75
  "DecisionTree": DecisionTreeClassifier(random_state=42),
76
  "RandomForest": RandomForestClassifier(random_state=42),
77
- "BaggedDecisionTree": BaggingClassifier(DecisionTreeClassifier(random_state=42),
78
- n_estimators=50, random_state=42)
 
79
  }
80
 
81
  param_grids = {
82
  "LogReg_L1": {"clf__C": [0.01, 0.1, 1, 10]},
83
  "LogReg_L2": {"clf__C": [0.01, 0.1, 1, 10]},
84
- "DecisionTree": {"max_depth": [3,5,7,None], "min_samples_split": [2,5,10]},
85
- "RandomForest": {"n_estimators": [100,200], "max_depth": [None,5,10], "min_samples_split": [2,5]},
86
- "BaggedDecisionTree": {"n_estimators": [30,50,100]}
87
  }
88
 
 
89
  # Grid search + evaluation
 
90
  rows = []
91
  best_name, best_estimator, best_f1 = None, None, -1
92
  for name, model in models.items():
@@ -94,11 +111,22 @@ def train_model():
94
  gs = GridSearchCV(model, param_grids[name], scoring="f1", cv=cv, n_jobs=-1)
95
  gs.fit(X_train, y_train)
96
  y_pred = gs.best_estimator_.predict(X_test)
97
- acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
98
- prec, rec = precision_score(y_test, y_pred), recall_score(y_test, y_pred)
 
 
 
 
99
  print(f"[GRID] {name} | best_params={gs.best_params_} | ACC={acc:.4f} F1={f1:.4f} P={prec:.4f} R={rec:.4f}")
100
- rows.append({"Model": name, "BestParams": gs.best_params_, "Accuracy": acc, "F1": f1,
101
- "Precision": prec, "Recall": rec})
 
 
 
 
 
 
 
102
  if f1 > best_f1:
103
  best_f1, best_estimator, best_name = f1, gs.best_estimator_, name
104
 
@@ -112,28 +140,23 @@ def train_model():
112
  barplot_metric(results_df, "Accuracy", os.path.join(PLOTS_DIR, "model_accuracy.png"), "Model Accuracy (tuned)")
113
  barplot_metric(results_df, "F1", os.path.join(PLOTS_DIR, "model_f1.png"), "Model F1 (tuned)")
114
 
 
115
  # Best model diagnostics
 
116
  y_best = best_estimator.predict(X_test)
117
  plot_cm(y_test, y_best, f"Confusion Matrix – {best_name}", os.path.join(PLOTS_DIR, "confusion_matrix.png"))
118
 
119
  if hasattr(best_estimator, "predict_proba"):
120
  y_prob = best_estimator.predict_proba(X_test)[:, 1]
121
- plot_roc(y_test, y_prob, f"ROC – {best_name}", os.path.join(PLOTS_DIR,"roc_curve.png"))
122
 
123
  # Save best model
124
- model_path = os.path.join(MODEL_DIR, "best_model.pkl")
125
- joblib.dump(best_estimator, model_path)
126
-
127
  print(f"[OK] Best model ({best_name}) saved with F1={best_f1:.4f}")
128
- print(f"[OK] All plots saved -> {PLOTS_DIR}")
129
- print(f"[OK] Reports saved -> {REPORTS_DIR}")
130
- from sklearn.preprocessing import StandardScaler
131
- from sklearn.linear_model import LogisticRegression
132
- from sklearn.metrics import log_loss, accuracy_score
133
- import numpy as np
134
- import os
135
 
136
- # Scale data
 
 
137
  scaler = StandardScaler()
138
  X_scaled = scaler.fit_transform(X_clean)
139
  X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(
@@ -144,33 +167,29 @@ def train_model():
144
  clf = LogisticRegression(
145
  penalty=penalty,
146
  solver="saga",
147
- warm_start=True, # allows continuing training
148
- max_iter=1, # train one step at a time
149
  random_state=42
150
  )
151
 
152
  losses, accs = [], []
153
- for i in range(max_iter):
154
- clf.fit(X_train_g, y_train_g) # trains 1 iteration per loop
155
  y_pred = clf.predict_proba(X_train_g)
156
  losses.append(log_loss(y_train_g, y_pred))
157
  accs.append(accuracy_score(y_train_g, np.argmax(y_pred, axis=1)))
158
-
159
  return losses, accs
160
 
161
- # Collect curves
162
  loss_curves, acc_curves = {}, {}
163
  loss_curves["L2"], acc_curves["L2"] = track_training("l2", max_iter=50)
164
  loss_curves["L1"], acc_curves["L1"] = track_training("l1", max_iter=50)
165
 
166
- # Plot curves
167
  lineplot_curves(
168
  loss_curves,
169
  ylabel="Log Loss",
170
  title="Logistic Regression – Loss vs Iterations",
171
  save_path=os.path.join(PLOTS_DIR, "logreg_loss_curves.png")
172
  )
173
-
174
  lineplot_curves(
175
  acc_curves,
176
  ylabel="Training Accuracy",
@@ -178,15 +197,7 @@ def train_model():
178
  save_path=os.path.join(PLOTS_DIR, "logreg_accuracy_curves.png")
179
  )
180
 
181
- print(f"[OK] Reports saved under: {REPORTS_DIR}")
182
- # Accuracy and F1 bar plots
183
- # barplot_metric(results_df, "Accuracy", os.path.join(PLOTS_DIR, "model_accuracy.png"), "Model Accuracy (tuned)")
184
- # barplot_metric(results_df, "F1", os.path.join(PLOTS_DIR, "model_f1.png"), "Model F1 (tuned)")
185
- # plt.savefig(os.path.join(PLOTS_DIR, "variance_comparison.png"), bbox_inches='tight')
186
- # plt.close()
187
- barplot_metric(results_df, "Accuracy", os.path.join(PLOTS_DIR, "model_accuracy.png"), "Model Accuracy (tuned)")
188
- barplot_metric(results_df, "F1", os.path.join(PLOTS_DIR, "model_f1.png"), "Model F1 (tuned)")
189
-
190
- print(f"[OK] Plots saved -> {PLOTS_DIR}")
191
 
192
  return best_estimator
 
1
  import os
2
  import json
3
  import warnings
4
+
5
  import numpy as np
6
  import pandas as pd
7
  import joblib
 
8
  import matplotlib.pyplot as plt
9
  from datasets import load_dataset
10
+ from scipy import stats
11
 
12
  from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
13
  from sklearn.pipeline import Pipeline
14
  from sklearn.preprocessing import StandardScaler
15
+ from sklearn.metrics import (
16
+ accuracy_score, f1_score, precision_score, recall_score, classification_report, log_loss
17
+ )
18
  from sklearn.linear_model import LogisticRegression
19
  from sklearn.tree import DecisionTreeClassifier
20
  from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
 
38
 
39
 
40
  def train_model():
41
+ # ------------------------------
42
  # Load dataset
43
+ # ------------------------------
44
  ds = load_dataset("jonathansuru/diabetes")
45
  df = ds["train"].to_pandas()
46
  X = df.drop("Outcome", axis=1)
47
  Y = df["Outcome"].astype(int)
48
  print(f"[INFO] Loaded dataset: {df.shape[0]} rows, {df.shape[1]} cols")
49
 
50
+ # ------------------------------
51
  # Outlier removal
52
+ # ------------------------------
53
  z = np.abs(stats.zscore(X))
54
  mask = (z < 3).all(axis=1)
55
  X_clean, Y_clean = X[mask], Y[mask]
56
  print(f"[INFO] Outliers removed: {len(X) - len(X_clean)} | Clean size: {len(X_clean)}")
57
 
58
+ # Save variance comparison plot
59
  var_df = pd.DataFrame({"Before": X.var(), "After": X_clean.var()})
60
  var_df.to_csv(os.path.join(REPORTS_DIR, "variance_before_after.csv"))
61
+ var_df.plot(kind="bar", figsize=(10, 5))
 
62
  plt.title("Feature Variance: Before vs After Outlier Removal")
63
  plt.ylabel("Variance")
64
  plt.xticks(rotation=45, ha="right")
 
66
  plt.savefig(os.path.join(PLOTS_DIR, "variance_comparison.png"), bbox_inches="tight")
67
  plt.close()
68
 
69
+ # ------------------------------
70
  # Train/test split
71
+ # ------------------------------
72
  X_train, X_test, y_train, y_test = train_test_split(
73
  X_clean, Y_clean, test_size=0.2, random_state=42, stratify=Y_clean
74
  )
75
 
76
+ # ------------------------------
77
+ # Models and hyperparameters
78
+ # ------------------------------
79
  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
80
  models = {
81
+ "LogReg_L1": Pipeline([
82
+ ("scaler", StandardScaler()),
83
+ ("clf", LogisticRegression(penalty="l1", solver="liblinear", max_iter=2000))
84
+ ]),
85
+ "LogReg_L2": Pipeline([
86
+ ("scaler", StandardScaler()),
87
+ ("clf", LogisticRegression(penalty="l2", solver="lbfgs", max_iter=2000))
88
+ ]),
89
  "DecisionTree": DecisionTreeClassifier(random_state=42),
90
  "RandomForest": RandomForestClassifier(random_state=42),
91
+ "BaggedDecisionTree": BaggingClassifier(
92
+ DecisionTreeClassifier(random_state=42), n_estimators=50, random_state=42
93
+ )
94
  }
95
 
96
  param_grids = {
97
  "LogReg_L1": {"clf__C": [0.01, 0.1, 1, 10]},
98
  "LogReg_L2": {"clf__C": [0.01, 0.1, 1, 10]},
99
+ "DecisionTree": {"max_depth": [3, 5, 7, None], "min_samples_split": [2, 5, 10]},
100
+ "RandomForest": {"n_estimators": [100, 200], "max_depth": [None, 5, 10], "min_samples_split": [2, 5]},
101
+ "BaggedDecisionTree": {"n_estimators": [30, 50, 100]}
102
  }
103
 
104
+ # ------------------------------
105
  # Grid search + evaluation
106
+ # ------------------------------
107
  rows = []
108
  best_name, best_estimator, best_f1 = None, None, -1
109
  for name, model in models.items():
 
111
  gs = GridSearchCV(model, param_grids[name], scoring="f1", cv=cv, n_jobs=-1)
112
  gs.fit(X_train, y_train)
113
  y_pred = gs.best_estimator_.predict(X_test)
114
+
115
+ acc = accuracy_score(y_test, y_pred)
116
+ f1 = f1_score(y_test, y_pred)
117
+ prec = precision_score(y_test, y_pred)
118
+ rec = recall_score(y_test, y_pred)
119
+
120
  print(f"[GRID] {name} | best_params={gs.best_params_} | ACC={acc:.4f} F1={f1:.4f} P={prec:.4f} R={rec:.4f}")
121
+ rows.append({
122
+ "Model": name,
123
+ "BestParams": gs.best_params_,
124
+ "Accuracy": acc,
125
+ "F1": f1,
126
+ "Precision": prec,
127
+ "Recall": rec
128
+ })
129
+
130
  if f1 > best_f1:
131
  best_f1, best_estimator, best_name = f1, gs.best_estimator_, name
132
 
 
140
  barplot_metric(results_df, "Accuracy", os.path.join(PLOTS_DIR, "model_accuracy.png"), "Model Accuracy (tuned)")
141
  barplot_metric(results_df, "F1", os.path.join(PLOTS_DIR, "model_f1.png"), "Model F1 (tuned)")
142
 
143
+ # ------------------------------
144
  # Best model diagnostics
145
+ # ------------------------------
146
  y_best = best_estimator.predict(X_test)
147
  plot_cm(y_test, y_best, f"Confusion Matrix – {best_name}", os.path.join(PLOTS_DIR, "confusion_matrix.png"))
148
 
149
  if hasattr(best_estimator, "predict_proba"):
150
  y_prob = best_estimator.predict_proba(X_test)[:, 1]
151
+ plot_roc(y_test, y_prob, f"ROC – {best_name}", os.path.join(PLOTS_DIR, "roc_curve.png"))
152
 
153
  # Save best model
154
+ joblib.dump(best_estimator, os.path.join(MODEL_DIR, "best_model.pkl"))
 
 
155
  print(f"[OK] Best model ({best_name}) saved with F1={best_f1:.4f}")
 
 
 
 
 
 
 
156
 
157
+ # ------------------------------
158
+ # Logistic Regression loss/accuracy curves
159
+ # ------------------------------
160
  scaler = StandardScaler()
161
  X_scaled = scaler.fit_transform(X_clean)
162
  X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(
 
167
  clf = LogisticRegression(
168
  penalty=penalty,
169
  solver="saga",
170
+ warm_start=True,
171
+ max_iter=1,
172
  random_state=42
173
  )
174
 
175
  losses, accs = [], []
176
+ for _ in range(max_iter):
177
+ clf.fit(X_train_g, y_train_g)
178
  y_pred = clf.predict_proba(X_train_g)
179
  losses.append(log_loss(y_train_g, y_pred))
180
  accs.append(accuracy_score(y_train_g, np.argmax(y_pred, axis=1)))
 
181
  return losses, accs
182
 
 
183
  loss_curves, acc_curves = {}, {}
184
  loss_curves["L2"], acc_curves["L2"] = track_training("l2", max_iter=50)
185
  loss_curves["L1"], acc_curves["L1"] = track_training("l1", max_iter=50)
186
 
 
187
  lineplot_curves(
188
  loss_curves,
189
  ylabel="Log Loss",
190
  title="Logistic Regression – Loss vs Iterations",
191
  save_path=os.path.join(PLOTS_DIR, "logreg_loss_curves.png")
192
  )
 
193
  lineplot_curves(
194
  acc_curves,
195
  ylabel="Training Accuracy",
 
197
  save_path=os.path.join(PLOTS_DIR, "logreg_accuracy_curves.png")
198
  )
199
 
200
+ print(f"[OK] All plots saved -> {PLOTS_DIR}")
201
+ print(f"[OK] Reports saved -> {REPORTS_DIR}")
 
 
 
 
 
 
 
 
202
 
203
  return best_estimator