sathishleo committed on
Commit
a2c721f
·
1 Parent(s): 211a2cd

Add app.py, backend, and model for HF Space

Browse files
Files changed (2) hide show
  1. app.py +1 -1
  2. backend/train_model.py +57 -123
app.py CHANGED
@@ -78,7 +78,7 @@ if not os.path.exists(MODEL_PATH):
78
 
79
  if st.button("Train Model"):
80
  st.info("Training started...")
81
- model = train_model(MODEL_PATH, REPORTS_DIR, PLOTS_DIR)
82
  joblib.dump(model, MODEL_PATH)
83
  st.success(f"Model trained and saved to {MODEL_PATH}")
84
  elif page == "Predict":
 
78
 
79
  if st.button("Train Model"):
80
  st.info("Training started...")
81
+ model = train_model()
82
  joblib.dump(model, MODEL_PATH)
83
  st.success(f"Model trained and saved to {MODEL_PATH}")
84
  elif page == "Predict":
backend/train_model.py CHANGED
@@ -1,197 +1,131 @@
1
- import os, json, warnings
2
-
3
- from datasets import load_dataset
4
-
5
- warnings.filterwarnings("ignore")
6
  import numpy as np
7
  import pandas as pd
8
  import joblib
9
  from scipy import stats
10
  import matplotlib.pyplot as plt
11
- import seaborn as sns
12
- from sklearn.model_selection import train_test_split, GridSearchCV,StratifiedKFold
 
13
  from sklearn.pipeline import Pipeline
14
  from sklearn.preprocessing import StandardScaler
15
- from sklearn.metrics import (
16
- accuracy_score, f1_score, precision_score, recall_score,
17
- classification_report, log_loss
18
- )
19
  from sklearn.linear_model import LogisticRegression
20
  from sklearn.tree import DecisionTreeClassifier
21
  from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
22
- from .utils import (
23
- ensure_dirs, save_json, plot_cm, plot_roc, barplot_metric,
24
- lineplot_curves
25
- )
26
 
 
 
 
27
 
28
  # ------------------------------
29
- # Paths
30
  # ------------------------------
31
- # DATA_PATH = os.path.join(ROOT, "data", "diabetes.csv")
32
- # ROOT = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
33
- # DATA_PATH = os.path.join(ROOT, "data", "diabetes.csv")
34
- # Use shared folders at project root
35
- MODEL_DIR = "models" # volume for models
36
- REPORTS_DIR = "reports" # volume for reports
37
  PLOTS_DIR = os.path.join(REPORTS_DIR, "plots")
38
 
39
-
40
- # Make sure folders exist
41
  os.makedirs(MODEL_DIR, exist_ok=True)
42
  os.makedirs(REPORTS_DIR, exist_ok=True)
43
  os.makedirs(PLOTS_DIR, exist_ok=True)
44
 
45
 
46
- # def train_and_get_model():
47
- # """Train the model (or reload if exists) and return it"""
48
- # ------------------------------
49
- # Load data
50
- # ------------------------------
51
- # if not os.path.exists(DATA_PATH):
52
- # raise FileNotFoundError(f"Dataset not found at {DATA_PATH}")
53
-
54
- ### Load with hugging face dataset
55
- def train_model(MODEL_DIR, REPORTS_DIR, PLOTS_DIR):
56
  ds = load_dataset("jonathansuru/diabetes")
57
- df = ds['train'].to_pandas()
58
  X = df.drop("Outcome", axis=1)
59
  Y = df["Outcome"].astype(int)
60
  print(f"[INFO] Loaded dataset: {df.shape[0]} rows, {df.shape[1]} cols")
61
 
62
- # ------------------------------
63
- # Outlier removal (z-score)
64
- # ------------------------------
65
  z = np.abs(stats.zscore(X))
66
- non_outlier_mask = (z < 3).all(axis=1)
67
- X_clean = X[non_outlier_mask]
68
- Y_clean = Y[non_outlier_mask]
69
- print(f"[INFO] Outliers removed: {len(X) - len(X_clean)} | Clean size:{len(X_clean)}")
70
 
71
- # Variance comparison
72
  var_df = pd.DataFrame({"Before": X.var(), "After": X_clean.var()})
73
  var_df.to_csv(os.path.join(REPORTS_DIR, "variance_before_after.csv"))
74
  plt.figure(figsize=(10,5))
75
- var_df.plot(kind='bar')
76
  plt.title("Feature Variance: Before vs After Outlier Removal")
77
  plt.ylabel("Variance")
78
- plt.xticks(rotation=45, ha='right')
79
  plt.tight_layout()
80
- plt.savefig(os.path.join(PLOTS_DIR, "variance_comparison.png"), bbox_inches='tight')
81
  plt.close()
82
 
83
- # ------------------------------
84
- # Split
85
- # ------------------------------
86
  X_train, X_test, y_train, y_test = train_test_split(
87
  X_clean, Y_clean, test_size=0.2, random_state=42, stratify=Y_clean
88
  )
89
 
90
- # ------------------------------
91
- # Models + grids
92
- # ------------------------------
93
  cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
94
  models = {
95
- "LogReg_L1": Pipeline([
96
- ("scaler", StandardScaler()),
97
- ("clf", LogisticRegression(penalty="l1", solver="liblinear", max_iter=2000))
98
- ]),
99
- "LogReg_L2": Pipeline([
100
- ("scaler", StandardScaler()),
101
- ("clf", LogisticRegression(penalty="l2", solver="lbfgs", max_iter=2000))
102
- ]),
103
  "DecisionTree": DecisionTreeClassifier(random_state=42),
104
  "RandomForest": RandomForestClassifier(random_state=42),
105
- "BaggedDecisionTree": BaggingClassifier(
106
- estimator=DecisionTreeClassifier(random_state=42),
107
- n_estimators=50,
108
- random_state=42
109
- )
110
  }
 
111
  param_grids = {
112
  "LogReg_L1": {"clf__C": [0.01, 0.1, 1, 10]},
113
  "LogReg_L2": {"clf__C": [0.01, 0.1, 1, 10]},
114
  "DecisionTree": {"max_depth": [3,5,7,None], "min_samples_split": [2,5,10]},
115
  "RandomForest": {"n_estimators": [100,200], "max_depth": [None,5,10], "min_samples_split": [2,5]},
116
- "BaggedDecisionTree": {"n_estimators": [30,50,100]},
117
  }
118
 
119
- # ------------------------------
120
  # Grid search + evaluation
121
- # ------------------------------
122
  rows = []
123
  best_name, best_estimator, best_f1 = None, None, -1
124
-
125
  for name, model in models.items():
126
- print(f"\n[GRID] Tuning {name} …")
127
  gs = GridSearchCV(model, param_grids[name], scoring="f1", cv=cv, n_jobs=-1)
128
  gs.fit(X_train, y_train)
129
  y_pred = gs.best_estimator_.predict(X_test)
130
- acc = accuracy_score(y_test, y_pred)
131
- f1 = f1_score(y_test, y_pred)
132
- prec = precision_score(y_test, y_pred)
133
- rec = recall_score(y_test, y_pred)
134
  print(f"[GRID] {name} | best_params={gs.best_params_} | ACC={acc:.4f} F1={f1:.4f} P={prec:.4f} R={rec:.4f}")
135
- print(classification_report(y_test, y_pred, digits=4))
136
- rows.append({
137
- "Model": name,
138
- "BestParams": gs.best_params_,
139
- "Accuracy": acc,
140
- "F1": f1,
141
- "Precision": prec,
142
- "Recall": rec
143
- })
144
  if f1 > best_f1:
145
- best_f1 = f1
146
- best_estimator = gs.best_estimator_
147
- best_name = name
148
 
149
- # --- Save model comparison ---
150
  results_df = pd.DataFrame(rows).sort_values(by="F1", ascending=False)
151
  results_df.to_csv(os.path.join(REPORTS_DIR, "model_comparison.csv"), index=False)
152
  with open(os.path.join(REPORTS_DIR, "model_comparison.json"), "w") as f:
153
  json.dump(rows, f, indent=4)
154
 
155
- # --- Best model diagnostics ---
 
 
 
 
156
  y_best = best_estimator.predict(X_test)
157
  plot_cm(y_test, y_best, f"Confusion Matrix – {best_name}", os.path.join(PLOTS_DIR, "confusion_matrix.png"))
 
158
  if hasattr(best_estimator, "predict_proba"):
159
- y_prob = best_estimator.predict_proba(X_test)[:,1]
160
  plot_roc(y_test, y_prob, f"ROC – {best_name}", os.path.join(PLOTS_DIR,"roc_curve.png"))
161
 
162
  # Save best model
163
- joblib.dump(best_estimator, os.path.join(MODEL_DIR, "best_model.pkl"))
164
- print(f"\n[OK] Saved best model: {best_name} (F1={best_f1:.4f}) -> {MODEL_DIR}/best_model.pkl")
165
-
166
- # ------------------------------
167
- # Gradient analysis (loss & accuracy vs iterations) using SAGA
168
- # ------------------------------
169
- scaler = StandardScaler()
170
- X_scaled = scaler.fit_transform(X_clean)
171
- X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(
172
- X_scaled, Y_clean, test_size=0.2, random_state=42, stratify=Y_clean
173
- )
174
 
175
- def track_training(penalty, max_iter=50):
176
- clf = LogisticRegression(penalty=penalty, solver="saga", warm_start=True, max_iter=1, random_state=42)
177
- losses, accs = [], []
178
- for i in range(max_iter):
179
- clf.fit(X_train_g, y_train_g)
180
- y_pred = clf.predict_proba(X_train_g)
181
- losses.append(log_loss(y_train_g, y_pred))
182
- accs.append(accuracy_score(y_train_g, np.argmax(y_pred, axis=1)))
183
- return losses, accs
184
-
185
- loss_curves, acc_curves = {}, {}
186
- loss_curves["L2"], acc_curves["L2"] = track_training("l2", max_iter=50)
187
- loss_curves["L1"], acc_curves["L1"] = track_training("l1", max_iter=50)
188
-
189
- lineplot_curves(loss_curves, ylabel="Log Loss", title="Logistic Regression – Loss vs Iterations",
190
- save_path=os.path.join(PLOTS_DIR, "logreg_loss_curves.png"))
191
- lineplot_curves(acc_curves, ylabel="Training Accuracy", title="Logistic Regression – Accuracy vs Iterations",
192
- save_path=os.path.join(PLOTS_DIR, "logreg_accuracy_curves.png"))
193
-
194
- print(f"[OK] Reports saved under: {REPORTS_DIR}")
195
- barplot_metric(results_df, "Accuracy", os.path.join(PLOTS_DIR, "model_accuracy.png"), "Model Accuracy (tuned)")
196
- barplot_metric(results_df, "F1", os.path.join(PLOTS_DIR, "model_f1.png"), "Model F1 (tuned)")
197
- print(f"[OK] Plots saved -> {PLOTS_DIR}")
 
1
import os
import json
import warnings

import numpy as np
import pandas as pd
import joblib
from scipy import stats
import matplotlib.pyplot as plt
from datasets import load_dataset

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier

from utils import ensure_dirs, save_json, plot_cm, plot_roc, barplot_metric, lineplot_curves

# Keep the Space logs readable by silencing library warnings.
warnings.filterwarnings("ignore")

# ------------------------------
# Base paths
# ------------------------------
# NOTE(review): os.getcwd() is assumed to be the repo root when the Space
# starts — confirm against the Space's launch configuration.
BASE_DIR = os.getcwd()
MODEL_DIR = os.path.join(BASE_DIR, "models")
REPORTS_DIR = os.path.join(BASE_DIR, "reports")
PLOTS_DIR = os.path.join(REPORTS_DIR, "plots")

# Create every output folder up front so later writes cannot fail on a
# missing directory.
for _out_dir in (MODEL_DIR, REPORTS_DIR, PLOTS_DIR):
    os.makedirs(_out_dir, exist_ok=True)
def train_model():
    """Train, tune, and persist the best diabetes classifier.

    Loads the "jonathansuru/diabetes" dataset from the Hugging Face Hub,
    drops z-score outliers, grid-searches several classifiers with 5-fold
    stratified CV, writes comparison reports and plots under REPORTS_DIR /
    PLOTS_DIR, saves the best estimator (highest hold-out F1) to
    ``MODEL_DIR/best_model.pkl``, and returns it.

    Returns:
        The fitted scikit-learn estimator with the best test-set F1 score.
    """
    # ------------------------------
    # Load data
    # ------------------------------
    ds = load_dataset("jonathansuru/diabetes")
    df = ds["train"].to_pandas()
    X = df.drop("Outcome", axis=1)
    Y = df["Outcome"].astype(int)
    print(f"[INFO] Loaded dataset: {df.shape[0]} rows, {df.shape[1]} cols")

    # ------------------------------
    # Outlier removal: keep rows with |z| < 3 on every feature
    # ------------------------------
    z = np.abs(stats.zscore(X))
    mask = (z < 3).all(axis=1)
    X_clean, Y_clean = X[mask], Y[mask]
    print(f"[INFO] Outliers removed: {len(X) - len(X_clean)} | Clean size: {len(X_clean)}")

    # ------------------------------
    # Variance before/after report + plot
    # ------------------------------
    var_df = pd.DataFrame({"Before": X.var(), "After": X_clean.var()})
    var_df.to_csv(os.path.join(REPORTS_DIR, "variance_before_after.csv"))
    # FIX: DataFrame.plot() creates its own figure by default, so the previous
    # plt.figure(figsize=(10, 5)) was silently ignored and leaked an empty
    # figure. Draw onto an explicit axes instead.
    fig, ax = plt.subplots(figsize=(10, 5))
    var_df.plot(kind="bar", ax=ax)
    ax.set_title("Feature Variance: Before vs After Outlier Removal")
    ax.set_ylabel("Variance")
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right")
    fig.tight_layout()
    fig.savefig(os.path.join(PLOTS_DIR, "variance_comparison.png"), bbox_inches="tight")
    plt.close(fig)

    # ------------------------------
    # Train/test split (stratified to preserve class balance)
    # ------------------------------
    X_train, X_test, y_train, y_test = train_test_split(
        X_clean, Y_clean, test_size=0.2, random_state=42, stratify=Y_clean
    )

    # ------------------------------
    # Candidate models and their hyper-parameter grids
    # ------------------------------
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    models = {
        "LogReg_L1": Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(penalty="l1", solver="liblinear", max_iter=2000)),
        ]),
        "LogReg_L2": Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(penalty="l2", solver="lbfgs", max_iter=2000)),
        ]),
        "DecisionTree": DecisionTreeClassifier(random_state=42),
        "RandomForest": RandomForestClassifier(random_state=42),
        "BaggedDecisionTree": BaggingClassifier(
            DecisionTreeClassifier(random_state=42), n_estimators=50, random_state=42
        ),
    }
    param_grids = {
        "LogReg_L1": {"clf__C": [0.01, 0.1, 1, 10]},
        "LogReg_L2": {"clf__C": [0.01, 0.1, 1, 10]},
        "DecisionTree": {"max_depth": [3, 5, 7, None], "min_samples_split": [2, 5, 10]},
        "RandomForest": {"n_estimators": [100, 200], "max_depth": [None, 5, 10], "min_samples_split": [2, 5]},
        "BaggedDecisionTree": {"n_estimators": [30, 50, 100]},
    }

    # ------------------------------
    # Grid search + hold-out evaluation
    # ------------------------------
    rows = []
    best_name, best_estimator, best_f1 = None, None, -1
    for name, model in models.items():
        print(f"[GRID] Tuning {name} …")
        gs = GridSearchCV(model, param_grids[name], scoring="f1", cv=cv, n_jobs=-1)
        gs.fit(X_train, y_train)
        y_pred = gs.best_estimator_.predict(X_test)
        acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
        prec, rec = precision_score(y_test, y_pred), recall_score(y_test, y_pred)
        print(f"[GRID] {name} | best_params={gs.best_params_} | ACC={acc:.4f} F1={f1:.4f} P={prec:.4f} R={rec:.4f}")
        rows.append({"Model": name, "BestParams": gs.best_params_, "Accuracy": acc, "F1": f1,
                     "Precision": prec, "Recall": rec})
        # Track the winner by hold-out F1.
        if f1 > best_f1:
            best_f1, best_estimator, best_name = f1, gs.best_estimator_, name

    # ------------------------------
    # Persist the comparison table (CSV + JSON) and metric bar plots
    # ------------------------------
    results_df = pd.DataFrame(rows).sort_values(by="F1", ascending=False)
    results_df.to_csv(os.path.join(REPORTS_DIR, "model_comparison.csv"), index=False)
    with open(os.path.join(REPORTS_DIR, "model_comparison.json"), "w") as f:
        json.dump(rows, f, indent=4)
    barplot_metric(results_df, "Accuracy", os.path.join(PLOTS_DIR, "model_accuracy.png"), "Model Accuracy (tuned)")
    barplot_metric(results_df, "F1", os.path.join(PLOTS_DIR, "model_f1.png"), "Model F1 (tuned)")

    # ------------------------------
    # Best-model diagnostics (confusion matrix; ROC when probabilities exist)
    # ------------------------------
    y_best = best_estimator.predict(X_test)
    plot_cm(y_test, y_best, f"Confusion Matrix – {best_name}", os.path.join(PLOTS_DIR, "confusion_matrix.png"))
    if hasattr(best_estimator, "predict_proba"):
        y_prob = best_estimator.predict_proba(X_test)[:, 1]
        plot_roc(y_test, y_prob, f"ROC – {best_name}", os.path.join(PLOTS_DIR, "roc_curve.png"))

    # Save best model
    model_path = os.path.join(MODEL_DIR, "best_model.pkl")
    joblib.dump(best_estimator, model_path)

    print(f"[OK] Best model ({best_name}) saved with F1={best_f1:.4f}")
    print(f"[OK] All plots saved -> {PLOTS_DIR}")
    print(f"[OK] Reports saved -> {REPORTS_DIR}")

    return best_estimator