Spaces:

sathishleo
/

mlmodels

Sleeping

App Files Files Community

sathishleo committed on Aug 24, 2025

Commit

2419e97

1 Parent(s): 59ebef0

Add app.py, backend, and model for HF Space

Browse files

Files changed (1) hide show

backend/train_model.py +60 -49

backend/train_model.py CHANGED Viewed

@@ -1,17 +1,20 @@
 import os
 import json
 import warnings
 import numpy as np
 import pandas as pd
 import joblib
-from scipy import stats
 import matplotlib.pyplot as plt
 from datasets import load_dataset
 from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
-from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report
 from sklearn.linear_model import LogisticRegression
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
@@ -35,24 +38,27 @@ os.makedirs(PLOTS_DIR, exist_ok=True)
 def train_model():
     # Load dataset
     ds = load_dataset("jonathansuru/diabetes")
     df = ds["train"].to_pandas()
     X = df.drop("Outcome", axis=1)
     Y = df["Outcome"].astype(int)
     print(f"[INFO] Loaded dataset: {df.shape[0]} rows, {df.shape[1]} cols")
     # Outlier removal
     z = np.abs(stats.zscore(X))
     mask = (z < 3).all(axis=1)
     X_clean, Y_clean = X[mask], Y[mask]
     print(f"[INFO] Outliers removed: {len(X) - len(X_clean)} | Clean size: {len(X_clean)}")
-    # Save variance comparison
     var_df = pd.DataFrame({"Before": X.var(), "After": X_clean.var()})
     var_df.to_csv(os.path.join(REPORTS_DIR, "variance_before_after.csv"))
-    plt.figure(figsize=(10,5))
-    var_df.plot(kind="bar")
     plt.title("Feature Variance: Before vs After Outlier Removal")
     plt.ylabel("Variance")
     plt.xticks(rotation=45, ha="right")
@@ -60,33 +66,44 @@ def train_model():
     plt.savefig(os.path.join(PLOTS_DIR, "variance_comparison.png"), bbox_inches="tight")
     plt.close()
     # Train/test split
     X_train, X_test, y_train, y_test = train_test_split(
         X_clean, Y_clean, test_size=0.2, random_state=42, stratify=Y_clean
     )
-    # Models and parameter grids
     cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
     models = {
-        "LogReg_L1": Pipeline([("scaler", StandardScaler()),
-                               ("clf", LogisticRegression(penalty="l1", solver="liblinear", max_iter=2000))]),
-        "LogReg_L2": Pipeline([("scaler", StandardScaler()),
-                               ("clf", LogisticRegression(penalty="l2", solver="lbfgs", max_iter=2000))]),
         "DecisionTree": DecisionTreeClassifier(random_state=42),
         "RandomForest": RandomForestClassifier(random_state=42),
-        "BaggedDecisionTree": BaggingClassifier(DecisionTreeClassifier(random_state=42),
-                                                n_estimators=50, random_state=42)
     }
     param_grids = {
         "LogReg_L1": {"clf__C": [0.01, 0.1, 1, 10]},
         "LogReg_L2": {"clf__C": [0.01, 0.1, 1, 10]},
-        "DecisionTree": {"max_depth": [3,5,7,None], "min_samples_split": [2,5,10]},
-        "RandomForest": {"n_estimators": [100,200], "max_depth": [None,5,10], "min_samples_split": [2,5]},
-        "BaggedDecisionTree": {"n_estimators": [30,50,100]}
     }
     # Grid search + evaluation
     rows = []
     best_name, best_estimator, best_f1 = None, None, -1
     for name, model in models.items():
@@ -94,11 +111,22 @@ def train_model():
         gs = GridSearchCV(model, param_grids[name], scoring="f1", cv=cv, n_jobs=-1)
         gs.fit(X_train, y_train)
         y_pred = gs.best_estimator_.predict(X_test)
-        acc, f1 = accuracy_score(y_test, y_pred), f1_score(y_test, y_pred)
-        prec, rec = precision_score(y_test, y_pred), recall_score(y_test, y_pred)
         print(f"[GRID] {name} | best_params={gs.best_params_} | ACC={acc:.4f} F1={f1:.4f} P={prec:.4f} R={rec:.4f}")
-        rows.append({"Model": name, "BestParams": gs.best_params_, "Accuracy": acc, "F1": f1,
-                     "Precision": prec, "Recall": rec})
         if f1 > best_f1:
             best_f1, best_estimator, best_name = f1, gs.best_estimator_, name
@@ -112,28 +140,23 @@ def train_model():
     barplot_metric(results_df, "Accuracy", os.path.join(PLOTS_DIR, "model_accuracy.png"), "Model Accuracy (tuned)")
     barplot_metric(results_df, "F1", os.path.join(PLOTS_DIR, "model_f1.png"), "Model F1 (tuned)")
     # Best model diagnostics
     y_best = best_estimator.predict(X_test)
     plot_cm(y_test, y_best, f"Confusion Matrix – {best_name}", os.path.join(PLOTS_DIR, "confusion_matrix.png"))
     if hasattr(best_estimator, "predict_proba"):
         y_prob = best_estimator.predict_proba(X_test)[:, 1]
-        plot_roc(y_test, y_prob, f"ROC – {best_name}", os.path.join(PLOTS_DIR,"roc_curve.png"))
     # Save best model
-    model_path = os.path.join(MODEL_DIR, "best_model.pkl")
-    joblib.dump(best_estimator, model_path)
     print(f"[OK] Best model ({best_name}) saved with F1={best_f1:.4f}")
-    print(f"[OK] All plots saved -> {PLOTS_DIR}")
-    print(f"[OK] Reports saved -> {REPORTS_DIR}")
-    from sklearn.preprocessing import StandardScaler
-    from sklearn.linear_model import LogisticRegression
-    from sklearn.metrics import log_loss, accuracy_score
-    import numpy as np
-    import os
-    # Scale data
     scaler = StandardScaler()
     X_scaled = scaler.fit_transform(X_clean)
     X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(
@@ -144,33 +167,29 @@ def train_model():
         clf = LogisticRegression(
             penalty=penalty,
             solver="saga",
-            warm_start=True,  # allows continuing training
-            max_iter=1,  # train one step at a time
             random_state=42
         )
         losses, accs = [], []
-        for i in range(max_iter):
-            clf.fit(X_train_g, y_train_g)  # trains 1 iteration per loop
             y_pred = clf.predict_proba(X_train_g)
             losses.append(log_loss(y_train_g, y_pred))
             accs.append(accuracy_score(y_train_g, np.argmax(y_pred, axis=1)))
         return losses, accs
-    # Collect curves
     loss_curves, acc_curves = {}, {}
     loss_curves["L2"], acc_curves["L2"] = track_training("l2", max_iter=50)
     loss_curves["L1"], acc_curves["L1"] = track_training("l1", max_iter=50)
-    # Plot curves
     lineplot_curves(
         loss_curves,
         ylabel="Log Loss",
         title="Logistic Regression – Loss vs Iterations",
         save_path=os.path.join(PLOTS_DIR, "logreg_loss_curves.png")
     )
     lineplot_curves(
         acc_curves,
         ylabel="Training Accuracy",
@@ -178,15 +197,7 @@ def train_model():
         save_path=os.path.join(PLOTS_DIR, "logreg_accuracy_curves.png")
     )
-    print(f"[OK] Reports saved under: {REPORTS_DIR}")
-    # Accuracy and F1 bar plots
-    # barplot_metric(results_df, "Accuracy", os.path.join(PLOTS_DIR, "model_accuracy.png"), "Model Accuracy (tuned)")
-    # barplot_metric(results_df, "F1", os.path.join(PLOTS_DIR, "model_f1.png"), "Model F1 (tuned)")
-    # plt.savefig(os.path.join(PLOTS_DIR, "variance_comparison.png"), bbox_inches='tight')
-    # plt.close()
-    barplot_metric(results_df, "Accuracy", os.path.join(PLOTS_DIR, "model_accuracy.png"), "Model Accuracy (tuned)")
-    barplot_metric(results_df, "F1", os.path.join(PLOTS_DIR, "model_f1.png"), "Model F1 (tuned)")
-    print(f"[OK] Plots saved -> {PLOTS_DIR}")
     return best_estimator

 import os
 import json
 import warnings
 import numpy as np
 import pandas as pd
 import joblib
 import matplotlib.pyplot as plt
 from datasets import load_dataset
+from scipy import stats
 from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
 from sklearn.pipeline import Pipeline
 from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import (
+    accuracy_score, f1_score, precision_score, recall_score, classification_report, log_loss
+)
 from sklearn.linear_model import LogisticRegression
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
 def train_model():
+    # ------------------------------
     # Load dataset
+    # ------------------------------
     ds = load_dataset("jonathansuru/diabetes")
     df = ds["train"].to_pandas()
     X = df.drop("Outcome", axis=1)
     Y = df["Outcome"].astype(int)
     print(f"[INFO] Loaded dataset: {df.shape[0]} rows, {df.shape[1]} cols")
+    # ------------------------------
     # Outlier removal
+    # ------------------------------
     z = np.abs(stats.zscore(X))
     mask = (z < 3).all(axis=1)
     X_clean, Y_clean = X[mask], Y[mask]
     print(f"[INFO] Outliers removed: {len(X) - len(X_clean)} | Clean size: {len(X_clean)}")
+    # Save variance comparison plot
     var_df = pd.DataFrame({"Before": X.var(), "After": X_clean.var()})
     var_df.to_csv(os.path.join(REPORTS_DIR, "variance_before_after.csv"))
+    var_df.plot(kind="bar", figsize=(10, 5))
     plt.title("Feature Variance: Before vs After Outlier Removal")
     plt.ylabel("Variance")
     plt.xticks(rotation=45, ha="right")
     plt.savefig(os.path.join(PLOTS_DIR, "variance_comparison.png"), bbox_inches="tight")
     plt.close()
+    # ------------------------------
     # Train/test split
+    # ------------------------------
     X_train, X_test, y_train, y_test = train_test_split(
         X_clean, Y_clean, test_size=0.2, random_state=42, stratify=Y_clean
     )
+    # ------------------------------
+    # Models and hyperparameters
+    # ------------------------------
     cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
     models = {
+        "LogReg_L1": Pipeline([
+            ("scaler", StandardScaler()),
+            ("clf", LogisticRegression(penalty="l1", solver="liblinear", max_iter=2000))
+        ]),
+        "LogReg_L2": Pipeline([
+            ("scaler", StandardScaler()),
+            ("clf", LogisticRegression(penalty="l2", solver="lbfgs", max_iter=2000))
+        ]),
         "DecisionTree": DecisionTreeClassifier(random_state=42),
         "RandomForest": RandomForestClassifier(random_state=42),
+        "BaggedDecisionTree": BaggingClassifier(
+            DecisionTreeClassifier(random_state=42), n_estimators=50, random_state=42
+        )
     }
     param_grids = {
         "LogReg_L1": {"clf__C": [0.01, 0.1, 1, 10]},
         "LogReg_L2": {"clf__C": [0.01, 0.1, 1, 10]},
+        "DecisionTree": {"max_depth": [3, 5, 7, None], "min_samples_split": [2, 5, 10]},
+        "RandomForest": {"n_estimators": [100, 200], "max_depth": [None, 5, 10], "min_samples_split": [2, 5]},
+        "BaggedDecisionTree": {"n_estimators": [30, 50, 100]}
     }
+    # ------------------------------
     # Grid search + evaluation
+    # ------------------------------
     rows = []
     best_name, best_estimator, best_f1 = None, None, -1
     for name, model in models.items():
         gs = GridSearchCV(model, param_grids[name], scoring="f1", cv=cv, n_jobs=-1)
         gs.fit(X_train, y_train)
         y_pred = gs.best_estimator_.predict(X_test)
+        acc = accuracy_score(y_test, y_pred)
+        f1 = f1_score(y_test, y_pred)
+        prec = precision_score(y_test, y_pred)
+        rec = recall_score(y_test, y_pred)
         print(f"[GRID] {name} | best_params={gs.best_params_} | ACC={acc:.4f} F1={f1:.4f} P={prec:.4f} R={rec:.4f}")
+        rows.append({
+            "Model": name,
+            "BestParams": gs.best_params_,
+            "Accuracy": acc,
+            "F1": f1,
+            "Precision": prec,
+            "Recall": rec
+        })
         if f1 > best_f1:
             best_f1, best_estimator, best_name = f1, gs.best_estimator_, name
     barplot_metric(results_df, "Accuracy", os.path.join(PLOTS_DIR, "model_accuracy.png"), "Model Accuracy (tuned)")
     barplot_metric(results_df, "F1", os.path.join(PLOTS_DIR, "model_f1.png"), "Model F1 (tuned)")
+    # ------------------------------
     # Best model diagnostics
+    # ------------------------------
     y_best = best_estimator.predict(X_test)
     plot_cm(y_test, y_best, f"Confusion Matrix – {best_name}", os.path.join(PLOTS_DIR, "confusion_matrix.png"))
     if hasattr(best_estimator, "predict_proba"):
         y_prob = best_estimator.predict_proba(X_test)[:, 1]
+        plot_roc(y_test, y_prob, f"ROC – {best_name}", os.path.join(PLOTS_DIR, "roc_curve.png"))
     # Save best model
+    joblib.dump(best_estimator, os.path.join(MODEL_DIR, "best_model.pkl"))
     print(f"[OK] Best model ({best_name}) saved with F1={best_f1:.4f}")
+    # ------------------------------
+    # Logistic Regression loss/accuracy curves
+    # ------------------------------
     scaler = StandardScaler()
     X_scaled = scaler.fit_transform(X_clean)
     X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(
         clf = LogisticRegression(
             penalty=penalty,
             solver="saga",
+            warm_start=True,
+            max_iter=1,
             random_state=42
         )
         losses, accs = [], []
+        for _ in range(max_iter):
+            clf.fit(X_train_g, y_train_g)
             y_pred = clf.predict_proba(X_train_g)
             losses.append(log_loss(y_train_g, y_pred))
             accs.append(accuracy_score(y_train_g, np.argmax(y_pred, axis=1)))
         return losses, accs
     loss_curves, acc_curves = {}, {}
     loss_curves["L2"], acc_curves["L2"] = track_training("l2", max_iter=50)
     loss_curves["L1"], acc_curves["L1"] = track_training("l1", max_iter=50)
     lineplot_curves(
         loss_curves,
         ylabel="Log Loss",
         title="Logistic Regression – Loss vs Iterations",
         save_path=os.path.join(PLOTS_DIR, "logreg_loss_curves.png")
     )
     lineplot_curves(
         acc_curves,
         ylabel="Training Accuracy",
         save_path=os.path.join(PLOTS_DIR, "logreg_accuracy_curves.png")
     )
+    print(f"[OK] All plots saved -> {PLOTS_DIR}")
+    print(f"[OK] Reports saved -> {REPORTS_DIR}")
     return best_estimator