import os
import json
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datasets import load_dataset
from scipy import stats
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    log_loss,
    precision_score,
    recall_score,
)
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from .utils import (
    barplot_metric,
    ensure_dirs,
    lineplot_curves,
    plot_cm,
    plot_roc,
    save_json,
)

warnings.filterwarnings("ignore")

# ------------------------------
# Base paths
# ------------------------------
BASE_DIR = os.getcwd()  # repo folder in Hugging Face Spaces
MODEL_DIR = os.path.join(BASE_DIR, "models")
REPORTS_DIR = os.path.join(BASE_DIR, "reports")
PLOTS_DIR = os.path.join(REPORTS_DIR, "plots")

# Ensure folders exist
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(REPORTS_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)


def train_model():
    """Train, tune, and compare several classifiers on the diabetes dataset.

    Workflow:
      1. Load the ``jonathansuru/diabetes`` dataset from the Hugging Face Hub.
      2. Drop outlier rows (any feature with \\|z-score\\| >= 3) and save a
         before/after variance comparison.
      3. Grid-search six model families with 5-fold stratified CV (F1 scoring),
         record test-set metrics, and persist comparison reports and plots.
      4. Save confusion-matrix / ROC diagnostics and the best model
         (``models/best_model.pkl``).
      5. Trace logistic-regression training loss/accuracy across saga epochs
         via warm starts and plot the curves.

    Returns:
        The fitted estimator with the highest held-out F1 score.
    """
    # ------------------------------
    # Load dataset
    # ------------------------------
    ds = load_dataset("jonathansuru/diabetes")
    df = ds["train"].to_pandas()

    X = df.drop("Outcome", axis=1)
    Y = df["Outcome"].astype(int)
    print(f"[INFO] Loaded dataset: {df.shape[0]} rows, {df.shape[1]} cols")

    # ------------------------------
    # Outlier removal: keep rows where every feature's |z| < 3
    # ------------------------------
    z = np.abs(stats.zscore(X))
    mask = (z < 3).all(axis=1)
    X_clean, Y_clean = X[mask], Y[mask]
    print(f"[INFO] Outliers removed: {len(X) - len(X_clean)} | Clean size: {len(X_clean)}")

    # Save variance comparison (CSV + bar plot)
    var_df = pd.DataFrame({"Before": X.var(), "After": X_clean.var()})
    var_df.to_csv(os.path.join(REPORTS_DIR, "variance_before_after.csv"))
    var_df.plot(kind="bar", figsize=(10, 5))
    plt.title("Feature Variance: Before vs After Outlier Removal")
    plt.ylabel("Variance")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(os.path.join(PLOTS_DIR, "variance_comparison.png"), bbox_inches="tight")
    plt.close()

    # ------------------------------
    # Train/test split (stratified to preserve class balance)
    # ------------------------------
    X_train, X_test, y_train, y_test = train_test_split(
        X_clean, Y_clean, test_size=0.2, random_state=42, stratify=Y_clean
    )

    # ------------------------------
    # Models and hyperparameters
    # ------------------------------
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    models = {
        "LogReg_L1": Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(penalty="l1", solver="liblinear")),
        ]),
        "LogReg_L2": Pipeline([
            ("scaler", StandardScaler()),
            # max_iter raised from the default 100: lbfgs frequently fails to
            # converge on this data (the warning was masked by filterwarnings).
            ("clf", LogisticRegression(penalty="l2", solver="lbfgs", max_iter=1000)),
        ]),
        "DecisionTree": DecisionTreeClassifier(random_state=42),
        "RandomForest": RandomForestClassifier(random_state=42),
        "BaggedDecisionTree": BaggingClassifier(
            DecisionTreeClassifier(random_state=42), n_estimators=50, random_state=42
        ),
        # NOTE: `use_label_encoder` was removed (deprecated in XGBoost 1.7,
        # dropped in 2.x); labels are already clean 0/1 ints so it is unneeded.
        "XGBoost": XGBClassifier(eval_metric="logloss", random_state=42),
    }

    param_grids = {
        "LogReg_L1": {"clf__C": [0.01, 0.1, 1, 10]},
        "LogReg_L2": {"clf__C": [0.01, 0.1, 1, 10]},
        "DecisionTree": {"max_depth": [3, 5, 7, None], "min_samples_split": [2, 5, 10]},
        "RandomForest": {
            "n_estimators": [100, 200],
            "max_depth": [None, 5, 10],
            "min_samples_split": [2, 5],
        },
        "BaggedDecisionTree": {"n_estimators": [30, 50, 100]},
        "XGBoost": {
            "n_estimators": [100, 200],
            "max_depth": [3, 5, 7],
            "learning_rate": [0.01, 0.1, 0.2],
            "subsample": [0.8, 1.0],
        },
    }

    # ------------------------------
    # Grid search + evaluation
    # ------------------------------
    rows = []
    best_name, best_estimator, best_f1 = None, None, -1

    for name, model in models.items():
        print(f"[GRID] Tuning {name} …")
        gs = GridSearchCV(model, param_grids[name], scoring="f1", cv=cv, n_jobs=-1)
        gs.fit(X_train, y_train)

        y_pred = gs.best_estimator_.predict(X_test)
        # Cast to builtin float: sklearn metrics return numpy.float64, which
        # the stdlib json encoder cannot serialize (json.dump below would
        # raise TypeError otherwise).
        acc = float(accuracy_score(y_test, y_pred))
        f1 = float(f1_score(y_test, y_pred))
        prec = float(precision_score(y_test, y_pred))
        rec = float(recall_score(y_test, y_pred))

        print(
            f"[GRID] {name} | best_params={gs.best_params_} | "
            f"ACC={acc:.4f} F1={f1:.4f} P={prec:.4f} R={rec:.4f}"
        )
        rows.append({
            "Model": name,
            "BestParams": gs.best_params_,
            "Accuracy": acc,
            "F1": f1,
            "Precision": prec,
            "Recall": rec,
        })

        if f1 > best_f1:
            best_f1, best_estimator, best_name = f1, gs.best_estimator_, name

    # Save model comparison (CSV sorted by F1, plus raw JSON)
    results_df = pd.DataFrame(rows).sort_values(by="F1", ascending=False)
    results_df.to_csv(os.path.join(REPORTS_DIR, "model_comparison.csv"), index=False)
    with open(os.path.join(REPORTS_DIR, "model_comparison.json"), "w") as f:
        json.dump(rows, f, indent=4)

    # Plot Accuracy and F1 barplots
    barplot_metric(
        results_df, "Accuracy",
        os.path.join(PLOTS_DIR, "model_accuracy.png"), "Model Accuracy (tuned)"
    )
    barplot_metric(
        results_df, "F1",
        os.path.join(PLOTS_DIR, "model_f1.png"), "Model F1 (tuned)"
    )

    # ------------------------------
    # Best model diagnostics
    # ------------------------------
    y_best = best_estimator.predict(X_test)
    plot_cm(
        y_test, y_best,
        f"Confusion Matrix – {best_name}",
        os.path.join(PLOTS_DIR, "confusion_matrix.png"),
    )
    if hasattr(best_estimator, "predict_proba"):
        y_prob = best_estimator.predict_proba(X_test)[:, 1]
        plot_roc(
            y_test, y_prob,
            f"ROC – {best_name}",
            os.path.join(PLOTS_DIR, "roc_curve.png"),
        )

    # Save best model
    joblib.dump(best_estimator, os.path.join(MODEL_DIR, "best_model.pkl"))
    print(f"[OK] Best model ({best_name}) saved with F1={best_f1:.4f}")

    # ------------------------------
    # Logistic Regression loss/accuracy curves
    # ------------------------------
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X_clean)
    X_train_g, X_test_g, y_train_g, y_test_g = train_test_split(
        X_scaled, Y_clean, test_size=0.2, random_state=42, stratify=Y_clean
    )

    def track_training(penalty, max_iter=50):
        """Record per-epoch training log-loss and accuracy for saga LogReg.

        ``warm_start=True`` with ``max_iter=1`` makes every ``fit()`` call run
        exactly one additional saga epoch from the previous coefficients, so
        the loop below traces the optimization trajectory.
        """
        clf = LogisticRegression(
            penalty=penalty, solver="saga", warm_start=True, max_iter=1, random_state=42
        )
        losses, accs = [], []
        for _ in range(max_iter):
            clf.fit(X_train_g, y_train_g)
            proba = clf.predict_proba(X_train_g)
            losses.append(log_loss(y_train_g, proba))
            # Map the argmax column index back through clf.classes_ rather
            # than assuming labels are 0..n-1 (fixes a latent label/index
            # mismatch; for 0/1 labels the result is identical).
            y_hat = clf.classes_[np.argmax(proba, axis=1)]
            accs.append(accuracy_score(y_train_g, y_hat))
        return losses, accs

    loss_curves, acc_curves = {}, {}
    loss_curves["L2"], acc_curves["L2"] = track_training("l2", max_iter=50)
    loss_curves["L1"], acc_curves["L1"] = track_training("l1", max_iter=50)

    lineplot_curves(
        loss_curves,
        ylabel="Log Loss",
        title="Logistic Regression – Loss vs Iterations",
        save_path=os.path.join(PLOTS_DIR, "logreg_loss_curves.png"),
    )
    lineplot_curves(
        acc_curves,
        ylabel="Training Accuracy",
        title="Logistic Regression – Accuracy vs Iterations",
        save_path=os.path.join(PLOTS_DIR, "logreg_accuracy_curves.png"),
    )

    print(f"[OK] All plots saved -> {PLOTS_DIR}")
    print(f"[OK] Reports saved -> {REPORTS_DIR}")

    return best_estimator