# Spaces: Sleeping  (appears to be hosting-page status text captured with this listing; not part of the program)
import json
import os
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from datasets import load_dataset
from scipy import stats
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score, f1_score, precision_score, recall_score, classification_report, log_loss
)
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from xgboost import XGBClassifier

from .utils import ensure_dirs, save_json, plot_cm, plot_roc, barplot_metric, lineplot_curves

# Silence every warning category for the whole process. This also hides
# convergence and deprecation warnings raised during training.
warnings.filterwarnings("ignore")

# ------------------------------
# Base paths
# ------------------------------
# Everything is anchored at the current working directory
# (the repo folder in Hugging Face Spaces).
BASE_DIR = os.getcwd()
MODEL_DIR = os.path.join(BASE_DIR, "models")
REPORTS_DIR = os.path.join(BASE_DIR, "reports")
PLOTS_DIR = os.path.join(REPORTS_DIR, "plots")

# Ensure the output folders exist before anything tries to write into them.
for _output_dir in (MODEL_DIR, REPORTS_DIR, PLOTS_DIR):
    os.makedirs(_output_dir, exist_ok=True)


def _remove_outliers(features, target, z_threshold=3):
    """Return the rows of *features*/*target* whose z-scores are all below *z_threshold*."""
    z_scores = np.abs(stats.zscore(features))
    # A row is kept only if every feature passes; NaN z-scores compare False,
    # so rows containing them are dropped as well.
    keep = (z_scores < z_threshold).all(axis=1)
    return features[keep], target[keep]


def _save_variance_report(features_before, features_after):
    """Write the per-feature variance table (CSV) and its bar chart (PNG)."""
    var_df = pd.DataFrame({"Before": features_before.var(), "After": features_after.var()})
    var_df.to_csv(os.path.join(REPORTS_DIR, "variance_before_after.csv"))
    var_df.plot(kind="bar", figsize=(10, 5))
    plt.title("Feature Variance: Before vs After Outlier Removal")
    plt.ylabel("Variance")
    plt.xticks(rotation=45, ha="right")
    plt.tight_layout()
    plt.savefig(os.path.join(PLOTS_DIR, "variance_comparison.png"), bbox_inches="tight")
    plt.close()


def _build_models_and_grids():
    """Return ``(models, param_grids)``: candidate estimators and their search grids, keyed by name."""
    models = {
        "LogReg_L1": Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(penalty="l1", solver="liblinear")),
        ]),
        "LogReg_L2": Pipeline([
            ("scaler", StandardScaler()),
            ("clf", LogisticRegression(penalty="l2", solver="lbfgs")),
        ]),
        "DecisionTree": DecisionTreeClassifier(random_state=42),
        "RandomForest": RandomForestClassifier(random_state=42),
        "BaggedDecisionTree": BaggingClassifier(
            DecisionTreeClassifier(random_state=42), n_estimators=50, random_state=42
        ),
        # NOTE(review): `use_label_encoder` is reported as deprecated in recent
        # XGBoost releases — confirm against the pinned version before removing.
        "XGBoost": XGBClassifier(
            use_label_encoder=False,
            eval_metric="logloss",
            random_state=42,
        ),
    }
    param_grids = {
        "LogReg_L1": {"clf__C": [0.01, 0.1, 1, 10]},
        "LogReg_L2": {"clf__C": [0.01, 0.1, 1, 10]},
        "DecisionTree": {"max_depth": [3, 5, 7, None], "min_samples_split": [2, 5, 10]},
        "RandomForest": {
            "n_estimators": [100, 200],
            "max_depth": [None, 5, 10],
            "min_samples_split": [2, 5],
        },
        "BaggedDecisionTree": {"n_estimators": [30, 50, 100]},
        "XGBoost": {
            "n_estimators": [100, 200],
            "max_depth": [3, 5, 7],
            "learning_rate": [0.01, 0.1, 0.2],
            "subsample": [0.8, 1.0],
        },
    }
    return models, param_grids


def _track_logreg_training(X_fit, y_fit, penalty, max_iter=50):
    """Fit a SAGA logistic regression one iteration at a time.

    Returns ``(losses, accs)``: training log-loss and training accuracy
    recorded after each of the *max_iter* single-iteration fits.
    """
    # warm_start=True makes each fit() call start from the previous
    # coefficients, so with max_iter=1 every loop pass adds one solver pass.
    clf = LogisticRegression(
        penalty=penalty,
        solver="saga",
        warm_start=True,
        max_iter=1,
        random_state=42,
    )
    losses, accs = [], []
    for _ in range(max_iter):
        clf.fit(X_fit, y_fit)
        proba = clf.predict_proba(X_fit)
        losses.append(log_loss(y_fit, proba))
        # Map the arg-max column back through classes_ so the comparison is
        # made on real labels rather than on column positions.
        predicted = clf.classes_[np.argmax(proba, axis=1)]
        accs.append(accuracy_score(y_fit, predicted))
    return losses, accs


def train_model():
    """Train, tune and compare the candidate classifiers; persist the best one.

    Steps, in order:
      1. Load the "jonathansuru/diabetes" dataset and split off the "Outcome" column.
      2. Drop rows with any feature z-score >= 3 and save a variance report.
      3. Make a stratified 80/20 train/test split.
      4. Grid-search each candidate (5-fold stratified CV, F1 scoring) and
         score the tuned estimator on the test split.
      5. Save the comparison table (CSV + JSON), metric bar plots, confusion
         matrix and ROC curve of the best model, and the model itself.
      6. Plot logistic-regression loss/accuracy curves for L1 and L2 penalties.

    Files are written under REPORTS_DIR, PLOTS_DIR and MODEL_DIR.

    Returns:
        The tuned estimator with the highest test-split F1.
    """
    # ------------------------------
    # Load dataset
    # ------------------------------
    ds = load_dataset("jonathansuru/diabetes")
    df = ds["train"].to_pandas()
    X = df.drop("Outcome", axis=1)
    Y = df["Outcome"].astype(int)
    print(f"[INFO] Loaded dataset: {df.shape[0]} rows, {df.shape[1]} cols")

    # ------------------------------
    # Outlier removal
    # ------------------------------
    X_clean, Y_clean = _remove_outliers(X, Y)
    print(f"[INFO] Outliers removed: {len(X) - len(X_clean)} | Clean size: {len(X_clean)}")
    _save_variance_report(X, X_clean)

    # ------------------------------
    # Train/test split
    # ------------------------------
    X_train, X_test, y_train, y_test = train_test_split(
        X_clean, Y_clean, test_size=0.2, random_state=42, stratify=Y_clean
    )

    # ------------------------------
    # Models and hyperparameters
    # ------------------------------
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    models, param_grids = _build_models_and_grids()

    # ------------------------------
    # Grid search + evaluation
    # ------------------------------
    # NOTE(review): the winner is picked by F1 on the same hold-out split whose
    # metrics are then reported, so the reported F1 of the best model is
    # optimistically biased. Kept as-is because it decides which model is saved.
    rows = []
    best_name, best_estimator, best_f1 = None, None, -1
    for name, model in models.items():
        print(f"[GRID] Tuning {name} …")
        gs = GridSearchCV(model, param_grids[name], scoring="f1", cv=cv, n_jobs=-1)
        gs.fit(X_train, y_train)
        y_pred = gs.best_estimator_.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        f1 = f1_score(y_test, y_pred)
        prec = precision_score(y_test, y_pred)
        rec = recall_score(y_test, y_pred)
        print(f"[GRID] {name} | best_params={gs.best_params_} | ACC={acc:.4f} F1={f1:.4f} P={prec:.4f} R={rec:.4f}")
        rows.append({
            "Model": name,
            "BestParams": gs.best_params_,
            "Accuracy": acc,
            "F1": f1,
            "Precision": prec,
            "Recall": rec,
        })
        if f1 > best_f1:
            best_f1, best_estimator, best_name = f1, gs.best_estimator_, name

    # Save model comparison
    results_df = pd.DataFrame(rows).sort_values(by="F1", ascending=False)
    results_df.to_csv(os.path.join(REPORTS_DIR, "model_comparison.csv"), index=False)
    with open(os.path.join(REPORTS_DIR, "model_comparison.json"), "w") as f:
        json.dump(rows, f, indent=4)

    # Plot Accuracy and F1 barplots
    barplot_metric(results_df, "Accuracy", os.path.join(PLOTS_DIR, "model_accuracy.png"), "Model Accuracy (tuned)")
    barplot_metric(results_df, "F1", os.path.join(PLOTS_DIR, "model_f1.png"), "Model F1 (tuned)")

    # ------------------------------
    # Best model diagnostics
    # ------------------------------
    y_best = best_estimator.predict(X_test)
    plot_cm(y_test, y_best, f"Confusion Matrix – {best_name}", os.path.join(PLOTS_DIR, "confusion_matrix.png"))
    # The ROC curve needs class probabilities; skip it for estimators without them.
    if hasattr(best_estimator, "predict_proba"):
        y_prob = best_estimator.predict_proba(X_test)[:, 1]
        plot_roc(y_test, y_prob, f"ROC – {best_name}", os.path.join(PLOTS_DIR, "roc_curve.png"))

    # Save best model
    joblib.dump(best_estimator, os.path.join(MODEL_DIR, "best_model.pkl"))
    print(f"[OK] Best model ({best_name}) saved with F1={best_f1:.4f}")

    # ------------------------------
    # Logistic Regression loss/accuracy curves
    # ------------------------------
    # Fit the scaler on the training split only, so hold-out rows do not leak
    # into the scaling statistics behind the training curves. The existing
    # split is reused; only its training half is needed here.
    X_train_scaled = StandardScaler().fit_transform(X_train)

    loss_curves, acc_curves = {}, {}
    loss_curves["L2"], acc_curves["L2"] = _track_logreg_training(
        X_train_scaled, y_train, "l2", max_iter=50
    )
    loss_curves["L1"], acc_curves["L1"] = _track_logreg_training(
        X_train_scaled, y_train, "l1", max_iter=50
    )
    lineplot_curves(
        loss_curves,
        ylabel="Log Loss",
        title="Logistic Regression – Loss vs Iterations",
        save_path=os.path.join(PLOTS_DIR, "logreg_loss_curves.png"),
    )
    lineplot_curves(
        acc_curves,
        ylabel="Training Accuracy",
        title="Logistic Regression – Accuracy vs Iterations",
        save_path=os.path.join(PLOTS_DIR, "logreg_accuracy_curves.png"),
    )
    print(f"[OK] All plots saved -> {PLOTS_DIR}")
    print(f"[OK] Reports saved -> {REPORTS_DIR}")
    return best_estimator