"""Train and persist the dual-task obesity models. Two head-to-head model comparisons are run on the UCI Obesity Levels dataset (`aiml2021/obesity`): - Regression head — predict BMI from demographics + habits + activity. Ridge baseline vs XGBRegressor. - Classification head — predict the 7-class obesity level (NObeyesdad). LogisticRegression baseline vs XGBClassifier. Whichever model wins on the held-out test fold is persisted, together with feature columns, baseline metrics, and per-class breakdown in ``models/numeric_metadata.json``. """ from __future__ import annotations import json from pathlib import Path import joblib from sklearn.linear_model import LogisticRegression, Ridge from sklearn.metrics import ( accuracy_score, classification_report, f1_score, mean_absolute_error, r2_score, ) from sklearn.model_selection import train_test_split from sklearn.pipeline import Pipeline from sklearn.preprocessing import LabelEncoder, StandardScaler from xgboost import XGBClassifier, XGBRegressor from .obesity import OBESITY_LEVELS, build_features, load MODELS_DIR = Path(__file__).resolve().parents[2] / "models" SEED = 42 def train_regressor(X_train, X_test, y_train, y_test): ridge = Pipeline([ ("scaler", StandardScaler()), ("model", Ridge(alpha=1.0)), ]).fit(X_train, y_train) ridge_mae = mean_absolute_error(y_test, ridge.predict(X_test)) ridge_r2 = r2_score(y_test, ridge.predict(X_test)) xgb = XGBRegressor( n_estimators=400, max_depth=5, learning_rate=0.05, subsample=0.9, colsample_bytree=0.9, random_state=SEED, ).fit(X_train, y_train) xgb_mae = mean_absolute_error(y_test, xgb.predict(X_test)) xgb_r2 = r2_score(y_test, xgb.predict(X_test)) if xgb_mae <= ridge_mae: return xgb, "XGBRegressor", {"mae": xgb_mae, "r2": xgb_r2}, {"ridge_mae": ridge_mae, "ridge_r2": ridge_r2} return ridge, "Ridge", {"mae": ridge_mae, "r2": ridge_r2}, {"xgb_mae": xgb_mae, "xgb_r2": xgb_r2} def train_classifier(X_train, X_test, y_train, y_test): logit = Pipeline([ ("scaler", StandardScaler()), ("model", LogisticRegression(max_iter=2000)), ]).fit(X_train, y_train) logit_pred = logit.predict(X_test) logit_acc = accuracy_score(y_test, logit_pred) logit_f1 = f1_score(y_test, logit_pred, average="macro") xgb = XGBClassifier( n_estimators=400, max_depth=5, learning_rate=0.05, subsample=0.9, colsample_bytree=0.9, random_state=SEED, eval_metric="mlogloss", num_class=len(OBESITY_LEVELS), ).fit(X_train, y_train) xgb_pred = xgb.predict(X_test) xgb_acc = accuracy_score(y_test, xgb_pred) xgb_f1 = f1_score(y_test, xgb_pred, average="macro") if xgb_f1 >= logit_f1: return ( xgb, "XGBClassifier", {"accuracy": xgb_acc, "macro_f1": xgb_f1}, {"logit_accuracy": logit_acc, "logit_macro_f1": logit_f1}, xgb_pred, ) return ( logit, "LogisticRegression", {"accuracy": logit_acc, "macro_f1": logit_f1}, {"xgb_accuracy": xgb_acc, "xgb_macro_f1": xgb_f1}, logit_pred, ) def main() -> None: print("Loading UCI Obesity Levels dataset...") df = load() ds = build_features(df) X = ds.features.astype("float64") y_bmi = ds.bmi.values label_enc = LabelEncoder().fit(OBESITY_LEVELS) y_cls = label_enc.transform(ds.label.values) X_train, X_test, y_bmi_train, y_bmi_test, y_cls_train, y_cls_test = train_test_split( X, y_bmi, y_cls, test_size=0.2, random_state=SEED, stratify=y_cls, ) print("Training regressor (Ridge vs XGB)...") reg, reg_name, reg_metrics, reg_baseline = train_regressor(X_train, X_test, y_bmi_train, y_bmi_test) print(f" -> chose {reg_name}: {reg_metrics}") print(f" baseline: {reg_baseline}") print("Training classifier (LogisticRegression vs XGB)...") clf, clf_name, clf_metrics, clf_baseline, clf_pred = train_classifier( X_train, X_test, y_cls_train, y_cls_test, ) print(f" -> chose {clf_name}: {clf_metrics}") print(f" baseline: {clf_baseline}") MODELS_DIR.mkdir(parents=True, exist_ok=True) joblib.dump(reg, MODELS_DIR / "numeric_regressor.pkl") joblib.dump(clf, MODELS_DIR / "numeric_classifier.pkl") joblib.dump(label_enc, MODELS_DIR / "numeric_label_encoder.pkl") report = classification_report( y_cls_test, clf_pred, labels=list(range(len(OBESITY_LEVELS))), target_names=OBESITY_LEVELS, output_dict=True, zero_division=0, ) metadata = { "dataset": "aiml2021/obesity", "feature_columns": ds.feature_columns, "classes": OBESITY_LEVELS, "n_train": int(len(X_train)), "n_test": int(len(X_test)), "regressor": { "name": reg_name, "target": "BMI", "metrics": {k: float(v) for k, v in reg_metrics.items()}, "baseline_metrics": {k: float(v) for k, v in reg_baseline.items()}, }, "classifier": { "name": clf_name, "target": "NObeyesdad", "metrics": {k: float(v) for k, v in clf_metrics.items()}, "baseline_metrics": {k: float(v) for k, v in clf_baseline.items()}, "per_class": { cls: { "precision": float(report[cls]["precision"]), "recall": float(report[cls]["recall"]), "f1": float(report[cls]["f1-score"]), "support": int(report[cls]["support"]), } for cls in OBESITY_LEVELS if cls in report }, }, } (MODELS_DIR / "numeric_metadata.json").write_text(json.dumps(metadata, indent=2)) print(f"\nSaved artifacts to {MODELS_DIR}") if __name__ == "__main__": main()