Spaces:
Sleeping
Sleeping
| """Train and persist the dual-task obesity models. | |
| Two head-to-head model comparisons are run on the UCI Obesity Levels | |
| dataset (`aiml2021/obesity`): | |
| - Regression head — predict BMI from demographics + habits + activity. | |
| Ridge baseline vs XGBRegressor. | |
| - Classification head — predict the 7-class obesity level (NObeyesdad). | |
| LogisticRegression baseline vs XGBClassifier. | |
| Whichever model wins on the held-out test fold is persisted, together | |
| with feature columns, baseline metrics, and per-class breakdown in | |
| ``models/numeric_metadata.json``. | |
| """ | |
| from __future__ import annotations | |
| import json | |
| from pathlib import Path | |
| import joblib | |
| from sklearn.linear_model import LogisticRegression, Ridge | |
| from sklearn.metrics import ( | |
| accuracy_score, | |
| classification_report, | |
| f1_score, | |
| mean_absolute_error, | |
| r2_score, | |
| ) | |
| from sklearn.model_selection import train_test_split | |
| from sklearn.pipeline import Pipeline | |
| from sklearn.preprocessing import LabelEncoder, StandardScaler | |
| from xgboost import XGBClassifier, XGBRegressor | |
| from .obesity import OBESITY_LEVELS, build_features, load | |
# Output directory for persisted artifacts: a "models" folder located at the
# third ancestor of this file's resolved path (parents[2]).
MODELS_DIR = Path(__file__).resolve().parents[2].joinpath("models")

# Shared random seed, passed as ``random_state`` to the train/test split and
# to both XGBoost estimators.
SEED = 42
def train_regressor(X_train, X_test, y_train, y_test):
    """Fit Ridge and XGBRegressor and return whichever has the lower test MAE.

    Both candidates are fitted on ``(X_train, y_train)`` and scored on
    ``(X_test, y_test)`` with mean absolute error and R^2. Ties on MAE go to
    the XGBoost model.

    Args:
        X_train: Training feature matrix.
        X_test: Held-out feature matrix.
        y_train: Training regression targets.
        y_test: Held-out regression targets.

    Returns:
        A 4-tuple ``(model, name, metrics, other_metrics)`` where ``model`` is
        the fitted winner, ``name`` is ``"XGBRegressor"`` or ``"Ridge"``,
        ``metrics`` is ``{"mae": ..., "r2": ...}`` for the winner, and
        ``other_metrics`` holds the losing model's scores under keys prefixed
        with ``ridge_`` or ``xgb_``.
    """
    ridge = Pipeline([
        ("scaler", StandardScaler()),
        ("model", Ridge(alpha=1.0)),
    ]).fit(X_train, y_train)
    # Predict once and reuse the result for both metrics.
    ridge_pred = ridge.predict(X_test)
    ridge_mae = mean_absolute_error(y_test, ridge_pred)
    ridge_r2 = r2_score(y_test, ridge_pred)

    xgb = XGBRegressor(
        n_estimators=400, max_depth=5, learning_rate=0.05,
        subsample=0.9, colsample_bytree=0.9, random_state=SEED,
    ).fit(X_train, y_train)
    xgb_pred = xgb.predict(X_test)
    xgb_mae = mean_absolute_error(y_test, xgb_pred)
    xgb_r2 = r2_score(y_test, xgb_pred)

    if xgb_mae <= ridge_mae:
        return (
            xgb, "XGBRegressor",
            {"mae": xgb_mae, "r2": xgb_r2},
            {"ridge_mae": ridge_mae, "ridge_r2": ridge_r2},
        )
    return (
        ridge, "Ridge",
        {"mae": ridge_mae, "r2": ridge_r2},
        {"xgb_mae": xgb_mae, "xgb_r2": xgb_r2},
    )
def train_classifier(X_train, X_test, y_train, y_test):
    """Fit both classifiers and hand back the one with the better macro-F1.

    A scaled LogisticRegression pipeline and an XGBClassifier are each fitted
    on the training fold and scored on the held-out fold. The XGBoost model
    is selected when its macro-F1 is greater than or equal to the logistic
    model's.

    Args:
        X_train: Training feature matrix.
        X_test: Held-out feature matrix.
        y_train: Training class labels.
        y_test: Held-out class labels.

    Returns:
        A 5-tuple ``(model, name, metrics, other_metrics, predictions)``:
        the fitted winner, its name (``"XGBClassifier"`` or
        ``"LogisticRegression"``), its ``{"accuracy", "macro_f1"}`` scores,
        the other model's scores under ``logit_``/``xgb_`` prefixed keys, and
        the winner's predictions on ``X_test``.
    """

    def _score(predictions):
        # Returns (accuracy, macro-averaged F1) against the held-out labels.
        return (
            accuracy_score(y_test, predictions),
            f1_score(y_test, predictions, average="macro"),
        )

    linear = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=2000)),
    ])
    linear.fit(X_train, y_train)
    linear_pred = linear.predict(X_test)
    linear_acc, linear_f1 = _score(linear_pred)

    booster = XGBClassifier(
        n_estimators=400,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=SEED,
        eval_metric="mlogloss",
        num_class=len(OBESITY_LEVELS),
    )
    booster.fit(X_train, y_train)
    booster_pred = booster.predict(X_test)
    booster_acc, booster_f1 = _score(booster_pred)

    if booster_f1 >= linear_f1:
        outcome = (
            booster,
            "XGBClassifier",
            {"accuracy": booster_acc, "macro_f1": booster_f1},
            {"logit_accuracy": linear_acc, "logit_macro_f1": linear_f1},
            booster_pred,
        )
    else:
        outcome = (
            linear,
            "LogisticRegression",
            {"accuracy": linear_acc, "macro_f1": linear_f1},
            {"xgb_accuracy": booster_acc, "xgb_macro_f1": booster_f1},
            linear_pred,
        )
    return outcome
def main() -> None:
    """Train both heads, persist the chosen models, and write run metadata.

    The dataset is loaded with ``load()`` and turned into features with
    ``build_features()``. One stratified 80/20 split (stratified on the
    encoded class label) is shared by the regression and classification
    heads. The selected regressor, classifier, and the label encoder are
    dumped with joblib into ``MODELS_DIR``, and a summary of metrics is
    written to ``numeric_metadata.json`` in the same directory.
    """
    print("Loading UCI Obesity Levels dataset...")
    df = load()
    ds = build_features(df)
    X = ds.features.astype("float64")
    y_bmi = ds.bmi.values

    label_enc = LabelEncoder().fit(OBESITY_LEVELS)
    y_cls = label_enc.transform(ds.label.values)

    X_train, X_test, y_bmi_train, y_bmi_test, y_cls_train, y_cls_test = train_test_split(
        X, y_bmi, y_cls, test_size=0.2, random_state=SEED, stratify=y_cls,
    )

    print("Training regressor (Ridge vs XGB)...")
    reg, reg_name, reg_metrics, reg_baseline = train_regressor(X_train, X_test, y_bmi_train, y_bmi_test)
    print(f" -> chose {reg_name}: {reg_metrics}")
    print(f" baseline: {reg_baseline}")

    print("Training classifier (LogisticRegression vs XGB)...")
    clf, clf_name, clf_metrics, clf_baseline, clf_pred = train_classifier(
        X_train, X_test, y_cls_train, y_cls_test,
    )
    print(f" -> chose {clf_name}: {clf_metrics}")
    print(f" baseline: {clf_baseline}")

    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    joblib.dump(reg, MODELS_DIR / "numeric_regressor.pkl")
    joblib.dump(clf, MODELS_DIR / "numeric_classifier.pkl")
    joblib.dump(label_enc, MODELS_DIR / "numeric_label_encoder.pkl")

    # LabelEncoder assigns integer codes following the sorted order of the
    # class names, which is not necessarily the order of OBESITY_LEVELS.
    # Take the display names from the encoder itself so that code i is always
    # reported under the class name it actually encodes.
    encoded_names = [str(name) for name in label_enc.classes_]
    report = classification_report(
        y_cls_test, clf_pred,
        labels=list(range(len(encoded_names))),
        target_names=encoded_names, output_dict=True, zero_division=0,
    )

    metadata = {
        "dataset": "aiml2021/obesity",
        "feature_columns": ds.feature_columns,
        # NOTE(review): kept in OBESITY_LEVELS order for compatibility; this
        # may differ from the encoder's integer code order, so consumers
        # should decode predictions with the persisted label encoder.
        "classes": OBESITY_LEVELS,
        "n_train": int(len(X_train)),
        "n_test": int(len(X_test)),
        "regressor": {
            "name": reg_name,
            "target": "BMI",
            "metrics": {k: float(v) for k, v in reg_metrics.items()},
            "baseline_metrics": {k: float(v) for k, v in reg_baseline.items()},
        },
        "classifier": {
            "name": clf_name,
            "target": "NObeyesdad",
            "metrics": {k: float(v) for k, v in clf_metrics.items()},
            "baseline_metrics": {k: float(v) for k, v in clf_baseline.items()},
            # Looked up by class name, so entries stay correctly attributed
            # regardless of the ordering of OBESITY_LEVELS.
            "per_class": {
                cls: {
                    "precision": float(report[cls]["precision"]),
                    "recall": float(report[cls]["recall"]),
                    "f1": float(report[cls]["f1-score"]),
                    "support": int(report[cls]["support"]),
                }
                for cls in OBESITY_LEVELS if cls in report
            },
        },
    }
    (MODELS_DIR / "numeric_metadata.json").write_text(json.dumps(metadata, indent=2))
    print(f"\nSaved artifacts to {MODELS_DIR}")


# Run the full training pipeline only when executed as a script.
if __name__ == "__main__":
    main()