# forkcast/src/numeric/train.py
# Last commit by adisaljusi (5fbb6c5, unverified):
#   numeric: fix LogisticRegression API for sklearn 1.8; retrain artifacts
"""Train and persist the dual-task obesity models.
Two head-to-head model comparisons are run on the UCI Obesity Levels
dataset (`aiml2021/obesity`):
- Regression head — predict BMI from demographics + habits + activity.
Ridge baseline vs XGBRegressor.
- Classification head — predict the 7-class obesity level (NObeyesdad).
LogisticRegression baseline vs XGBClassifier.
For each head, whichever model wins on the held-out test fold is persisted
under ``models/`` (alongside the fitted label encoder). The feature columns,
winner and baseline metrics, and per-class breakdown are written to
``models/numeric_metadata.json``.
"""
from __future__ import annotations
import json
from pathlib import Path
import joblib
from sklearn.linear_model import LogisticRegression, Ridge
from sklearn.metrics import (
accuracy_score,
classification_report,
f1_score,
mean_absolute_error,
r2_score,
)
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder, StandardScaler
from xgboost import XGBClassifier, XGBRegressor
from .obesity import OBESITY_LEVELS, build_features, load
# Directory that receives the pickled models and the metadata JSON: the
# "models" folder under the third ancestor directory of this file.
MODELS_DIR = Path(__file__).resolve().parents[2] / "models"
# Shared random seed for the train/test split and both XGBoost models.
SEED = 42
def train_regressor(X_train, X_test, y_train, y_test):
    """Fit a Ridge baseline and an XGBRegressor and return the better one.

    Both models are fitted on the training fold and scored on the test fold
    with mean absolute error (MAE) and R². The XGBRegressor is selected when
    its MAE is less than or equal to the Ridge MAE; otherwise Ridge wins.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training regression targets.
        y_test: Test regression targets.

    Returns:
        A 4-tuple ``(model, name, metrics, other_metrics)`` where ``model``
        is the fitted winner, ``name`` is ``"XGBRegressor"`` or ``"Ridge"``,
        ``metrics`` holds the winner's ``"mae"`` and ``"r2"``, and
        ``other_metrics`` holds the losing model's scores under keys
        prefixed with ``ridge_`` or ``xgb_``.
    """
    ridge = Pipeline([
        ("scaler", StandardScaler()),
        ("model", Ridge(alpha=1.0)),
    ]).fit(X_train, y_train)
    # Predict once and reuse the result for both metrics.
    ridge_pred = ridge.predict(X_test)
    ridge_mae = mean_absolute_error(y_test, ridge_pred)
    ridge_r2 = r2_score(y_test, ridge_pred)

    xgb = XGBRegressor(
        n_estimators=400, max_depth=5, learning_rate=0.05,
        subsample=0.9, colsample_bytree=0.9, random_state=SEED,
    ).fit(X_train, y_train)
    xgb_pred = xgb.predict(X_test)
    xgb_mae = mean_absolute_error(y_test, xgb_pred)
    xgb_r2 = r2_score(y_test, xgb_pred)

    # Ties on MAE go to the XGBRegressor.
    if xgb_mae <= ridge_mae:
        return (
            xgb, "XGBRegressor",
            {"mae": xgb_mae, "r2": xgb_r2},
            {"ridge_mae": ridge_mae, "ridge_r2": ridge_r2},
        )
    return (
        ridge, "Ridge",
        {"mae": ridge_mae, "r2": ridge_r2},
        {"xgb_mae": xgb_mae, "xgb_r2": xgb_r2},
    )
def train_classifier(X_train, X_test, y_train, y_test):
    """Fit a LogisticRegression baseline and an XGBClassifier; keep the better.

    Both models are fitted on the training fold and scored on the test fold
    with accuracy and macro-averaged F1. The XGBClassifier is selected when
    its macro F1 is greater than or equal to the baseline's.

    Args:
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training class labels (``main`` passes LabelEncoder codes).
        y_test: Test class labels.

    Returns:
        A 5-tuple ``(model, name, metrics, other_metrics, predictions)``:
        the fitted winner, its name (``"XGBClassifier"`` or
        ``"LogisticRegression"``), its ``"accuracy"`` / ``"macro_f1"``
        scores, the losing model's scores under ``logit_`` or ``xgb_``
        prefixed keys, and the winner's predictions on ``X_test``.
    """

    def _score(predictions):
        # Accuracy first, then macro F1, both against the held-out labels.
        return {
            "accuracy": accuracy_score(y_test, predictions),
            "macro_f1": f1_score(y_test, predictions, average="macro"),
        }

    # Baseline: standardised features feeding a logistic regression.
    baseline = Pipeline(steps=[
        ("scaler", StandardScaler()),
        ("model", LogisticRegression(max_iter=2000)),
    ])
    baseline.fit(X_train, y_train)
    baseline_pred = baseline.predict(X_test)
    baseline_scores = _score(baseline_pred)

    # Challenger: gradient-boosted trees on the raw features.
    booster = XGBClassifier(
        n_estimators=400,
        max_depth=5,
        learning_rate=0.05,
        subsample=0.9,
        colsample_bytree=0.9,
        random_state=SEED,
        eval_metric="mlogloss",
        num_class=len(OBESITY_LEVELS),
    )
    booster.fit(X_train, y_train)
    booster_pred = booster.predict(X_test)
    booster_scores = _score(booster_pred)

    # Ties on macro F1 go to the XGBClassifier.
    if booster_scores["macro_f1"] >= baseline_scores["macro_f1"]:
        loser_scores = {
            "logit_accuracy": baseline_scores["accuracy"],
            "logit_macro_f1": baseline_scores["macro_f1"],
        }
        return booster, "XGBClassifier", booster_scores, loser_scores, booster_pred

    loser_scores = {
        "xgb_accuracy": booster_scores["accuracy"],
        "xgb_macro_f1": booster_scores["macro_f1"],
    }
    return baseline, "LogisticRegression", baseline_scores, loser_scores, baseline_pred
def main() -> None:
    """Train both heads, pick the winners, and persist models plus metadata.

    Loads the dataset, builds features, makes one stratified 80/20 split
    shared by both tasks, runs the regressor and classifier comparisons, and
    writes to ``MODELS_DIR``:

    - ``numeric_regressor.pkl``: the selected regressor
    - ``numeric_classifier.pkl``: the selected classifier
    - ``numeric_label_encoder.pkl``: the fitted ``LabelEncoder``
    - ``numeric_metadata.json``: feature columns, metrics, per-class report
    """
    print("Loading UCI Obesity Levels dataset...")
    df = load()
    ds = build_features(df)
    X = ds.features.astype("float64")
    y_bmi = ds.bmi.values
    label_enc = LabelEncoder().fit(OBESITY_LEVELS)
    y_cls = label_enc.transform(ds.label.values)

    # One split for both targets so the two heads share the same test rows;
    # stratified on the class label.
    X_train, X_test, y_bmi_train, y_bmi_test, y_cls_train, y_cls_test = train_test_split(
        X, y_bmi, y_cls, test_size=0.2, random_state=SEED, stratify=y_cls,
    )

    print("Training regressor (Ridge vs XGB)...")
    reg, reg_name, reg_metrics, reg_baseline = train_regressor(X_train, X_test, y_bmi_train, y_bmi_test)
    print(f" -> chose {reg_name}: {reg_metrics}")
    print(f" baseline: {reg_baseline}")

    print("Training classifier (LogisticRegression vs XGB)...")
    clf, clf_name, clf_metrics, clf_baseline, clf_pred = train_classifier(
        X_train, X_test, y_cls_train, y_cls_test,
    )
    print(f" -> chose {clf_name}: {clf_metrics}")
    print(f" baseline: {clf_baseline}")

    MODELS_DIR.mkdir(parents=True, exist_ok=True)
    joblib.dump(reg, MODELS_DIR / "numeric_regressor.pkl")
    joblib.dump(clf, MODELS_DIR / "numeric_classifier.pkl")
    joblib.dump(label_enc, MODELS_DIR / "numeric_label_encoder.pkl")

    # LabelEncoder numbers classes in sorted label order, which need not be
    # the order of OBESITY_LEVELS. Name each integer code from the encoder so
    # every per-class row is attributed to the class it was computed for.
    encoded_names = [str(name) for name in label_enc.classes_]
    report = classification_report(
        y_cls_test, clf_pred,
        labels=list(range(len(encoded_names))),
        target_names=encoded_names, output_dict=True, zero_division=0,
    )

    metadata = {
        "dataset": "aiml2021/obesity",
        "feature_columns": ds.feature_columns,
        # NOTE(review): kept in OBESITY_LEVELS order; integer predictions
        # should be decoded with the persisted label encoder rather than by
        # indexing this list. Confirm against consumers.
        "classes": OBESITY_LEVELS,
        "n_train": int(len(X_train)),
        "n_test": int(len(X_test)),
        "regressor": {
            "name": reg_name,
            "target": "BMI",
            "metrics": {k: float(v) for k, v in reg_metrics.items()},
            "baseline_metrics": {k: float(v) for k, v in reg_baseline.items()},
        },
        "classifier": {
            "name": clf_name,
            "target": "NObeyesdad",
            "metrics": {k: float(v) for k, v in clf_metrics.items()},
            "baseline_metrics": {k: float(v) for k, v in clf_baseline.items()},
            "per_class": {
                cls: {
                    "precision": float(report[cls]["precision"]),
                    "recall": float(report[cls]["recall"]),
                    "f1": float(report[cls]["f1-score"]),
                    "support": int(report[cls]["support"]),
                }
                for cls in OBESITY_LEVELS if cls in report
            },
        },
    }
    (MODELS_DIR / "numeric_metadata.json").write_text(
        json.dumps(metadata, indent=2), encoding="utf-8",
    )
    print(f"\nSaved artifacts to {MODELS_DIR}")
# Run the full training pipeline only when this file is executed as a script.
if __name__ == "__main__":
main()