forkcast / src /numeric /model.py
adisaljusi's picture
fix(numeric): BMI-band fallback for out-of-distribution profiles
0bce3b4
"""Numeric block — inference.
Loads the trained regressor and classifier and returns:
- a predicted obesity classification (7-class, with per-class
probability),
- a predicted BMI from the regression head,
- a personalized daily calorie target derived from Mifflin-St Jeor.
The CV-derived ``high_caloric_meal`` flag overrides the user's
self-reported ``FAVC`` feature before inference. This is the load-bearing
integration point between the computer-vision block and the numeric
classifier.
"""
from __future__ import annotations
import json
from pathlib import Path
from typing import Any
import joblib
import numpy as np
import pandas as pd
from .obesity import OBESITY_LEVELS, apply_favc_override
from .profile import (
TRAINING_AGE_RANGE,
TRAINING_BMI_RANGE,
TRAINING_HEIGHT_RANGE_CM,
TRAINING_WEIGHT_RANGE_KG,
bmi_to_band,
daily_target_kcal,
)
# Severity gap (in class indices) at which the classifier probabilities
# are blended with the BMI-band rule. Adjacent-class disagreements
# (gap=1) are tolerated — those are genuinely borderline cases. A gap of
# 2+ means the model is materially wrong about severity, typically
# because the conditional (gender, BMI) combination is rare in training.
_OOD_SEVERITY_GAP = 2
# Mixing weight used when blending the classifier's probabilities with
# a one-hot prior on the BMI-band class for OOD inputs. The prior gets
# this weight and the classifier output gets the remainder (1 - weight)
# before the result is renormalised.
_OOD_BAND_PRIOR_WEIGHT = 0.7
# Directory holding the serialized model artefacts: the ``models`` folder
# under the third ancestor directory of this file (``parents[2]``).
MODELS_DIR = Path(__file__).resolve().parents[2] / "models"
# Module-level cache of the loaded artefacts, filled lazily by ``_load``.
# It stays empty until a load succeeds.
_state: dict[str, Any] = {}
def _load() -> dict[str, Any]:
    """Load the model artefacts once and cache them in ``_state``.

    The regressor, classifier and metadata files are mandatory; the label
    encoder is optional.

    Returns
    -------
    dict[str, Any]
        The module cache with the keys ``regressor``, ``classifier``,
        ``label_encoder`` (``None`` when no encoder file exists) and
        ``metadata``. An empty dict is returned when any mandatory file
        is missing, which callers treat as "models unavailable".
    """
    if _state:
        return _state

    reg_path = MODELS_DIR / "numeric_regressor.pkl"
    clf_path = MODELS_DIR / "numeric_classifier.pkl"
    enc_path = MODELS_DIR / "numeric_label_encoder.pkl"
    meta_path = MODELS_DIR / "numeric_metadata.json"

    if not all(path.exists() for path in (reg_path, clf_path, meta_path)):
        return {}

    # Load everything into a local dict first. Writing straight into
    # ``_state`` would leave a half-filled (but truthy) cache behind if one
    # of the loads raised, and every later call would return that broken
    # cache instead of retrying.
    # NOTE: joblib.load unpickles its input, so these files must come from
    # a trusted source.
    loaded: dict[str, Any] = {
        "regressor": joblib.load(reg_path),
        "classifier": joblib.load(clf_path),
        "label_encoder": joblib.load(enc_path) if enc_path.exists() else None,
        # JSON is UTF-8 by specification; do not depend on the locale.
        "metadata": json.loads(meta_path.read_text(encoding="utf-8")),
    }
    _state.update(loaded)
    return _state
def _detect_anomalies(
    weight_kg: float,
    height_cm: float,
    age: int,
    bmi_raw: float,
    bmi_band: str,
    model_class: str,
    gender: str,
) -> list[str]:
    """Flag profile inputs that fall outside the classifier's training distribution.

    The UCI Obesity Levels dataset is a small (n=2111), partly synthetic
    sample with strong conditional skew (e.g., Obesity_Type_III is 99.7%
    Female). We surface anomalies so the UI can warn the user before
    treating the classifier output as gospel.

    Parameters
    ----------
    weight_kg, height_cm, age, bmi_raw
        Raw profile measurements, each compared against the matching
        ``TRAINING_*`` range.
    bmi_band
        Class name derived from ``bmi_raw`` by the BMI rule.
    model_class
        Class name predicted by the classifier.
    gender
        Gender string; any value starting with "m" (case-insensitive) is
        treated as male.

    Returns
    -------
    list[str]
        Human-readable warnings (the class-gap message contains Markdown
        bold markers). Empty when nothing is anomalous.
    """
    flags: list[str] = []

    wmin, wmax = TRAINING_WEIGHT_RANGE_KG
    if weight_kg < wmin:
        flags.append(f"Weight {weight_kg:.0f} kg is below the trained range ({wmin:.0f}–{wmax:.0f} kg).")
    elif weight_kg > wmax:
        flags.append(f"Weight {weight_kg:.0f} kg is above the trained range ({wmin:.0f}–{wmax:.0f} kg).")

    hmin, hmax = TRAINING_HEIGHT_RANGE_CM
    if height_cm < hmin:
        flags.append(f"Height {height_cm:.0f} cm is below the trained range ({hmin:.0f}–{hmax:.0f} cm).")
    elif height_cm > hmax:
        flags.append(f"Height {height_cm:.0f} cm is above the trained range ({hmin:.0f}–{hmax:.0f} cm).")

    bmin, bmax = TRAINING_BMI_RANGE
    if bmi_raw < bmin or bmi_raw > bmax:
        flags.append(f"BMI {bmi_raw:.1f} is outside the trained range ({bmin:.1f}–{bmax:.1f}).")

    amin, amax = TRAINING_AGE_RANGE
    if age < amin or age > amax:
        flags.append(f"Age {age} is outside the trained range ({amin}–{amax} years).")

    # Severity disagreement (gap in class index) — most telling for the
    # known Male × Obesity_Type_III gap in training. A label missing from
    # OBESITY_LEVELS cannot be ranked, so it is treated as "no gap".
    try:
        gap = abs(OBESITY_LEVELS.index(bmi_band) - OBESITY_LEVELS.index(model_class))
    except ValueError:
        gap = 0
    if gap >= _OOD_SEVERITY_GAP:
        flags.append(
            f"Classifier says **{model_class.replace('_', ' ')}** but BMI "
            f"{bmi_raw:.1f} falls into **{bmi_band.replace('_', ' ')}** "
            f"— a {gap}-class gap."
        )

    # Conditionally-rare combination: Obesity_Type_III is almost entirely
    # female in the training data, so a male profile whose BMI band is
    # Type_III is unreliable territory. Only the BMI band is checked here,
    # not the classifier's own prediction.
    if bmi_band == "Obesity_Type_III" and gender.lower().startswith("m"):
        flags.append(
            "Obesity Type III training data is 99.7% female — male predictions "
            "at this BMI are extrapolations."
        )
    return flags
def _blend_with_bmi_band(
proba_by_name: dict[str, float],
bmi_band: str,
classes: list[str],
) -> dict[str, float]:
"""Mix the classifier output with a one-hot prior centered on ``bmi_band``.
Used only when the classifier disagrees with the BMI rule by a wide
margin. The blend is deterministic (no training data needed) and
preserves order, so the visible probability bars still tell a
coherent story.
"""
w = _OOD_BAND_PRIOR_WEIGHT
blended: dict[str, float] = {}
for c in classes:
prior = 1.0 if c == bmi_band else 0.0
blended[c] = (1.0 - w) * proba_by_name.get(c, 0.0) + w * prior
s = sum(blended.values()) or 1.0
return {c: v / s for c, v in blended.items()}
def _build_feature_row(profile: dict, feature_columns: list[str]) -> pd.DataFrame:
"""Map the user's form input to a single-row DataFrame matching training columns."""
row: dict[str, Any] = {col: 0 for col in feature_columns}
numeric = {
"Age": float(profile.get("age", 30)),
"Height": float(profile.get("height_cm", 170)) / 100.0,
"Weight": float(profile.get("weight_kg", 70)),
"FCVC": float(profile.get("FCVC", 2.0)),
"NCP": float(profile.get("NCP", 3.0)),
"CH2O": float(profile.get("CH2O", 2.0)),
"FAF": float(profile.get("FAF", 1.0)),
"TUE": float(profile.get("TUE", 1.0)),
}
for k, v in numeric.items():
if k in row:
row[k] = v
categorical = {
"Gender": profile.get("Gender", "Male"),
"family_history_with_overweight": profile.get("family_history_with_overweight", "no"),
"FAVC": profile.get("FAVC", "no"),
"CAEC": profile.get("CAEC", "Sometimes"),
"SMOKE": profile.get("SMOKE", "no"),
"SCC": profile.get("SCC", "no"),
"CALC": profile.get("CALC", "no"),
"MTRANS": profile.get("MTRANS", "Public_Transportation"),
}
for prefix, value in categorical.items():
target_col = f"{prefix}_{value}"
if target_col in row:
row[target_col] = 1
return pd.DataFrame([row], columns=feature_columns)
def _class_index_gap(bmi_band: str, model_class: str) -> int:
    """Return how many positions apart two labels sit in ``OBESITY_LEVELS`` (0 if either is unknown)."""
    try:
        return abs(OBESITY_LEVELS.index(bmi_band) - OBESITY_LEVELS.index(model_class))
    except ValueError:
        # An unrankable label must not crash inference; treat it as "no gap",
        # the same way ``_detect_anomalies`` does.
        return 0


def _label_probabilities(proba, classifier, encoder) -> tuple[dict[str, float], str]:
    """Attach class names to one ``predict_proba`` row and pick the most likely class."""
    if encoder is not None:
        labels = [str(c) for c in encoder.classes_]
    else:
        raw = getattr(classifier, "classes_", None)
        if raw is not None and len(raw) > 0 and all(isinstance(c, str) for c in raw):
            # A classifier fitted on string targets exposes the real label
            # for every probability column; trust that over list position.
            labels = [str(c) for c in raw]
        else:
            # NOTE(review): integer-coded classes with no saved encoder —
            # this assumes code i corresponds to OBESITY_LEVELS[i]; confirm
            # against the training script.
            labels = list(OBESITY_LEVELS)
    proba_by_name = {name: float(proba[i]) for i, name in enumerate(labels)}
    return proba_by_name, labels[int(np.argmax(proba))]


def predict(profile: dict, nutrition: dict | None = None) -> dict:
    """Run the numeric pipeline for one user.

    Parameters
    ----------
    profile : dict
        Form inputs. ``weight_kg``, ``height_cm`` and ``age`` are
        required; ``Gender`` defaults to "Male", ``activity_level`` to
        "moderate" and ``goal`` to "maintain". Habit answers are optional.
        The caller's dict is not modified.
    nutrition : dict | None
        Output of the CV block. It is passed to ``apply_favc_override``,
        which may switch the FAVC feature on upstream of the classifier.

    Returns
    -------
    dict
        Final class and per-class probabilities (in ``OBESITY_LEVELS``
        order), the regressor's BMI, the raw BMI and its band, the daily
        calorie target, anomaly flags, and whether FAVC was overridden or
        the probabilities were blended. When the models are available the
        raw ``model_class`` and ``model_probabilities`` are included too.

    Raises
    ------
    KeyError
        If ``weight_kg``, ``height_cm`` or ``age`` is missing from
        ``profile``.
    """
    state = _load()
    profile = dict(profile)
    profile.setdefault("activity_level", "moderate")
    profile.setdefault("goal", "maintain")

    weight_kg = float(profile["weight_kg"])
    height_cm = float(profile["height_cm"])
    age = int(profile["age"])
    gender = profile.get("Gender", "Male")

    bmi_raw = weight_kg / ((height_cm / 100.0) ** 2)
    bmi_band = bmi_to_band(bmi_raw)
    target_kcal = daily_target_kcal(
        age, weight_kg, height_cm, gender, profile["activity_level"], profile["goal"],
    )

    if not state:
        # No trained artefacts on disk: answer with the deterministic BMI rule.
        return {
            "obesity_class": bmi_band,
            "obesity_probabilities": {c: (1.0 if c == bmi_band else 0.0) for c in OBESITY_LEVELS},
            "predicted_bmi": round(bmi_raw, 2),
            "daily_target_kcal": round(target_kcal, 1),
            "favc_overridden": False,
            "bmi_raw": round(bmi_raw, 2),
            "bmi_band": bmi_band,
            "anomaly_flags": ["Models unavailable — falling back to BMI rule."],
            "ood_blended": False,
            "models": {"regressor": "untrained_fallback", "classifier": "untrained_fallback"},
        }

    feature_cols = state["metadata"]["feature_columns"]
    x = _build_feature_row(profile, feature_cols)

    # Remember the self-reported FAVC flag so a flip caused by the CV block
    # can be reported back to the caller.
    original_favc_yes = int(x["FAVC_yes"].iloc[0]) if "FAVC_yes" in x.columns else 0
    x = apply_favc_override(x, nutrition)
    overridden = (
        "FAVC_yes" in x.columns
        and int(x["FAVC_yes"].iloc[0]) == 1
        and original_favc_yes == 0
    )

    bmi_pred = float(state["regressor"].predict(x)[0])
    proba = state["classifier"].predict_proba(x)[0]

    # The label encoder sorts labels alphabetically, so classifier.classes_ is
    # in alphabetical order — not the severity-ranked order we author in
    # OBESITY_LEVELS. Map probabilities through the actual class names before
    # exposing them; the return value re-orders them to the canonical
    # sequence so downstream UIs render rows in severity order.
    proba_by_name, model_class = _label_probabilities(
        proba, state["classifier"], state.get("label_encoder"),
    )

    anomaly_flags = _detect_anomalies(
        weight_kg=weight_kg, height_cm=height_cm, age=age,
        bmi_raw=bmi_raw, bmi_band=bmi_band, model_class=model_class, gender=gender,
    )

    # Blend the classifier with the BMI band when the input is materially
    # OOD: weight/BMI outside the trained range, a 2-class severity gap with
    # the BMI band, or a known conditional rarity (Male × Obesity_Type_III:
    # only 1 male example in 324 Type_III rows). Adjacent-class
    # disagreements without an OOD signal stay untouched — those are
    # genuinely borderline cases and the model handles them well.
    wmin, wmax = TRAINING_WEIGHT_RANGE_KG
    bmin, bmax = TRAINING_BMI_RANGE
    weight_ood = weight_kg < wmin or weight_kg > wmax
    bmi_ood = bmi_raw < bmin or bmi_raw > bmax
    severity_gap = _class_index_gap(bmi_band, model_class)
    male_type3_rarity = (
        bmi_band == "Obesity_Type_III"
        and gender.lower().startswith("m")
        and model_class != "Obesity_Type_III"
    )
    needs_blend = (
        weight_ood
        or bmi_ood
        or severity_gap >= _OOD_SEVERITY_GAP
        or male_type3_rarity
    )

    final_proba = proba_by_name
    final_class = model_class
    blended = False
    if needs_blend:
        final_proba = _blend_with_bmi_band(proba_by_name, bmi_band, OBESITY_LEVELS)
        final_class = max(final_proba, key=final_proba.get)
        blended = True

    return {
        "obesity_class": final_class,
        "obesity_probabilities": {c: final_proba.get(c, 0.0) for c in OBESITY_LEVELS},
        "predicted_bmi": round(bmi_pred, 2),
        "daily_target_kcal": round(target_kcal, 1),
        "favc_overridden": bool(overridden),
        "bmi_raw": round(bmi_raw, 2),
        "bmi_band": bmi_band,
        "model_class": model_class,
        "model_probabilities": {c: proba_by_name.get(c, 0.0) for c in OBESITY_LEVELS},
        "anomaly_flags": anomaly_flags,
        "ood_blended": blended,
        "models": {
            "regressor": state["metadata"]["regressor"]["name"],
            "classifier": state["metadata"]["classifier"]["name"],
        },
    }