File size: 11,831 Bytes
bf68d2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0bce3b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf68d2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0bce3b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf68d2b
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0bce3b4
 
 
 
 
 
 
 
bf68d2b
0bce3b4
bf68d2b
 
 
 
0bce3b4
 
 
bf68d2b
 
0bce3b4
 
 
 
bf68d2b
 
 
 
 
 
 
 
 
 
 
1e3c549
 
 
 
 
 
 
 
 
 
 
 
0bce3b4
1e3c549
 
 
 
0bce3b4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
bf68d2b
 
0bce3b4
 
bf68d2b
 
 
0bce3b4
 
 
 
 
 
bf68d2b
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
"""Numeric block β€” inference.

Loads the trained regressor and classifier and returns:

- a predicted obesity classification (7-class, with per-class
  probability),
- a predicted BMI from the regression head,
- a personalized daily calorie target derived from Mifflin-St Jeor.

The CV-derived ``high_caloric_meal`` flag overrides the user's
self-reported ``FAVC`` feature before inference. This is the load-bearing
integration point between the computer-vision block and the numeric
classifier.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any

import joblib
import numpy as np
import pandas as pd

from .obesity import OBESITY_LEVELS, apply_favc_override
from .profile import (
    TRAINING_AGE_RANGE,
    TRAINING_BMI_RANGE,
    TRAINING_HEIGHT_RANGE_CM,
    TRAINING_WEIGHT_RANGE_KG,
    bmi_to_band,
    daily_target_kcal,
)

# Severity gap (in class indices) at which we blend the classifier
# probabilities with the BMI-band rule. Adjacent-class disagreements
# (gap=1) are tolerated — those are genuinely borderline cases. A gap of
# 2+ means the model is materially wrong about severity, typically
# because the conditional (gender, BMI) combination is rare in training.
_OOD_SEVERITY_GAP = 2

# Mixing weight used when blending the classifier's probabilities with
# a one-hot prior on the BMI-band class for OOD inputs.
_OOD_BAND_PRIOR_WEIGHT = 0.7

# Directory holding the persisted model artifacts: the ``models`` folder
# under the third ancestor directory of this file (``parents[2]``).
MODELS_DIR = Path(__file__).resolve().parents[2] / "models"

# Process-wide cache of loaded artifacts; filled by ``_load`` on first use
# and empty until then.
_state: dict[str, Any] = {}


def _load() -> dict[str, Any]:
    """Load the persisted model artifacts once and cache them in ``_state``.

    Returns
    -------
    dict[str, Any]
        The module-level cache with keys ``regressor``, ``classifier``,
        ``label_encoder`` (``None`` when its pickle is absent) and
        ``metadata``. An empty dict is returned when the regressor,
        classifier or metadata file is missing; nothing is cached in that
        case, so the files are re-checked on the next call.
    """
    if _state:
        return _state

    reg_path = MODELS_DIR / "numeric_regressor.pkl"
    clf_path = MODELS_DIR / "numeric_classifier.pkl"
    enc_path = MODELS_DIR / "numeric_label_encoder.pkl"
    meta_path = MODELS_DIR / "numeric_metadata.json"
    if not (reg_path.exists() and clf_path.exists() and meta_path.exists()):
        return {}

    # Build the result locally and publish it in one step. If any load
    # raises, the cache stays empty instead of being left half-filled (a
    # half-filled cache is truthy and would be returned on the next call
    # with keys missing).
    # NOTE(review): joblib.load unpickles arbitrary objects — only trusted
    # artifacts should ever be placed in MODELS_DIR.
    loaded: dict[str, Any] = {
        "regressor": joblib.load(reg_path),
        "classifier": joblib.load(clf_path),
        "label_encoder": joblib.load(enc_path) if enc_path.exists() else None,
        "metadata": json.loads(meta_path.read_text(encoding="utf-8")),
    }
    _state.update(loaded)
    return _state


def _detect_anomalies(
    weight_kg: float,
    height_cm: float,
    age: int,
    bmi_raw: float,
    bmi_band: str,
    model_class: str,
    gender: str,
) -> list[str]:
    """Flag profile inputs that fall outside the classifier's training distribution.

    The UCI Obesity Levels dataset is a small (n=2111), partly synthetic
    sample with strong conditional skew (e.g., Obesity_Type_III is 99.7%
    Female). We surface anomalies so the UI can warn the user before
    treating the classifier output as gospel.

    Parameters
    ----------
    weight_kg, height_cm, age
        Raw profile values, compared against the ``TRAINING_*`` ranges.
    bmi_raw
        BMI computed directly from weight and height.
    bmi_band
        Obesity level implied by ``bmi_raw``.
    model_class
        Obesity level predicted by the classifier.
    gender
        Profile gender; only its first letter (case-insensitive) is used.

    Returns
    -------
    list[str]
        Human-readable warnings, possibly empty. The severity-gap warning
        uses Markdown bold (``**...**``).
    """
    flags: list[str] = []

    wmin, wmax = TRAINING_WEIGHT_RANGE_KG
    if weight_kg < wmin:
        flags.append(f"Weight {weight_kg:.0f} kg is below the trained range ({wmin:.0f}–{wmax:.0f} kg).")
    elif weight_kg > wmax:
        flags.append(f"Weight {weight_kg:.0f} kg is above the trained range ({wmin:.0f}–{wmax:.0f} kg).")

    hmin, hmax = TRAINING_HEIGHT_RANGE_CM
    if height_cm < hmin:
        flags.append(f"Height {height_cm:.0f} cm is below the trained range ({hmin:.0f}–{hmax:.0f} cm).")
    elif height_cm > hmax:
        flags.append(f"Height {height_cm:.0f} cm is above the trained range ({hmin:.0f}–{hmax:.0f} cm).")

    bmin, bmax = TRAINING_BMI_RANGE
    if bmi_raw < bmin or bmi_raw > bmax:
        flags.append(f"BMI {bmi_raw:.1f} is outside the trained range ({bmin:.1f}–{bmax:.1f}).")

    amin, amax = TRAINING_AGE_RANGE
    if age < amin or age > amax:
        flags.append(f"Age {age} is outside the trained range ({amin}–{amax} years).")

    # Severity disagreement (gap in class index) — most telling for the
    # known Male × Obesity_Type_III gap in training. A label missing from
    # OBESITY_LEVELS cannot be ranked, so it is treated as "no gap".
    try:
        gap = abs(OBESITY_LEVELS.index(bmi_band) - OBESITY_LEVELS.index(model_class))
    except ValueError:
        gap = 0
    if gap >= _OOD_SEVERITY_GAP:
        flags.append(
            f"Classifier says **{model_class.replace('_', ' ')}** but BMI "
            f"{bmi_raw:.1f} falls into **{bmi_band.replace('_', ' ')}** "
            f"— a {gap}-class gap."
        )

    # Conditionally-rare combination: Obesity_Type_III is almost entirely
    # female in the training data, so a male profile whose BMI band is
    # Type_III is unreliable territory.
    if bmi_band == "Obesity_Type_III" and gender.lower().startswith("m"):
        flags.append(
            "Obesity Type III training data is 99.7% female — male predictions "
            "at this BMI are extrapolations."
        )

    return flags


def _blend_with_bmi_band(
    proba_by_name: dict[str, float],
    bmi_band: str,
    classes: list[str],
) -> dict[str, float]:
    """Mix the classifier output with a one-hot prior centered on ``bmi_band``.

    Used only when the classifier disagrees with the BMI rule by a wide
    margin. The blend is deterministic (no training data needed) and
    preserves order, so the visible probability bars still tell a
    coherent story.
    """
    w = _OOD_BAND_PRIOR_WEIGHT
    blended: dict[str, float] = {}
    for c in classes:
        prior = 1.0 if c == bmi_band else 0.0
        blended[c] = (1.0 - w) * proba_by_name.get(c, 0.0) + w * prior
    s = sum(blended.values()) or 1.0
    return {c: v / s for c, v in blended.items()}


def _build_feature_row(profile: dict, feature_columns: list[str]) -> pd.DataFrame:
    """Map the user's form input to a single-row DataFrame matching training columns."""
    row: dict[str, Any] = {col: 0 for col in feature_columns}

    numeric = {
        "Age": float(profile.get("age", 30)),
        "Height": float(profile.get("height_cm", 170)) / 100.0,
        "Weight": float(profile.get("weight_kg", 70)),
        "FCVC": float(profile.get("FCVC", 2.0)),
        "NCP": float(profile.get("NCP", 3.0)),
        "CH2O": float(profile.get("CH2O", 2.0)),
        "FAF": float(profile.get("FAF", 1.0)),
        "TUE": float(profile.get("TUE", 1.0)),
    }
    for k, v in numeric.items():
        if k in row:
            row[k] = v

    categorical = {
        "Gender": profile.get("Gender", "Male"),
        "family_history_with_overweight": profile.get("family_history_with_overweight", "no"),
        "FAVC": profile.get("FAVC", "no"),
        "CAEC": profile.get("CAEC", "Sometimes"),
        "SMOKE": profile.get("SMOKE", "no"),
        "SCC": profile.get("SCC", "no"),
        "CALC": profile.get("CALC", "no"),
        "MTRANS": profile.get("MTRANS", "Public_Transportation"),
    }
    for prefix, value in categorical.items():
        target_col = f"{prefix}_{value}"
        if target_col in row:
            row[target_col] = 1

    return pd.DataFrame([row], columns=feature_columns)


def predict(profile: dict, nutrition: dict | None = None) -> dict:
    """Run the numeric pipeline for one user.

    Parameters
    ----------
    profile : dict
        Form inputs. ``weight_kg``, ``height_cm`` and ``age`` are required;
        ``Gender`` (default ``"Male"``), ``activity_level`` (default
        ``"moderate"``), ``goal`` (default ``"maintain"``) and the habit
        answers are optional. The caller's dict is not modified.
    nutrition : dict | None
        Output of the CV block. When provided and the meal is
        high-caloric, the FAVC feature is overridden upstream of the
        classifier.

    Returns
    -------
    dict
        ``obesity_class`` / ``obesity_probabilities`` (final, possibly
        blended with the BMI band), ``model_class`` /
        ``model_probabilities`` (raw classifier output), ``predicted_bmi``,
        ``bmi_raw``, ``bmi_band``, ``daily_target_kcal``,
        ``favc_overridden``, ``anomaly_flags``, ``ood_blended`` and
        ``models``. When no trained models are available the BMI rule
        alone fills every class/probability field.

    Raises
    ------
    KeyError
        If ``weight_kg``, ``height_cm`` or ``age`` is missing from ``profile``.
    ZeroDivisionError
        If ``height_cm`` is 0.
    """
    state = _load()
    profile = dict(profile)  # shallow copy: the defaults below must not leak to the caller
    profile.setdefault("activity_level", "moderate")
    profile.setdefault("goal", "maintain")

    weight_kg = float(profile["weight_kg"])
    height_cm = float(profile["height_cm"])
    age = int(profile["age"])
    gender = profile.get("Gender", "Male")
    bmi_raw = weight_kg / ((height_cm / 100.0) ** 2)
    bmi_band = bmi_to_band(bmi_raw)

    target_kcal = daily_target_kcal(
        age, weight_kg, height_cm, gender, profile["activity_level"], profile["goal"],
    )

    if not state:
        rule_proba = {c: (1.0 if c == bmi_band else 0.0) for c in OBESITY_LEVELS}
        return {
            "obesity_class": bmi_band,
            "obesity_probabilities": rule_proba,
            "predicted_bmi": round(bmi_raw, 2),
            "daily_target_kcal": round(target_kcal, 1),
            "favc_overridden": False,
            "bmi_raw": round(bmi_raw, 2),
            "bmi_band": bmi_band,
            # No classifier ran, so the "model" fields mirror the BMI rule.
            # They are present so both return paths expose the same keys.
            "model_class": bmi_band,
            "model_probabilities": dict(rule_proba),
            "anomaly_flags": ["Models unavailable — falling back to BMI rule."],
            "ood_blended": False,
            "models": {"regressor": "untrained_fallback", "classifier": "untrained_fallback"},
        }

    feature_cols = state["metadata"]["feature_columns"]
    x = _build_feature_row(profile, feature_cols)
    # Remember the self-reported FAVC before the CV override so we can
    # report whether the override actually flipped it from 0 to 1.
    original_favc_yes = int(x["FAVC_yes"].iloc[0]) if "FAVC_yes" in x.columns else 0
    x = apply_favc_override(x, nutrition)
    overridden = (
        "FAVC_yes" in x.columns
        and int(x["FAVC_yes"].iloc[0]) == 1
        and original_favc_yes == 0
    )

    bmi_pred = float(state["regressor"].predict(x)[0])
    proba = state["classifier"].predict_proba(x)[0]

    # The label encoder sorts labels alphabetically, so classifier.classes_ is
    # in alphabetical order — not the severity-ranked order we author in
    # OBESITY_LEVELS. Map probabilities through the encoder's actual class
    # names before exposing them, then re-order to the canonical sequence so
    # downstream UIs render rows in severity order.
    encoder = state.get("label_encoder")
    if encoder is not None:
        class_names = [str(c) for c in encoder.classes_]
    else:
        # NOTE(review): without an encoder we assume the classifier's output
        # columns already follow OBESITY_LEVELS order — confirm at training time.
        class_names = list(OBESITY_LEVELS)
    proba_by_name = {name: float(proba[i]) for i, name in enumerate(class_names)}
    model_class = class_names[int(np.argmax(proba))]

    anomaly_flags = _detect_anomalies(
        weight_kg=weight_kg, height_cm=height_cm, age=age,
        bmi_raw=bmi_raw, bmi_band=bmi_band, model_class=model_class, gender=gender,
    )

    # Override the classifier when the input is materially OOD: weight/BMI
    # outside the trained range, a 2-class severity gap with the BMI band,
    # or a known conditional rarity (Male × Obesity_Type_III: only 1 male
    # example in 324 Type_III rows). Adjacent-class disagreements without
    # an OOD signal stay untouched — those are genuinely borderline cases
    # and the model handles them well.
    weight_ood = weight_kg < TRAINING_WEIGHT_RANGE_KG[0] or weight_kg > TRAINING_WEIGHT_RANGE_KG[1]
    bmi_ood = bmi_raw < TRAINING_BMI_RANGE[0] or bmi_raw > TRAINING_BMI_RANGE[1]
    # A label missing from OBESITY_LEVELS cannot be ranked; treat it as "no
    # gap" (the same convention _detect_anomalies uses) instead of raising.
    try:
        severity_gap = abs(OBESITY_LEVELS.index(bmi_band) - OBESITY_LEVELS.index(model_class))
    except ValueError:
        severity_gap = 0
    male_type3_rarity = (
        bmi_band == "Obesity_Type_III"
        and gender.lower().startswith("m")
        and model_class != "Obesity_Type_III"
    )
    needs_blend = (
        weight_ood
        or bmi_ood
        or severity_gap >= _OOD_SEVERITY_GAP
        or male_type3_rarity
    )

    final_proba = proba_by_name
    final_class = model_class
    blended = False
    if needs_blend:
        final_proba = _blend_with_bmi_band(proba_by_name, bmi_band, OBESITY_LEVELS)
        final_class = max(final_proba, key=final_proba.get)
        blended = True

    return {
        "obesity_class": final_class,
        "obesity_probabilities": {c: final_proba.get(c, 0.0) for c in OBESITY_LEVELS},
        "predicted_bmi": round(bmi_pred, 2),
        "daily_target_kcal": round(target_kcal, 1),
        "favc_overridden": bool(overridden),
        "bmi_raw": round(bmi_raw, 2),
        "bmi_band": bmi_band,
        "model_class": model_class,
        "model_probabilities": {c: proba_by_name.get(c, 0.0) for c in OBESITY_LEVELS},
        "anomaly_flags": anomaly_flags,
        "ood_blended": blended,
        "models": {
            "regressor": state["metadata"]["regressor"]["name"],
            "classifier": state["metadata"]["classifier"]["name"],
        },
    }