"""Numeric block — inference. Loads the trained regressor and classifier and returns: - a predicted obesity classification (7-class, with per-class probability), - a predicted BMI from the regression head, - a personalized daily calorie target derived from Mifflin-St Jeor. The CV-derived ``high_caloric_meal`` flag overrides the user's self-reported ``FAVC`` feature before inference. This is the load-bearing integration point between the computer-vision block and the numeric classifier. """ from __future__ import annotations import json from pathlib import Path from typing import Any import joblib import numpy as np import pandas as pd from .obesity import OBESITY_LEVELS, apply_favc_override from .profile import ( TRAINING_AGE_RANGE, TRAINING_BMI_RANGE, TRAINING_HEIGHT_RANGE_CM, TRAINING_WEIGHT_RANGE_KG, bmi_to_band, daily_target_kcal, ) # Severity gap (in class indices) at which we override the classifier # probabilities with the BMI-band rule. Adjacent-class disagreements # (gap=1) are tolerated — those are genuinely borderline cases. A gap of # 2+ means the model is materially wrong about severity, typically # because the conditional (gender, BMI) combination is rare in training. _OOD_SEVERITY_GAP = 2 # Mixing weight used when blending the classifier's probabilities with # a one-hot prior on the BMI-band class for OOD inputs. 
_OOD_BAND_PRIOR_WEIGHT = 0.7

MODELS_DIR = Path(__file__).resolve().parents[2] / "models"

# Process-wide cache of the loaded artifacts. It stays empty until every
# artifact has been loaded successfully (see ``_load``).
_state: dict[str, Any] = {}


def _load() -> dict[str, Any]:
    """Load the trained artifacts from ``MODELS_DIR`` and cache them.

    Returns
    -------
    dict
        The cached mapping with keys ``regressor``, ``classifier``,
        ``label_encoder`` (``None`` when the encoder file is absent) and
        ``metadata``. An empty dict is returned, and nothing is cached, when
        the regressor, classifier or metadata file is missing.
    """
    if _state:
        return _state

    reg_path = MODELS_DIR / "numeric_regressor.pkl"
    clf_path = MODELS_DIR / "numeric_classifier.pkl"
    enc_path = MODELS_DIR / "numeric_label_encoder.pkl"
    meta_path = MODELS_DIR / "numeric_metadata.json"

    if not (reg_path.exists() and clf_path.exists() and meta_path.exists()):
        return {}

    # NOTE(security): joblib.load unpickles arbitrary objects. These files
    # must only ever be artifacts produced by this project's own training run.
    #
    # Build the mapping locally and publish it in one step: if any load below
    # raises, ``_state`` stays empty and the next call retries cleanly instead
    # of returning a half-populated cache.
    loaded: dict[str, Any] = {
        "regressor": joblib.load(reg_path),
        "classifier": joblib.load(clf_path),
        "label_encoder": joblib.load(enc_path) if enc_path.exists() else None,
        "metadata": json.loads(meta_path.read_text(encoding="utf-8")),
    }
    _state.update(loaded)
    return _state


def _severity_gap(bmi_band: str, model_class: str) -> int:
    """Return the distance, in ``OBESITY_LEVELS`` positions, between two classes.

    Returns 0 when either name is not present in ``OBESITY_LEVELS`` so that an
    unexpected label never aborts inference.
    """
    try:
        return abs(OBESITY_LEVELS.index(bmi_band) - OBESITY_LEVELS.index(model_class))
    except ValueError:
        return 0


def _detect_anomalies(
    weight_kg: float,
    height_cm: float,
    age: int,
    bmi_raw: float,
    bmi_band: str,
    model_class: str,
    gender: str,
) -> list[str]:
    """Flag profile inputs that fall outside the classifier's training distribution.

    The UCI Obesity Levels dataset is a small (n=2111), partly synthetic
    sample with strong conditional skew (e.g., Obesity_Type_III is 99.7%
    Female). We surface anomalies so the UI can warn the user before treating
    the classifier output as gospel.

    Parameters
    ----------
    weight_kg, height_cm, age, bmi_raw
        Raw profile values, compared against the ``TRAINING_*`` ranges.
    bmi_band
        Class name derived from the BMI rule.
    model_class
        Class name predicted by the classifier.
    gender
        Self-reported gender; only its first letter (case-insensitive) is used.

    Returns
    -------
    list[str]
        Human-readable warnings; empty when nothing looks unusual.
    """
    flags: list[str] = []

    wmin, wmax = TRAINING_WEIGHT_RANGE_KG
    if weight_kg < wmin:
        flags.append(f"Weight {weight_kg:.0f} kg is below the trained range ({wmin:.0f}–{wmax:.0f} kg).")
    elif weight_kg > wmax:
        flags.append(f"Weight {weight_kg:.0f} kg is above the trained range ({wmin:.0f}–{wmax:.0f} kg).")

    hmin, hmax = TRAINING_HEIGHT_RANGE_CM
    if height_cm < hmin:
        flags.append(f"Height {height_cm:.0f} cm is below the trained range ({hmin:.0f}–{hmax:.0f} cm).")
    elif height_cm > hmax:
        flags.append(f"Height {height_cm:.0f} cm is above the trained range ({hmin:.0f}–{hmax:.0f} cm).")

    bmin, bmax = TRAINING_BMI_RANGE
    if bmi_raw < bmin or bmi_raw > bmax:
        flags.append(f"BMI {bmi_raw:.1f} is outside the trained range ({bmin:.1f}–{bmax:.1f}).")

    amin, amax = TRAINING_AGE_RANGE
    if age < amin or age > amax:
        flags.append(f"Age {age} is outside the trained range ({amin}–{amax} years).")

    # Severity disagreement (gap in class index) — most telling for the
    # known Male × Obesity_Type_III gap in training.
    gap = _severity_gap(bmi_band, model_class)
    if gap >= _OOD_SEVERITY_GAP:
        flags.append(
            f"Classifier says **{model_class.replace('_', ' ')}** but BMI "
            f"{bmi_raw:.1f} falls into **{bmi_band.replace('_', ' ')}** "
            f"— a {gap}-class gap."
        )

    # Conditionally-rare combination: Obesity_Type_III is almost entirely
    # female in the training data, so a male profile whose BMI band is
    # Type_III is unreliable territory whatever the classifier predicted.
    if bmi_band == "Obesity_Type_III" and gender.lower().startswith("m"):
        flags.append(
            "Obesity Type III training data is 99.7% female — male predictions "
            "at this BMI are extrapolations."
        )

    return flags


def _blend_with_bmi_band(
    proba_by_name: dict[str, float],
    bmi_band: str,
    classes: list[str],
) -> dict[str, float]:
    """Mix the classifier output with a one-hot prior centered on ``bmi_band``.

    Used only when the classifier disagrees with the BMI rule by a wide
    margin. The blend is deterministic (no training data needed) and keeps
    the order of ``classes``, so the visible probability bars still tell a
    coherent story.

    Parameters
    ----------
    proba_by_name
        Classifier probability per class name; missing names count as 0.
    bmi_band
        Class that receives the prior mass ``_OOD_BAND_PRIOR_WEIGHT``.
    classes
        Class names to emit, in output order.

    Returns
    -------
    dict[str, float]
        Blended probabilities renormalised to sum to 1. They are left
        unscaled when every blended value is 0.
    """
    prior_weight = _OOD_BAND_PRIOR_WEIGHT
    blended = {
        name: (1.0 - prior_weight) * proba_by_name.get(name, 0.0)
        + prior_weight * (1.0 if name == bmi_band else 0.0)
        for name in classes
    }
    # If ``bmi_band`` is not among ``classes`` the prior mass is lost; the
    # renormalisation restores a proper distribution. ``or 1.0`` avoids
    # dividing by zero when everything is 0.
    total = sum(blended.values()) or 1.0
    return {name: value / total for name, value in blended.items()}


def _build_feature_row(profile: dict, feature_columns: list[str]) -> pd.DataFrame:
    """Map the user's form input to a single-row DataFrame matching training columns.

    Every column starts at 0. Numeric features are copied in (height is
    converted from cm to metres) and each categorical answer sets its one-hot
    column ``<feature>_<value>`` to 1. Features or one-hot columns that are not
    part of ``feature_columns`` are silently skipped.
    """
    row: dict[str, Any] = dict.fromkeys(feature_columns, 0)

    numeric = {
        "Age": float(profile.get("age", 30)),
        "Height": float(profile.get("height_cm", 170)) / 100.0,
        "Weight": float(profile.get("weight_kg", 70)),
        "FCVC": float(profile.get("FCVC", 2.0)),
        "NCP": float(profile.get("NCP", 3.0)),
        "CH2O": float(profile.get("CH2O", 2.0)),
        "FAF": float(profile.get("FAF", 1.0)),
        "TUE": float(profile.get("TUE", 1.0)),
    }
    for column, value in numeric.items():
        if column in row:
            row[column] = value

    categorical = {
        "Gender": profile.get("Gender", "Male"),
        "family_history_with_overweight": profile.get("family_history_with_overweight", "no"),
        "FAVC": profile.get("FAVC", "no"),
        "CAEC": profile.get("CAEC", "Sometimes"),
        "SMOKE": profile.get("SMOKE", "no"),
        "SCC": profile.get("SCC", "no"),
        "CALC": profile.get("CALC", "no"),
        "MTRANS": profile.get("MTRANS", "Public_Transportation"),
    }
    for prefix, value in categorical.items():
        one_hot_column = f"{prefix}_{value}"
        if one_hot_column in row:
            row[one_hot_column] = 1

    return pd.DataFrame([row], columns=feature_columns)


def _name_probabilities(
    proba: Any,
    classifier: Any,
    encoder: Any,
) -> tuple[dict[str, float], str]:
    """Attach class names to one ``predict_proba`` row.

    Name resolution, in order of preference:

    1. ``encoder.classes_`` when a label encoder is available;
    2. ``classifier.classes_`` when those are already strings;
    3. positional mapping onto ``OBESITY_LEVELS`` (legacy fallback).

    Returns
    -------
    tuple[dict[str, float], str]
        ``(probability per class name, name of the most probable class)``.
    """
    if encoder is not None:
        names = [str(c) for c in encoder.classes_]
    else:
        raw_classes = getattr(classifier, "classes_", None)
        if (
            raw_classes is not None
            and len(raw_classes) > 0
            and all(isinstance(c, str) for c in raw_classes)
        ):
            # The classifier was fitted on string labels, so its own
            # ``classes_`` gives the true column order of ``predict_proba``.
            names = [str(c) for c in raw_classes]
        else:
            # NOTE(review): assumes the model was trained on labels encoded in
            # OBESITY_LEVELS order — confirm against the training script.
            names = list(OBESITY_LEVELS)

    proba_by_name = {name: float(p) for name, p in zip(names, proba)}
    model_class = names[int(np.argmax(proba))]
    return proba_by_name, model_class


def predict(profile: dict, nutrition: dict | None = None) -> dict:
    """Run the numeric pipeline for one user.

    Parameters
    ----------
    profile : dict
        Form inputs. ``weight_kg``, ``height_cm`` and ``age`` are required;
        ``Gender`` (default ``"Male"``), ``activity_level`` (default
        ``"moderate"``), ``goal`` (default ``"maintain"``) and the habit
        answers are optional. The caller's dict is not modified.
    nutrition : dict | None
        Output of the CV block. It is passed to ``apply_favc_override``
        together with the feature row before the models run.

    Returns
    -------
    dict
        ``obesity_class`` / ``obesity_probabilities`` (final, possibly
        blended), ``model_class`` / ``model_probabilities`` (raw classifier
        output; ``None`` and zeros when the models are unavailable),
        ``predicted_bmi``, ``daily_target_kcal``, ``favc_overridden``,
        ``bmi_raw``, ``bmi_band``, ``anomaly_flags``, ``ood_blended`` and
        ``models``.

    Raises
    ------
    KeyError
        If ``weight_kg``, ``height_cm`` or ``age`` is missing from ``profile``.
    """
    state = _load()

    profile = dict(profile)  # work on a copy so defaults never leak to the caller
    profile.setdefault("activity_level", "moderate")
    profile.setdefault("goal", "maintain")

    weight_kg = float(profile["weight_kg"])
    height_cm = float(profile["height_cm"])
    age = int(profile["age"])
    gender = profile.get("Gender", "Male")

    bmi_raw = weight_kg / ((height_cm / 100.0) ** 2)
    bmi_band = bmi_to_band(bmi_raw)
    target_kcal = daily_target_kcal(
        age,
        weight_kg,
        height_cm,
        gender,
        profile["activity_level"],
        profile["goal"],
    )

    if not state:
        # No trained artifacts: answer from the BMI rule alone, but keep the
        # same key set as the model-backed result so consumers need no
        # special-casing.
        return {
            "obesity_class": bmi_band,
            "obesity_probabilities": {c: (1.0 if c == bmi_band else 0.0) for c in OBESITY_LEVELS},
            "predicted_bmi": round(bmi_raw, 2),
            "daily_target_kcal": round(target_kcal, 1),
            "favc_overridden": False,
            "bmi_raw": round(bmi_raw, 2),
            "bmi_band": bmi_band,
            "model_class": None,
            "model_probabilities": {c: 0.0 for c in OBESITY_LEVELS},
            "anomaly_flags": ["Models unavailable — falling back to BMI rule."],
            "ood_blended": False,
            "models": {"regressor": "untrained_fallback", "classifier": "untrained_fallback"},
        }

    feature_cols = state["metadata"]["feature_columns"]
    x = _build_feature_row(profile, feature_cols)

    # Remember the self-reported FAVC so we can tell whether the override
    # actually flipped it from 0 to 1.
    favc_before = int(x["FAVC_yes"].iloc[0]) if "FAVC_yes" in x.columns else 0
    x = apply_favc_override(x, nutrition)
    overridden = (
        "FAVC_yes" in x.columns
        and int(x["FAVC_yes"].iloc[0]) == 1
        and favc_before == 0
    )

    bmi_pred = float(state["regressor"].predict(x)[0])
    proba = state["classifier"].predict_proba(x)[0]

    # The label encoder sorts labels alphabetically, so the probability
    # columns are in alphabetical order — not the severity-ranked order we
    # author in OBESITY_LEVELS. Attach real class names first; the returned
    # dicts are re-ordered to the canonical sequence further down so
    # downstream UIs render rows in severity order.
    proba_by_name, model_class = _name_probabilities(
        proba, state["classifier"], state.get("label_encoder")
    )

    anomaly_flags = _detect_anomalies(
        weight_kg=weight_kg,
        height_cm=height_cm,
        age=age,
        bmi_raw=bmi_raw,
        bmi_band=bmi_band,
        model_class=model_class,
        gender=gender,
    )

    # Override the classifier when the input is materially OOD: weight/BMI
    # outside the trained range, a 2-class severity gap with the BMI band,
    # or a known conditional rarity (Male × Obesity_Type_III: only 1 male
    # example in 324 Type_III rows). Adjacent-class disagreements without
    # an OOD signal stay untouched — those are genuinely borderline cases
    # and the model handles them well.
    weight_ood = weight_kg < TRAINING_WEIGHT_RANGE_KG[0] or weight_kg > TRAINING_WEIGHT_RANGE_KG[1]
    bmi_ood = bmi_raw < TRAINING_BMI_RANGE[0] or bmi_raw > TRAINING_BMI_RANGE[1]
    severity_gap = _severity_gap(bmi_band, model_class)
    male_type3_rarity = (
        bmi_band == "Obesity_Type_III"
        and gender.lower().startswith("m")
        and model_class != "Obesity_Type_III"
    )
    needs_blend = (
        weight_ood
        or bmi_ood
        or severity_gap >= _OOD_SEVERITY_GAP
        or male_type3_rarity
    )

    final_proba = proba_by_name
    final_class = model_class
    blended = False
    if needs_blend:
        final_proba = _blend_with_bmi_band(proba_by_name, bmi_band, OBESITY_LEVELS)
        final_class = max(final_proba, key=final_proba.get)
        blended = True

    return {
        "obesity_class": final_class,
        "obesity_probabilities": {c: final_proba.get(c, 0.0) for c in OBESITY_LEVELS},
        "predicted_bmi": round(bmi_pred, 2),
        "daily_target_kcal": round(target_kcal, 1),
        "favc_overridden": bool(overridden),
        "bmi_raw": round(bmi_raw, 2),
        "bmi_band": bmi_band,
        "model_class": model_class,
        "model_probabilities": {c: proba_by_name.get(c, 0.0) for c in OBESITY_LEVELS},
        "anomaly_flags": anomaly_flags,
        "ood_blended": blended,
        "models": {
            "regressor": state["metadata"]["regressor"]["name"],
            "classifier": state["metadata"]["classifier"]["name"],
        },
    }