Spaces:

adisaljusi
/

forkcast

Sleeping

File size: 11,831 Bytes

"""Numeric block — inference.

Loads the trained regressor and classifier and returns:

- a predicted obesity classification (7-class, with per-class
  probability),
- a predicted BMI from the regression head,
- a personalized daily calorie target derived from Mifflin-St Jeor.

The CV-derived ``high_caloric_meal`` flag overrides the user's
self-reported ``FAVC`` feature before inference. This is the load-bearing
integration point between the computer-vision block and the numeric
classifier.
"""

from __future__ import annotations

import json
from pathlib import Path
from typing import Any

import joblib
import numpy as np
import pandas as pd

from .obesity import OBESITY_LEVELS, apply_favc_override
from .profile import (
    TRAINING_AGE_RANGE,
    TRAINING_BMI_RANGE,
    TRAINING_HEIGHT_RANGE_CM,
    TRAINING_WEIGHT_RANGE_KG,
    bmi_to_band,
    daily_target_kcal,
)

# Minimum severity gap (difference in OBESITY_LEVELS index positions)
# between the classifier's top class and the BMI band at which the
# disagreement is flagged as an anomaly and the classifier probabilities
# are blended with the BMI-band prior. Adjacent-class disagreements
# (gap=1) are tolerated — those are genuinely borderline cases. A gap of
# 2+ means the model is materially wrong about severity, typically
# because the conditional (gender, BMI) combination is rare in training.
_OOD_SEVERITY_GAP = 2

# Mixing weight given to the one-hot BMI-band prior when blending it with
# the classifier's probabilities for OOD inputs; the classifier output
# keeps the remaining (1 - weight) share.
_OOD_BAND_PRIOR_WEIGHT = 0.7

# Directory holding the serialized model artifacts: the "models" folder
# under the third ancestor (parents[2]) of this file's resolved path.
MODELS_DIR = Path(__file__).resolve().parents[2] / "models"

# Module-level cache filled by ``_load`` on first successful load; an
# empty dict means nothing has been loaded yet.
_state: dict[str, Any] = {}


def _load() -> dict[str, Any]:
    """Load the model artifacts once and cache them in ``_state``.

    Returns
    -------
    dict[str, Any]
        The populated cache with keys ``regressor``, ``classifier``,
        ``label_encoder`` (``None`` when the encoder file is absent) and
        ``metadata``. An empty dict is returned when the regressor,
        classifier or metadata file is missing; nothing is cached in that
        case, so a later call re-checks the filesystem.

    Notes
    -----
    Artifacts are read into a local dict and published to ``_state`` only
    after every one of them loaded successfully. If a load raises, the
    exception propagates and the cache stays empty instead of being left
    half-filled (which would make every later call return an incomplete
    state).
    """
    if _state:
        return _state

    reg_path = MODELS_DIR / "numeric_regressor.pkl"
    clf_path = MODELS_DIR / "numeric_classifier.pkl"
    enc_path = MODELS_DIR / "numeric_label_encoder.pkl"
    meta_path = MODELS_DIR / "numeric_metadata.json"

    required = (reg_path, clf_path, meta_path)
    if not all(path.exists() for path in required):
        return {}

    # NOTE(security): joblib.load unpickles arbitrary objects; these files
    # must only ever come from a trusted training run.
    loaded: dict[str, Any] = {
        "regressor": joblib.load(reg_path),
        "classifier": joblib.load(clf_path),
        # The encoder is optional; callers handle ``None``.
        "label_encoder": joblib.load(enc_path) if enc_path.exists() else None,
        # Read as UTF-8 explicitly rather than relying on the locale default.
        "metadata": json.loads(meta_path.read_text(encoding="utf-8")),
    }

    # Publish atomically: either every key is present or none is.
    _state.update(loaded)
    return _state


def _detect_anomalies(
    weight_kg: float,
    height_cm: float,
    age: int,
    bmi_raw: float,
    bmi_band: str,
    model_class: str,
    gender: str,
) -> list[str]:
    """Collect human-readable warnings for inputs the classifier may not handle well.

    The UCI Obesity Levels dataset is a small (n=2111), partly synthetic
    sample with strong conditional skew (e.g., Obesity_Type_III is 99.7%
    Female), so the UI is given these warnings to show alongside the
    classifier output.

    Parameters
    ----------
    weight_kg, height_cm, age, bmi_raw
        Profile values, each compared against the matching ``TRAINING_*``
        range.
    bmi_band
        Obesity level implied by the BMI rule.
    model_class
        Obesity level the classifier ranked highest.
    gender
        Free-form gender string; only whether it starts with "m"
        (case-insensitive) is used.

    Returns
    -------
    list[str]
        Warning messages in a fixed order: weight, height, BMI, age,
        severity gap, male × Type III. Empty when nothing is unusual.
    """

    def _warnings():
        # Weight and height report which side of the range was crossed.
        wmin, wmax = TRAINING_WEIGHT_RANGE_KG
        if weight_kg < wmin:
            yield f"Weight {weight_kg:.0f} kg is below the trained range ({wmin:.0f}–{wmax:.0f} kg)."
        elif weight_kg > wmax:
            yield f"Weight {weight_kg:.0f} kg is above the trained range ({wmin:.0f}–{wmax:.0f} kg)."

        hmin, hmax = TRAINING_HEIGHT_RANGE_CM
        if height_cm < hmin:
            yield f"Height {height_cm:.0f} cm is below the trained range ({hmin:.0f}–{hmax:.0f} cm)."
        elif height_cm > hmax:
            yield f"Height {height_cm:.0f} cm is above the trained range ({hmin:.0f}–{hmax:.0f} cm)."

        # BMI and age only report "outside", without a direction.
        bmin, bmax = TRAINING_BMI_RANGE
        if bmi_raw < bmin or bmi_raw > bmax:
            yield f"BMI {bmi_raw:.1f} is outside the trained range ({bmin:.1f}–{bmax:.1f})."

        amin, amax = TRAINING_AGE_RANGE
        if age < amin or age > amax:
            yield f"Age {age} is outside the trained range ({amin}–{amax} years)."

        # Distance between the two labels in OBESITY_LEVELS; a label that
        # is not in the list counts as "no gap" rather than an error.
        try:
            gap = abs(OBESITY_LEVELS.index(bmi_band) - OBESITY_LEVELS.index(model_class))
        except ValueError:
            gap = 0
        if gap >= _OOD_SEVERITY_GAP:
            yield (
                f"Classifier says **{model_class.replace('_', ' ')}** but BMI "
                f"{bmi_raw:.1f} falls into **{bmi_band.replace('_', ' ')}** "
                f"— a {gap}-class gap."
            )

        # Known conditional rarity. This keys off the BMI band only; the
        # classifier's own prediction is not consulted here.
        is_male = gender.lower().startswith("m")
        if bmi_band == "Obesity_Type_III" and is_male:
            yield (
                "Obesity Type III training data is 99.7% female — male predictions "
                "at this BMI are extrapolations."
            )

    return list(_warnings())


def _blend_with_bmi_band(
    proba_by_name: dict[str, float],
    bmi_band: str,
    classes: list[str],
) -> dict[str, float]:
    """Mix the classifier output with a one-hot prior centered on ``bmi_band``.

    Used only when the classifier disagrees with the BMI rule by a wide
    margin. The blend is deterministic (no training data needed) and
    preserves order, so the visible probability bars still tell a
    coherent story.
    """
    w = _OOD_BAND_PRIOR_WEIGHT
    blended: dict[str, float] = {}
    for c in classes:
        prior = 1.0 if c == bmi_band else 0.0
        blended[c] = (1.0 - w) * proba_by_name.get(c, 0.0) + w * prior
    s = sum(blended.values()) or 1.0
    return {c: v / s for c, v in blended.items()}


def _build_feature_row(profile: dict, feature_columns: list[str]) -> pd.DataFrame:
    """Map the user's form input to a single-row DataFrame matching training columns."""
    row: dict[str, Any] = {col: 0 for col in feature_columns}

    numeric = {
        "Age": float(profile.get("age", 30)),
        "Height": float(profile.get("height_cm", 170)) / 100.0,
        "Weight": float(profile.get("weight_kg", 70)),
        "FCVC": float(profile.get("FCVC", 2.0)),
        "NCP": float(profile.get("NCP", 3.0)),
        "CH2O": float(profile.get("CH2O", 2.0)),
        "FAF": float(profile.get("FAF", 1.0)),
        "TUE": float(profile.get("TUE", 1.0)),
    }
    for k, v in numeric.items():
        if k in row:
            row[k] = v

    categorical = {
        "Gender": profile.get("Gender", "Male"),
        "family_history_with_overweight": profile.get("family_history_with_overweight", "no"),
        "FAVC": profile.get("FAVC", "no"),
        "CAEC": profile.get("CAEC", "Sometimes"),
        "SMOKE": profile.get("SMOKE", "no"),
        "SCC": profile.get("SCC", "no"),
        "CALC": profile.get("CALC", "no"),
        "MTRANS": profile.get("MTRANS", "Public_Transportation"),
    }
    for prefix, value in categorical.items():
        target_col = f"{prefix}_{value}"
        if target_col in row:
            row[target_col] = 1

    return pd.DataFrame([row], columns=feature_columns)


def _severity_gap(bmi_band: str, model_class: str) -> int:
    """Return how many ``OBESITY_LEVELS`` positions separate two labels (0 if either is unknown)."""
    try:
        return abs(OBESITY_LEVELS.index(bmi_band) - OBESITY_LEVELS.index(model_class))
    except ValueError:
        return 0


def _probabilities_by_name(classifier: Any, encoder: Any, proba: Any) -> tuple[dict[str, float], str]:
    """Attach class names to a ``predict_proba`` row and return ``(mapping, top class)``."""
    if encoder is not None:
        # The label encoder sorts labels alphabetically, so the probability
        # columns follow the encoder's order, not the severity-ranked
        # order of OBESITY_LEVELS.
        class_names = [str(c) for c in encoder.classes_]
    else:
        fitted = getattr(classifier, "classes_", None)
        if (
            fitted is not None
            and len(fitted) == len(proba)
            and all(isinstance(c, str) for c in fitted)
        ):
            # Classifier was fitted on string labels: its own ``classes_``
            # gives the column order.
            class_names = [str(c) for c in fitted]
        else:
            # NOTE(review): positional fallback kept from the original
            # code. If the classifier was fitted on integer-encoded
            # labels, column i may not correspond to OBESITY_LEVELS[i] —
            # confirm how labels were encoded when no encoder is shipped.
            class_names = list(OBESITY_LEVELS)

    by_name = {name: float(p) for name, p in zip(class_names, proba)}
    top_class = class_names[int(np.argmax(proba))]
    return by_name, top_class


def predict(profile: dict, nutrition: dict | None = None) -> dict:
    """Run the numeric pipeline for one user.

    Parameters
    ----------
    profile : dict
        Form inputs. ``weight_kg``, ``height_cm`` and ``age`` are
        required; ``Gender`` defaults to ``"Male"``, ``activity_level`` to
        ``"moderate"`` and ``goal`` to ``"maintain"``. Remaining habit
        answers are optional and defaulted by ``_build_feature_row``. The
        caller's dict is not modified.
    nutrition : dict | None
        Output of the CV block, passed to ``apply_favc_override`` together
        with the feature row before the models are run.

    Returns
    -------
    dict
        Keys: ``obesity_class``, ``obesity_probabilities``,
        ``predicted_bmi``, ``daily_target_kcal``, ``favc_overridden``,
        ``bmi_raw``, ``bmi_band``, ``model_class``,
        ``model_probabilities``, ``anomaly_flags``, ``ood_blended`` and
        ``models``. When the model files are unavailable the BMI rule
        supplies the class (one-hot probabilities), ``predicted_bmi`` is
        the raw BMI, and ``model_class`` / ``model_probabilities`` mirror
        the BMI-rule values so the result has the same keys either way.

    Raises
    ------
    KeyError
        If ``weight_kg``, ``height_cm`` or ``age`` is missing.
    ZeroDivisionError
        If ``height_cm`` is 0.
    """
    state = _load()
    profile = dict(profile)  # work on a copy; defaults are added below
    profile.setdefault("activity_level", "moderate")
    profile.setdefault("goal", "maintain")

    weight_kg = float(profile["weight_kg"])
    height_cm = float(profile["height_cm"])
    age = int(profile["age"])
    gender = profile.get("Gender", "Male")
    bmi_raw = weight_kg / ((height_cm / 100.0) ** 2)
    bmi_band = bmi_to_band(bmi_raw)

    target_kcal = daily_target_kcal(
        age, weight_kg, height_cm, gender, profile["activity_level"], profile["goal"],
    )

    if not state:
        band_one_hot = {c: (1.0 if c == bmi_band else 0.0) for c in OBESITY_LEVELS}
        return {
            "obesity_class": bmi_band,
            "obesity_probabilities": band_one_hot,
            "predicted_bmi": round(bmi_raw, 2),
            "daily_target_kcal": round(target_kcal, 1),
            "favc_overridden": False,
            "bmi_raw": round(bmi_raw, 2),
            "bmi_band": bmi_band,
            # No classifier ran; mirror the BMI rule so consumers that read
            # these keys on the normal path do not hit a KeyError here.
            "model_class": bmi_band,
            "model_probabilities": dict(band_one_hot),
            "anomaly_flags": ["Models unavailable — falling back to BMI rule."],
            "ood_blended": False,
            "models": {"regressor": "untrained_fallback", "classifier": "untrained_fallback"},
        }

    feature_cols = state["metadata"]["feature_columns"]
    x = _build_feature_row(profile, feature_cols)

    # Remember the self-reported FAVC value so we can tell afterwards
    # whether the CV block flipped it from 0 to 1.
    original_favc_yes = int(x["FAVC_yes"].iloc[0]) if "FAVC_yes" in x.columns else 0
    x = apply_favc_override(x, nutrition)
    overridden = (
        "FAVC_yes" in x.columns
        and int(x["FAVC_yes"].iloc[0]) == 1
        and original_favc_yes == 0
    )

    bmi_pred = float(state["regressor"].predict(x)[0])
    proba = state["classifier"].predict_proba(x)[0]

    # Name the probability columns, then (in the return value) re-order to
    # the canonical OBESITY_LEVELS sequence so downstream UIs render rows
    # in severity order.
    proba_by_name, model_class = _probabilities_by_name(
        state["classifier"], state.get("label_encoder"), proba,
    )

    anomaly_flags = _detect_anomalies(
        weight_kg=weight_kg, height_cm=height_cm, age=age,
        bmi_raw=bmi_raw, bmi_band=bmi_band, model_class=model_class, gender=gender,
    )

    # Blend the classifier with the BMI band when the input is materially
    # OOD: weight/BMI outside the trained range, a 2-class severity gap
    # with the BMI band, or a known conditional rarity (Male ×
    # Obesity_Type_III: only 1 male example in 324 Type_III rows).
    # Adjacent-class disagreements without an OOD signal stay untouched —
    # those are genuinely borderline cases.
    weight_ood = weight_kg < TRAINING_WEIGHT_RANGE_KG[0] or weight_kg > TRAINING_WEIGHT_RANGE_KG[1]
    bmi_ood = bmi_raw < TRAINING_BMI_RANGE[0] or bmi_raw > TRAINING_BMI_RANGE[1]
    # Guarded the same way as in _detect_anomalies: an unknown label counts
    # as "no gap" instead of raising ValueError.
    severity_gap = _severity_gap(bmi_band, model_class)
    male_type3_rarity = (
        bmi_band == "Obesity_Type_III"
        and gender.lower().startswith("m")
        and model_class != "Obesity_Type_III"
    )
    needs_blend = (
        weight_ood
        or bmi_ood
        or severity_gap >= _OOD_SEVERITY_GAP
        or male_type3_rarity
    )

    final_proba = proba_by_name
    final_class = model_class
    if needs_blend:
        final_proba = _blend_with_bmi_band(proba_by_name, bmi_band, OBESITY_LEVELS)
        final_class = max(final_proba, key=final_proba.get)

    return {
        "obesity_class": final_class,
        "obesity_probabilities": {c: final_proba.get(c, 0.0) for c in OBESITY_LEVELS},
        "predicted_bmi": round(bmi_pred, 2),
        "daily_target_kcal": round(target_kcal, 1),
        "favc_overridden": bool(overridden),
        "bmi_raw": round(bmi_raw, 2),
        "bmi_band": bmi_band,
        "model_class": model_class,
        "model_probabilities": {c: proba_by_name.get(c, 0.0) for c in OBESITY_LEVELS},
        "anomaly_flags": anomaly_flags,
        "ood_blended": bool(needs_blend),
        "models": {
            "regressor": state["metadata"]["regressor"]["name"],
            "classifier": state["metadata"]["classifier"]["name"],
        },
    }