Spaces:

adisaljusi
/

forkcast

Sleeping

App Files Files Community

forkcast / src /numeric /model.py

adisaljusi

fix(numeric): BMI-band fallback for out-of-distribution profiles

0bce3b4 5 days ago

raw

history blame contribute delete

11.8 kB

	"""Numeric block — inference.

	Loads the trained regressor and classifier and returns:

	- a predicted obesity classification (7-class, with per-class
	probability),
	- a predicted BMI from the regression head,
	- a personalized daily calorie target derived from Mifflin-St Jeor.

	The CV-derived ``high_caloric_meal`` flag overrides the user's
	self-reported ``FAVC`` feature before inference. This is the load-bearing
	integration point between the computer-vision block and the numeric
	classifier.
	"""

	from __future__ import annotations

	import json
	from pathlib import Path
	from typing import Any

	import joblib
	import numpy as np
	import pandas as pd

	from .obesity import OBESITY_LEVELS, apply_favc_override
	from .profile import (
	TRAINING_AGE_RANGE,
	TRAINING_BMI_RANGE,
	TRAINING_HEIGHT_RANGE_CM,
	TRAINING_WEIGHT_RANGE_KG,
	bmi_to_band,
	daily_target_kcal,
	)

	# Severity gap (in class indices) at which we blend the classifier
	# probabilities toward the BMI-band rule. Adjacent-class disagreements
	# (gap=1) are tolerated — those are genuinely borderline cases. A gap of
	# 2+ means the model is materially wrong about severity, typically
	# because the conditional (gender, BMI) combination is rare in training.
	_OOD_SEVERITY_GAP = 2

	# Mixing weight used when blending the classifier's probabilities with
	# a one-hot prior on the BMI-band class for OOD inputs. The prior gets
	# this share; the classifier output keeps the remaining ``1 - weight``.
	_OOD_BAND_PRIOR_WEIGHT = 0.7

	# Directory holding the serialized model artifacts: the ``models`` folder
	# under the third ancestor directory of this file (``parents[2]``).
	MODELS_DIR = Path(__file__).resolve().parents[2] / "models"

	# Module-level cache of loaded artifacts. ``_load`` fills it on the first
	# successful load and returns it as-is on later calls; it stays empty
	# while any required artifact file is missing.
	_state: dict[str, Any] = {}


	def _load() -> dict[str, Any]:
	    """Load the trained artifacts from ``MODELS_DIR`` and memoize them.

	    Returns
	    -------
	    dict[str, Any]
	        The module-level cache with the keys ``regressor``,
	        ``classifier``, ``label_encoder`` (``None`` when the encoder file
	        is absent) and ``metadata``. When the regressor, classifier or
	        metadata file is missing an empty dict is returned and nothing is
	        cached, so a later call retries the load.

	    Raises
	    ------
	    Exception
	        Whatever ``joblib.load`` / ``json.loads`` raise for a corrupt
	        artifact. The cache is left empty in that case.
	    """
	    if _state:
	        return _state

	    reg_path = MODELS_DIR / "numeric_regressor.pkl"
	    clf_path = MODELS_DIR / "numeric_classifier.pkl"
	    enc_path = MODELS_DIR / "numeric_label_encoder.pkl"
	    meta_path = MODELS_DIR / "numeric_metadata.json"

	    # The label encoder is optional; the other three files are required.
	    if not all(p.exists() for p in (reg_path, clf_path, meta_path)):
	        return {}

	    # NOTE(security): joblib.load unpickles arbitrary objects, so these
	    # files must only ever come from a trusted training run.
	    loaded: dict[str, Any] = {
	        "regressor": joblib.load(reg_path),
	        "classifier": joblib.load(clf_path),
	        "label_encoder": joblib.load(enc_path) if enc_path.exists() else None,
	        "metadata": json.loads(meta_path.read_text(encoding="utf-8")),
	    }

	    # Publish only once every artifact has loaded. Filling ``_state`` key
	    # by key would leave a half-populated (truthy) cache behind if a later
	    # load raised, and subsequent calls would return it as if complete.
	    _state.update(loaded)
	    return _state


	def _detect_anomalies(
	    weight_kg: float,
	    height_cm: float,
	    age: int,
	    bmi_raw: float,
	    bmi_band: str,
	    model_class: str,
	    gender: str,
	) -> list[str]:
	    """Collect human-readable warnings for inputs the classifier may mishandle.

	    The UCI Obesity Levels dataset is a small (n=2111), partly synthetic
	    sample with strong conditional skew (e.g., Obesity_Type_III is 99.7%
	    Female). The returned messages let the UI caution the user instead of
	    presenting the classifier output as certain.

	    Checks, in order: weight and height against their training ranges
	    (reporting "below" or "above"), BMI and age against theirs (reporting
	    "outside"), the class-index gap between ``bmi_band`` and
	    ``model_class``, and the male × Obesity_Type_III rarity.

	    Returns
	    -------
	    list[str]
	        One message per triggered check; empty when nothing is flagged.
	    """
	    messages: list[str] = []

	    # Weight and height share one message template and differ only in
	    # label, unit and training bounds.
	    ranged_inputs = (
	        ("Weight", weight_kg, "kg", TRAINING_WEIGHT_RANGE_KG),
	        ("Height", height_cm, "cm", TRAINING_HEIGHT_RANGE_CM),
	    )
	    for label, value, unit, bounds in ranged_inputs:
	        low, high = bounds
	        if value < low:
	            side = "below"
	        elif value > high:
	            side = "above"
	        else:
	            continue
	        messages.append(
	            f"{label} {value:.0f} {unit} is {side} the trained range ({low:.0f}–{high:.0f} {unit})."
	        )

	    bmi_low, bmi_high = TRAINING_BMI_RANGE
	    if bmi_raw < bmi_low or bmi_raw > bmi_high:
	        messages.append(
	            f"BMI {bmi_raw:.1f} is outside the trained range ({bmi_low:.1f}–{bmi_high:.1f})."
	        )

	    age_low, age_high = TRAINING_AGE_RANGE
	    if age < age_low or age > age_high:
	        messages.append(f"Age {age} is outside the trained range ({age_low}–{age_high} years).")

	    # Severity disagreement, measured as the distance between the two
	    # labels' positions in OBESITY_LEVELS. A label that is not in the list
	    # counts as "no gap" rather than an error.
	    if bmi_band in OBESITY_LEVELS and model_class in OBESITY_LEVELS:
	        class_gap = abs(OBESITY_LEVELS.index(bmi_band) - OBESITY_LEVELS.index(model_class))
	    else:
	        class_gap = 0
	    if class_gap >= _OOD_SEVERITY_GAP:
	        shown_model = model_class.replace("_", " ")
	        shown_band = bmi_band.replace("_", " ")
	        messages.append(
	            f"Classifier says {shown_model} but BMI {bmi_raw:.1f} "
	            f"falls into {shown_band} — a {class_gap}-class gap."
	        )

	    # Conditionally-rare combination: Obesity_Type_III is almost entirely
	    # female in the training data, so a profile whose BMI band is Type_III
	    # and whose gender starts with "m" is flagged as an extrapolation.
	    if bmi_band == "Obesity_Type_III" and gender.lower().startswith("m"):
	        messages.append(
	            "Obesity Type III training data is 99.7% female — male predictions "
	            "at this BMI are extrapolations."
	        )

	    return messages


	def _blend_with_bmi_band(
	    proba_by_name: dict[str, float],
	    bmi_band: str,
	    classes: list[str],
	    weight: float | None = None,
	) -> dict[str, float]:
	    """Mix the classifier output with a one-hot prior centered on ``bmi_band``.

	    Used when the classifier output is considered unreliable for the
	    input. The blend is deterministic (no training data needed) and the
	    result follows the order of ``classes``, so the visible probability
	    bars still tell a coherent story.

	    Parameters
	    ----------
	    proba_by_name : dict[str, float]
	        Classifier probability per class name; classes missing from the
	        mapping count as 0.0.
	    bmi_band : str
	        Class name that receives the prior mass. If it is not in
	        ``classes`` the prior mass is dropped and the result is just the
	        renormalized classifier output.
	    classes : list[str]
	        Class names to emit, in output order.
	    weight : float | None
	        Share of the blend given to the prior, within ``[0, 1]``.
	        ``None`` (the default) uses ``_OOD_BAND_PRIOR_WEIGHT``.

	    Returns
	    -------
	    dict[str, float]
	        Blended probabilities keyed by class name, normalized to sum to 1
	        (left unscaled when the total is 0).

	    Raises
	    ------
	    ValueError
	        If the effective weight lies outside ``[0, 1]``.
	    """
	    prior_weight = _OOD_BAND_PRIOR_WEIGHT if weight is None else float(weight)
	    if not 0.0 <= prior_weight <= 1.0:
	        raise ValueError(f"weight must be within [0, 1], got {prior_weight!r}")

	    model_weight = 1.0 - prior_weight
	    blended = {
	        name: model_weight * proba_by_name.get(name, 0.0)
	        + (prior_weight if name == bmi_band else 0.0)
	        for name in classes
	    }
	    # ``or 1.0`` avoids dividing by zero when every blended value is 0.
	    total = sum(blended.values()) or 1.0
	    return {name: value / total for name, value in blended.items()}


	def _build_feature_row(profile: dict, feature_columns: list[str]) -> pd.DataFrame:
	    """Turn the user's form input into a one-row frame laid out like the training data.

	    Every column in ``feature_columns`` starts at 0. Numeric answers are
	    written to their column when that column exists (height is converted
	    from centimetres to metres). Each categorical answer sets the
	    ``"<field>_<value>"`` indicator column to 1 when such a column
	    exists; answers without a matching column are ignored. Missing
	    answers fall back to the defaults listed below.
	    """
	    # (training column, profile key, default when the key is absent)
	    numeric_sources = (
	        ("Age", "age", 30),
	        ("Height", "height_cm", 170),
	        ("Weight", "weight_kg", 70),
	        ("FCVC", "FCVC", 2.0),
	        ("NCP", "NCP", 3.0),
	        ("CH2O", "CH2O", 2.0),
	        ("FAF", "FAF", 1.0),
	        ("TUE", "TUE", 1.0),
	    )
	    # (profile key, which is also the indicator-column prefix, default)
	    categorical_defaults = (
	        ("Gender", "Male"),
	        ("family_history_with_overweight", "no"),
	        ("FAVC", "no"),
	        ("CAEC", "Sometimes"),
	        ("SMOKE", "no"),
	        ("SCC", "no"),
	        ("CALC", "no"),
	        ("MTRANS", "Public_Transportation"),
	    )

	    numeric_values = {
	        column: float(profile.get(key, default))
	        for column, key, default in numeric_sources
	    }
	    numeric_values["Height"] /= 100.0  # the form collects cm, the model expects m

	    encoded = dict.fromkeys(feature_columns, 0)
	    for column, value in numeric_values.items():
	        if column in encoded:
	            encoded[column] = value

	    for field, default in categorical_defaults:
	        indicator = f"{field}_{profile.get(field, default)}"
	        if indicator in encoded:
	            encoded[indicator] = 1

	    return pd.DataFrame([encoded], columns=feature_columns)


	def predict(profile: dict, nutrition: dict | None = None) -> dict:
	    """Run the numeric pipeline for one user.

	    Parameters
	    ----------
	    profile : dict
	        Form inputs. ``weight_kg``, ``height_cm`` and ``age`` are
	        required. ``Gender`` defaults to ``"Male"``, ``activity_level``
	        to ``"moderate"`` and ``goal`` to ``"maintain"``; habit answers
	        are consumed by ``_build_feature_row``. The caller's dict is not
	        mutated.
	    nutrition : dict | None
	        Output of the CV block. When provided and the meal is
	        high-caloric, the FAVC feature is overridden upstream of the
	        classifier.

	    Returns
	    -------
	    dict
	        Always contains ``obesity_class``, ``obesity_probabilities``,
	        ``predicted_bmi``, ``daily_target_kcal``, ``favc_overridden``,
	        ``bmi_raw``, ``bmi_band``, ``anomaly_flags``, ``ood_blended`` and
	        ``models``. When the trained models are available it also carries
	        the unblended ``model_class`` and ``model_probabilities``; the
	        BMI-rule fallback used when models are unavailable omits those
	        two keys.

	    Raises
	    ------
	    KeyError
	        If ``weight_kg``, ``height_cm`` or ``age`` is missing.
	    ZeroDivisionError
	        If ``height_cm`` is zero.
	    """
	    state = _load()
	    profile = dict(profile)  # work on a copy so the defaults do not leak to the caller
	    profile.setdefault("activity_level", "moderate")
	    profile.setdefault("goal", "maintain")

	    weight_kg = float(profile["weight_kg"])
	    height_cm = float(profile["height_cm"])
	    age = int(profile["age"])
	    gender = profile.get("Gender", "Male")
	    bmi_raw = weight_kg / ((height_cm / 100.0) ** 2)
	    bmi_band = bmi_to_band(bmi_raw)

	    target_kcal = daily_target_kcal(
	        age, weight_kg, height_cm, gender, profile["activity_level"], profile["goal"],
	    )

	    if not state:
	        # No trained artifacts: report the plain BMI rule as a one-hot result.
	        return {
	            "obesity_class": bmi_band,
	            "obesity_probabilities": {c: (1.0 if c == bmi_band else 0.0) for c in OBESITY_LEVELS},
	            "predicted_bmi": round(bmi_raw, 2),
	            "daily_target_kcal": round(target_kcal, 1),
	            "favc_overridden": False,
	            "bmi_raw": round(bmi_raw, 2),
	            "bmi_band": bmi_band,
	            "anomaly_flags": ["Models unavailable — falling back to BMI rule."],
	            "ood_blended": False,
	            "models": {"regressor": "untrained_fallback", "classifier": "untrained_fallback"},
	        }

	    feature_cols = state["metadata"]["feature_columns"]
	    x = _build_feature_row(profile, feature_cols)
	    # Remember the self-reported FAVC flag so we can tell whether the CV
	    # block actually flipped it from 0 to 1.
	    original_favc_yes = int(x["FAVC_yes"].iloc[0]) if "FAVC_yes" in x.columns else 0
	    x = apply_favc_override(x, nutrition)
	    overridden = "FAVC_yes" in x.columns and int(x["FAVC_yes"].iloc[0]) == 1 and original_favc_yes == 0

	    classifier = state["classifier"]
	    bmi_pred = float(state["regressor"].predict(x)[0])
	    proba = classifier.predict_proba(x)[0]

	    # The label encoder sorts labels alphabetically, so classifier.classes_ is
	    # in alphabetical order — not the severity-ranked order we author in
	    # OBESITY_LEVELS. Map probabilities through the actual class names
	    # before exposing them, then re-order to the canonical sequence so
	    # downstream UIs render rows in severity order.
	    encoder = state.get("label_encoder")
	    if encoder is not None:
	        class_names = [str(c) for c in encoder.classes_]
	    else:
	        # Without an encoder, trust the classifier's own labels when they
	        # are strings (they name the predict_proba columns directly).
	        # Otherwise keep the legacy assumption that the columns follow
	        # OBESITY_LEVELS order.
	        raw_classes = getattr(classifier, "classes_", None)
	        if (
	            raw_classes is not None
	            and len(raw_classes) == len(proba)
	            and all(isinstance(c, str) for c in raw_classes)
	        ):
	            class_names = [str(c) for c in raw_classes]
	        else:
	            class_names = list(OBESITY_LEVELS)
	    proba_by_name = {name: float(p) for name, p in zip(class_names, proba)}
	    model_class = class_names[int(np.argmax(proba))]

	    anomaly_flags = _detect_anomalies(
	        weight_kg=weight_kg, height_cm=height_cm, age=age,
	        bmi_raw=bmi_raw, bmi_band=bmi_band, model_class=model_class, gender=gender,
	    )

	    # Override the classifier when the input is materially OOD: weight/BMI
	    # outside the trained range, a 2-class severity gap with the BMI band,
	    # or a known conditional rarity (Male × Obesity_Type_III: only 1 male
	    # example in 324 Type_III rows). Adjacent-class disagreements without
	    # an OOD signal stay untouched — those are genuinely borderline cases
	    # and the model handles them well.
	    weight_ood = weight_kg < TRAINING_WEIGHT_RANGE_KG[0] or weight_kg > TRAINING_WEIGHT_RANGE_KG[1]
	    bmi_ood = bmi_raw < TRAINING_BMI_RANGE[0] or bmi_raw > TRAINING_BMI_RANGE[1]
	    try:
	        severity_gap = abs(OBESITY_LEVELS.index(bmi_band) - OBESITY_LEVELS.index(model_class))
	    except ValueError:
	        # A label outside OBESITY_LEVELS has no defined severity rank;
	        # treat it as "no gap", exactly as _detect_anomalies does, rather
	        # than failing the whole prediction.
	        severity_gap = 0
	    male_type3_rarity = (
	        bmi_band == "Obesity_Type_III"
	        and gender.lower().startswith("m")
	        and model_class != "Obesity_Type_III"
	    )
	    needs_blend = (
	        weight_ood
	        or bmi_ood
	        or severity_gap >= _OOD_SEVERITY_GAP
	        or male_type3_rarity
	    )

	    final_proba = proba_by_name
	    final_class = model_class
	    blended = False
	    if needs_blend:
	        final_proba = _blend_with_bmi_band(proba_by_name, bmi_band, OBESITY_LEVELS)
	        final_class = max(final_proba, key=final_proba.get)
	        blended = True

	    return {
	        "obesity_class": final_class,
	        "obesity_probabilities": {c: final_proba.get(c, 0.0) for c in OBESITY_LEVELS},
	        "predicted_bmi": round(bmi_pred, 2),
	        "daily_target_kcal": round(target_kcal, 1),
	        "favc_overridden": bool(overridden),
	        "bmi_raw": round(bmi_raw, 2),
	        "bmi_band": bmi_band,
	        "model_class": model_class,
	        "model_probabilities": {c: proba_by_name.get(c, 0.0) for c in OBESITY_LEVELS},
	        "anomaly_flags": anomaly_flags,
	        "ood_blended": blended,
	        "models": {
	            "regressor": state["metadata"]["regressor"]["name"],
	            "classifier": state["metadata"]["classifier"]["name"],
	        },
	    }