amobionovo commited on
Commit
4a71291
·
verified ·
1 Parent(s): ed7e4f6

Update handler.py

Browse files
Files changed (1) hide show
  1. handler.py +356 -302
handler.py CHANGED
@@ -1,302 +1,356 @@
1
- # handler.py — Quantium insights Inference Endpoint (Residence_type canonicalized)
2
- import os
3
- import json
4
- import traceback
5
- from typing import Any, Dict, List, Tuple
6
-
7
- import joblib
8
- import numpy as np
9
- import pandas as pd
10
-
11
-
12
- # =========================
13
- # Feature schema (canonical)
14
- # =========================
15
- NUMERIC_COLS = ["age", "avg_glucose_level", "bmi", "hypertension", "heart_disease"]
16
- # Canonical Residence key uses capital R
17
- CAT_COLS = ["gender", "ever_married", "work_type", "smoking_status", "Residence_type"]
18
- ALL_CANON = NUMERIC_COLS + CAT_COLS
19
-
20
- # For explain UI ordering (match canonical names)
21
- EXPLAIN_ORDER = [
22
- "age", "avg_glucose_level", "bmi", "hypertension", "heart_disease",
23
- "gender", "ever_married", "work_type", "smoking_status", "Residence_type"
24
- ]
25
-
26
-
27
- # =========================
28
- # Utility: dtype coercion
29
- # =========================
30
- def _to_int01(x: Any) -> int:
31
- if isinstance(x, (bool, np.bool_)):
32
- return int(bool(x))
33
- try:
34
- if isinstance(x, str):
35
- s = x.strip().lower()
36
- if s in {"1", "true", "t", "yes", "y"}:
37
- return 1
38
- if s in {"0", "false", "f", "no", "n"}:
39
- return 0
40
- return int(float(x))
41
- except Exception:
42
- return 0
43
-
44
-
45
- def _coerce_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
46
- """
47
- Build a clean DataFrame:
48
- - Canonical Residence key is 'Residence_type' (capital R).
49
- - Accept 'residence_type' and map it to 'Residence_type' if needed.
50
- - Ensure numerics are float64 and 0/1 flags are ints then float64.
51
- - Ensure categoricals are plain Python strings (object), no NA.
52
- - Also mirror lowercase 'residence_type' for legacy models.
53
- """
54
- norm_rows: List[Dict[str, Any]] = []
55
- for r in rows:
56
- r = dict(r or {})
57
- # Normalize residence key to capitalized canonical
58
- if "Residence_type" not in r and "residence_type" in r:
59
- r["Residence_type"] = r["residence_type"]
60
- # Keep only canonical columns
61
- entry = {k: r.get(k, None) for k in ALL_CANON}
62
- norm_rows.append(entry)
63
-
64
- df = pd.DataFrame(norm_rows, columns=ALL_CANON)
65
-
66
- # binary flags first
67
- for col in ["hypertension", "heart_disease"]:
68
- df[col] = df[col].map(_to_int01)
69
-
70
- # strong numeric coercion
71
- for col in ["age", "avg_glucose_level", "bmi"]:
72
- df[col] = pd.to_numeric(df[col], errors="coerce")
73
-
74
- # final cast to float64
75
- df[NUMERIC_COLS] = df[NUMERIC_COLS].astype("float64")
76
-
77
- # categoricals as plain strings, no NA
78
- for col in CAT_COLS:
79
- df[col] = df[col].where(df[col].notna(), "Unknown")
80
- df[col] = df[col].map(lambda v: "Unknown" if v is None else str(v)).astype(object)
81
-
82
- # Mirror lowercase 'residence_type' for backward compatibility
83
- df["residence_type"] = df["Residence_type"].astype(object)
84
-
85
- return df
86
-
87
-
88
- # =========================
89
- # Safety patches for OHE
90
- # =========================
91
- def _iter_estimators(est):
92
- yield est
93
- # Pipelines
94
- if hasattr(est, "named_steps"):
95
- for step in est.named_steps.values():
96
- yield from _iter_estimators(step)
97
- # ColumnTransformer
98
- if hasattr(est, "transformers"):
99
- for _, tr, _ in est.transformers:
100
- yield from _iter_estimators(tr)
101
-
102
-
103
- def _numeric_like(x) -> bool:
104
- if x is None:
105
- return True
106
- if isinstance(x, (int, np.integer, float, np.floating)):
107
- return True
108
- if isinstance(x, str):
109
- try:
110
- float(x)
111
- return True
112
- except Exception:
113
- return False
114
- return False
115
-
116
-
117
- def _sanitize_onehot_categories(model):
118
- """Coerce OneHotEncoder.categories_ to consistent dtypes to avoid np.isnan crashes."""
119
- try:
120
- from sklearn.preprocessing import OneHotEncoder # type: ignore
121
- except Exception:
122
- OneHotEncoder = None
123
-
124
- if OneHotEncoder is None:
125
- return
126
-
127
- for node in _iter_estimators(model):
128
- if isinstance(node, OneHotEncoder) and hasattr(node, "categories_"):
129
- new_cats = []
130
- for cats in node.categories_:
131
- arr = np.asarray(cats, dtype=object)
132
- if all(_numeric_like(v) for v in arr):
133
- vals = []
134
- for v in arr:
135
- try:
136
- vals.append(np.nan if v is None else float(v))
137
- except Exception:
138
- vals.append(np.nan)
139
- new_cats.append(np.asarray(vals, dtype=float))
140
- else:
141
- strs = ["Unknown" if (v is None or (isinstance(v, float) and np.isnan(v))) else str(v) for v in arr]
142
- new_cats.append(np.asarray(strs, dtype=object))
143
- node.categories_ = new_cats
144
- if hasattr(node, "handle_unknown"):
145
- node.handle_unknown = "ignore"
146
-
147
-
148
- def _patch_check_unknown():
149
- """
150
- Monkey-patch sklearn.utils._encode._check_unknown to avoid np.isnan on object/string arrays
151
- on certain sklearn builds.
152
- """
153
- try:
154
- from sklearn.utils import _encode # type: ignore
155
- _orig = _encode._check_unknown
156
-
157
- def _safe_check_unknown(values, known_values, return_mask=False):
158
- try:
159
- return _orig(values, known_values, return_mask=return_mask)
160
- except TypeError:
161
- vals = np.asarray(values, dtype=object)
162
- known = np.asarray(known_values, dtype=object)
163
- mask = np.isin(vals, known, assume_unique=False)
164
- diff = vals[~mask]
165
- if return_mask:
166
- return diff, mask
167
- return diff
168
-
169
- _encode._check_unknown = _safe_check_unknown # type: ignore[attr-defined]
170
- print("[handler] Patched sklearn.utils._encode._check_unknown", flush=True)
171
- except Exception as e:
172
- print(f"[handler] Patch for _check_unknown not applied: {e}", flush=True)
173
-
174
-
175
- # =========================
176
- # Model introspection (debug)
177
- # =========================
178
- def _introspect_model(model) -> Dict[str, Any]:
179
- info: Dict[str, Any] = {"type": str(type(model))}
180
- try:
181
- if hasattr(model, "named_steps"):
182
- info["pipeline_steps"] = list(model.named_steps.keys())
183
- for name, step in model.named_steps.items():
184
- if step.__class__.__name__ == "ColumnTransformer":
185
- info["column_transformer"] = str(step)
186
- try:
187
- info["transformers_"] = [(n, str(t.__class__), cols) for (n, t, cols) in step.transformers]
188
- except Exception:
189
- pass
190
- except Exception:
191
- pass
192
- try:
193
- info["feature_names_in_"] = list(getattr(model, "feature_names_in_", []))
194
- except Exception:
195
- pass
196
- return info
197
-
198
-
199
- # =========================
200
- # Handler
201
- # =========================
202
- class EndpointHandler:
203
- def __init__(self, path: str = "/repository") -> None:
204
- _patch_check_unknown() # apply safety patch early
205
-
206
- model_path = os.path.join(path, "model.joblib")
207
- self.model = joblib.load(model_path)
208
-
209
- # Threshold (UI also reads this if present in response)
210
- try:
211
- self.threshold = float(os.getenv("THRESHOLD", "0.38"))
212
- except Exception:
213
- self.threshold = 0.38
214
-
215
- # Optional explainer (for old models); XGB wrapper may provide .top_contrib instead
216
- self.explainer = getattr(self.model, "explainer_", None)
217
-
218
- # Sanitize OneHotEncoder categories (if present)
219
- _sanitize_onehot_categories(self.model)
220
-
221
- print("[handler] Model loaded", flush=True)
222
- print(f"[handler] Using threshold: {self.threshold}", flush=True)
223
-
224
- def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
225
- debug = bool(data.get("debug", False))
226
- explain = bool(data.get("explain", False))
227
-
228
- rows = data.get("inputs") or []
229
- if isinstance(rows, dict):
230
- rows = [rows]
231
- if not isinstance(rows, list) or not rows:
232
- return {"error": "inputs must be a non-empty list of records", "threshold": self.threshold}
233
-
234
- df = _coerce_dataframe(rows)
235
-
236
- debug_info = {
237
- "columns": list(df.columns),
238
- "dtypes": {c: str(df[c].dtype) for c in df.columns},
239
- "threshold": self.threshold,
240
- "model": _introspect_model(self.model),
241
- "head": df.head(1).to_dict(orient="records"),
242
- }
243
-
244
- # Predict
245
- try:
246
- if hasattr(self.model, "predict_proba"):
247
- proba = self.model.predict_proba(df)[:, 1].astype(float)
248
- else:
249
- # e.g., model exposes only decision_function
250
- raw = self.model.predict(df).astype(float)
251
- proba = 1.0 / (1.0 + np.exp(-raw))
252
- except Exception as e:
253
- return {
254
- "error": f"model.predict failed: {e}",
255
- "trace": traceback.format_exc(),
256
- "debug": debug_info,
257
- "threshold": self.threshold,
258
- }
259
-
260
- p = float(proba[0])
261
- label = int(p >= self.threshold)
262
-
263
- resp: Dict[str, Any] = {
264
- "risk_probability": p,
265
- "risk_label": label,
266
- "threshold": self.threshold, # echo for the UI
267
- }
268
-
269
- # Explanations
270
- if explain:
271
- # Preferred path: XGB wrapper implements top_contrib()
272
- if hasattr(self.model, "top_contrib"):
273
- try:
274
- names, vals = self.model.top_contrib(df, k=5)
275
- if names:
276
- resp["shap"] = {"feature_names": names, "values": vals}
277
- except Exception as e:
278
- resp["shap_error"] = f"top_contrib failed: {e}"
279
- # Fallback: use stored explainer_ if present
280
- elif self.explainer is not None:
281
- try:
282
- shap_vals = self.explainer(df)
283
- vals = shap_vals.values[0] if hasattr(shap_vals, "values") else shap_vals[0]
284
- contrib = []
285
- for feat in EXPLAIN_ORDER:
286
- if feat in df.columns:
287
- idx = list(df.columns).index(feat)
288
- contrib.append({"feature": feat, "effect": float(vals[idx])})
289
- resp["shap"] = {"contrib": contrib}
290
- except Exception as e:
291
- resp["shap_error"] = f"explainer failed: {e}"
292
-
293
- if debug:
294
- resp["debug"] = debug_info
295
-
296
- # Optional console log (visible in Endpoint Logs)
297
- try:
298
- print(f"[handler] prob={p:.4f} label={label}", flush=True)
299
- except Exception:
300
- pass
301
-
302
- return resp
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # handler.py — Quantium insights Inference Endpoint (fixes XGBWrappedModel unpickle + Residence_type)
2
+ import os
3
+ import sys
4
+ import types
5
+ import json
6
+ import traceback
7
+ from typing import Any, Dict, List, Tuple
8
+
9
+ import joblib
10
+ import numpy as np
11
+ import pandas as pd
12
+
13
+ # =========================
14
+ # Re-declare the custom wrapper class and register it where pickle expects it
15
+ # =========================
16
class XGBWrappedModel:
    """
    Wrapper persisted in model.joblib (restored by joblib/pickle).

    Instance attributes set in __init__ and restored on unpickle:
      - preprocessor_: sklearn ColumnTransformer (fitted)
      - model_: XGBClassifier or similar exposing predict_proba
      - explainer_: optional SHAP explainer (may be None)
      - feature_names_out_: str array of names after preprocessing, or None
      - cat_prefix / num_prefix: name prefixes emitted by the preprocessor

    Provides:
      - predict_proba(X_df)
      - top_contrib(X_df, k)
    """

    # Canonical raw feature names used to fold one-hot output names (e.g.
    # 'cat__ever_married_Yes') back to their original column via longest
    # prefix match. Kept on the class (not the instance) so unpickled
    # instances see it without re-training.
    _CANONICAL_FEATURES = (
        "age", "avg_glucose_level", "bmi", "hypertension", "heart_disease",
        "gender", "ever_married", "work_type", "smoking_status", "Residence_type",
    )

    def __init__(self, preprocessor=None, booster=None, explainer=None,
                 feat_names_out=None, cat_prefix="cat__", num_prefix="num__"):
        self.preprocessor_ = preprocessor
        self.model_ = booster
        self.explainer_ = explainer
        self.feature_names_out_ = (
            np.array(feat_names_out).astype(str) if feat_names_out is not None else None
        )
        self.cat_prefix = cat_prefix
        self.num_prefix = num_prefix

    def predict_proba(self, X_df: pd.DataFrame):
        """Transform X_df with the stored preprocessor and return class probabilities.

        For a binary XGBoost classifier the result has shape (n, 2).
        """
        Z = self.preprocessor_.transform(X_df)
        return self.model_.predict_proba(Z)

    def _to_orig(self, name: str) -> str:
        """Map a post-preprocessing feature name back to its original column.

        BUGFIX: the previous heuristic `split("_", 1)[0]` truncated columns
        whose names themselves contain underscores ('cat__ever_married_Yes'
        became 'ever'). We now longest-match the stem against the canonical
        schema first and only fall back to the old split for unknown names.
        """
        if name.startswith(self.num_prefix):
            return name[len(self.num_prefix):]
        stem = name[len(self.cat_prefix):] if name.startswith(self.cat_prefix) else name
        best = ""
        for col in self._CANONICAL_FEATURES:
            if (stem == col or stem.startswith(col + "_")) and len(col) > len(best):
                best = col
        return best or stem.split("_", 1)[0]

    def top_contrib(self, X_df: pd.DataFrame, k: int = 5) -> Tuple[List[str], List[float]]:
        """Return (names, signed_effects) of the top-k original features.

        Ranks original columns by the sum of |SHAP| over their one-hot
        outputs for the FIRST row of X_df; the reported value is the signed
        sum. Returns ([], []) when no explainer was stored.
        """
        if self.explainer_ is None:
            return [], []
        Z = self.preprocessor_.transform(X_df)
        try:
            sv = self.explainer_.shap_values(Z)
            if isinstance(sv, list):
                # Older SHAP returns one array per class; take the positive class.
                sv = sv[1] if len(sv) > 1 else sv[0]
        except Exception:
            # Newer SHAP explainers are callable and return an Explanation.
            res = self.explainer_(Z)
            sv = res.values
        sv_row = np.array(sv[0], dtype=float)

        if self.feature_names_out_ is None:
            names_out = [f"f{i}" for i in range(len(sv_row))]
        else:
            names_out = list(self.feature_names_out_)

        orig_names = [self._to_orig(n) for n in names_out]
        abs_sum: Dict[str, float] = {}
        signed_sum: Dict[str, float] = {}
        for n, v in zip(orig_names, sv_row):
            abs_sum[n] = abs_sum.get(n, 0.0) + abs(float(v))
            signed_sum[n] = signed_sum.get(n, 0.0) + float(v)

        ranked = sorted(abs_sum.items(), key=lambda kv: kv[1], reverse=True)[:k]
        names = [n for n, _ in ranked]
        values = [signed_sum[n] for n, _ in ranked]
        return names, values
77
+
78
# Expose the wrapper under every module path pickle may reference:
# training saved it from __main__, and sometimes from 'train_export_xgb'.
setattr(sys.modules["__main__"], "XGBWrappedModel", XGBWrappedModel)
_legacy_mod = sys.modules.setdefault("train_export_xgb", types.ModuleType("train_export_xgb"))
_legacy_mod.XGBWrappedModel = XGBWrappedModel
84
+
85
+
86
# =========================
# Feature schema (canonical)
# =========================
# Numeric model inputs; the two trailing 0/1 flags are coerced via _to_int01
# and, like the rest, cast to float64 in _coerce_dataframe.
NUMERIC_COLS = ["age", "avg_glucose_level", "bmi", "hypertension", "heart_disease"]
# Canonical Residence key uses capital R (presumably matching the training
# data header — confirm against the training pipeline).
CAT_COLS = ["gender", "ever_married", "work_type", "smoking_status", "Residence_type"]
# Full canonical column order used when building the inference DataFrame.
ALL_CANON = NUMERIC_COLS + CAT_COLS

# Display order for the explanation UI (must use the canonical names above).
EXPLAIN_ORDER = [
    "age", "avg_glucose_level", "bmi", "hypertension", "heart_disease",
    "gender", "ever_married", "work_type", "smoking_status", "Residence_type"
]
98
+
99
+
100
+ # =========================
101
+ # Utility: dtype coercion
102
+ # =========================
103
+ def _to_int01(x: Any) -> int:
104
+ if isinstance(x, (bool, np.bool_)):
105
+ return int(bool(x))
106
+ try:
107
+ if isinstance(x, str):
108
+ s = x.strip().lower()
109
+ if s in {"1", "true", "t", "yes", "y"}:
110
+ return 1
111
+ if s in {"0", "false", "f", "no", "n"}:
112
+ return 0
113
+ return int(float(x))
114
+ except Exception:
115
+ return 0
116
+
117
+
118
def _coerce_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
    """
    Normalize raw request records into the model's canonical frame.

    - Canonical Residence key is 'Residence_type' (capital R); a lowercase
      'residence_type' key is accepted and promoted to the canonical name.
    - Numerics end up float64; the two 0/1 flags go through _to_int01 first.
    - Categoricals become plain Python strings (object dtype), never NA.
    - A lowercase 'residence_type' mirror column is appended for legacy models.
    """
    cleaned: List[Dict[str, Any]] = []
    for raw in rows:
        record = dict(raw or {})
        # Promote the legacy lowercase key to the canonical capitalized one.
        if "Residence_type" not in record and "residence_type" in record:
            record["Residence_type"] = record["residence_type"]
        # Drop everything that is not a canonical column.
        cleaned.append({key: record.get(key, None) for key in ALL_CANON})

    frame = pd.DataFrame(cleaned, columns=ALL_CANON)

    # 0/1 flags first, then strong numeric coercion, then one float64 cast.
    for flag in ("hypertension", "heart_disease"):
        frame[flag] = frame[flag].map(_to_int01)
    for numeric in ("age", "avg_glucose_level", "bmi"):
        frame[numeric] = pd.to_numeric(frame[numeric], errors="coerce")
    frame[NUMERIC_COLS] = frame[NUMERIC_COLS].astype("float64")

    # Categoricals: replace NA with "Unknown" and force plain-str object dtype.
    for cat in CAT_COLS:
        frame[cat] = frame[cat].where(frame[cat].notna(), "Unknown")
        frame[cat] = frame[cat].map(lambda v: "Unknown" if v is None else str(v)).astype(object)

    # Legacy models expect the lowercase column name as well.
    frame["residence_type"] = frame["Residence_type"].astype(object)

    return frame
153
+
154
+
155
+ # =========================
156
+ # Safety patches for OHE
157
+ # =========================
158
+ def _iter_estimators(est):
159
+ yield est
160
+ if hasattr(est, "named_steps"):
161
+ for step in est.named_steps.values():
162
+ yield from _iter_estimators(step)
163
+ if hasattr(est, "transformers"):
164
+ for _, tr, _ in est.transformers:
165
+ yield from _iter_estimators(tr)
166
+
167
+
168
+ def _numeric_like(x) -> bool:
169
+ if x is None:
170
+ return True
171
+ if isinstance(x, (int, np.integer, float, np.floating)):
172
+ return True
173
+ if isinstance(x, str):
174
+ try:
175
+ float(x)
176
+ return True
177
+ except Exception:
178
+ return False
179
+ return False
180
+
181
+
182
def _sanitize_onehot_categories(model):
    """Coerce OneHotEncoder.categories_ to consistent dtypes to avoid np.isnan crashes.

    Walks every estimator reachable from `model` (via _iter_estimators) and,
    for each fitted OneHotEncoder, rewrites `categories_` so each per-column
    array is either all-float or all-object-string. Mixed object arrays can
    make some sklearn builds call np.isnan on strings and crash at transform
    time. Also forces handle_unknown="ignore" so unseen categories at
    inference are dropped rather than fatal.

    Mutates the encoder in place; no return value.
    """
    try:
        from sklearn.preprocessing import OneHotEncoder  # type: ignore
    except Exception:
        # sklearn unavailable/partial: nothing to sanitize.
        OneHotEncoder = None

    if OneHotEncoder is None:
        return

    for node in _iter_estimators(model):
        if isinstance(node, OneHotEncoder) and hasattr(node, "categories_"):
            new_cats = []
            for cats in node.categories_:
                arr = np.asarray(cats, dtype=object)
                if all(_numeric_like(v) for v in arr):
                    # Numeric-looking column: force a float array (None -> NaN).
                    vals = []
                    for v in arr:
                        try:
                            vals.append(np.nan if v is None else float(v))
                        except Exception:
                            vals.append(np.nan)
                    new_cats.append(np.asarray(vals, dtype=float))
                else:
                    # String-ish column: map None/NaN to "Unknown", stringify the rest.
                    strs = ["Unknown" if (v is None or (isinstance(v, float) and np.isnan(v))) else str(v) for v in arr]
                    new_cats.append(np.asarray(strs, dtype=object))
            node.categories_ = new_cats
            if hasattr(node, "handle_unknown"):
                node.handle_unknown = "ignore"
211
+
212
+
213
def _patch_check_unknown():
    """Patch sklearn _check_unknown to avoid np.isnan on object arrays (older builds).

    Some sklearn builds call np.isnan inside the private helper
    sklearn.utils._encode._check_unknown, which raises TypeError for
    object/string category arrays. We wrap the original and, on TypeError
    only, fall back to a np.isin-based membership test with the same return
    contract: `diff`, or `(diff, mask)` when return_mask=True.

    Best-effort: if sklearn's private layout differs, we log and skip
    (NOTE(review): this touches a private API and may need updating on
    sklearn upgrades).
    """
    try:
        from sklearn.utils import _encode  # type: ignore
        _orig = _encode._check_unknown

        def _safe_check_unknown(values, known_values, return_mask=False):
            try:
                # Fast path: defer to sklearn's own implementation.
                return _orig(values, known_values, return_mask=return_mask)
            except TypeError:
                # Object/string arrays: emulate the contract with np.isin.
                vals = np.asarray(values, dtype=object)
                known = np.asarray(known_values, dtype=object)
                mask = np.isin(vals, known, assume_unique=False)
                diff = vals[~mask]
                if return_mask:
                    return diff, mask
                return diff

        _encode._check_unknown = _safe_check_unknown  # type: ignore[attr-defined]
        print("[handler] Patched sklearn.utils._encode._check_unknown", flush=True)
    except Exception as e:
        print(f"[handler] Patch for _check_unknown not applied: {e}", flush=True)
235
+
236
+
237
+ # =========================
238
+ # Model introspection (debug)
239
+ # =========================
240
+ def _introspect_model(model) -> Dict[str, Any]:
241
+ info: Dict[str, Any] = {"type": str(type(model))}
242
+ try:
243
+ if hasattr(model, "named_steps"):
244
+ info["pipeline_steps"] = list(model.named_steps.keys())
245
+ for name, step in model.named_steps.items():
246
+ if step.__class__.__name__ == "ColumnTransformer":
247
+ info["column_transformer"] = str(step)
248
+ try:
249
+ info["transformers_"] = [(n, str(t.__class__), cols) for (n, t, cols) in step.transformers]
250
+ except Exception:
251
+ pass
252
+ except Exception:
253
+ pass
254
+ try:
255
+ info["feature_names_in_"] = list(getattr(model, "feature_names_in_", []))
256
+ except Exception:
257
+ pass
258
+ return info
259
+
260
+
261
+ # =========================
262
+ # Handler
263
+ # =========================
264
class EndpointHandler:
    """Inference Endpoint entry point.

    Loads `model.joblib` from `path` at construction (joblib unpickling
    relies on XGBWrappedModel already being registered at module import)
    and serves predictions via __call__, with optional SHAP explanations
    and a debug payload.
    """

    def __init__(self, path: str = "/repository") -> None:
        _patch_check_unknown()  # apply safety patch early, before unpickling

        model_path = os.path.join(path, "model.joblib")
        self.model = joblib.load(model_path)

        # Decision threshold; overridable via the THRESHOLD env var
        # (falls back to 0.38 on any parse failure).
        try:
            self.threshold = float(os.getenv("THRESHOLD", "0.38"))
        except Exception:
            self.threshold = 0.38

        # Optional explainer (for old models); the XGB wrapper exposes
        # .top_contrib instead, which takes precedence in __call__.
        self.explainer = getattr(self.model, "explainer_", None)

        # Sanitize OneHotEncoder categories (if present in the pipeline).
        _sanitize_onehot_categories(self.model)

        print("[handler] Model loaded", flush=True)
        print(f"[handler] Using threshold: {self.threshold}", flush=True)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Handle one request.

        Expects data["inputs"] as a record dict or a non-empty list of
        record dicts; optional boolean flags data["debug"] and
        data["explain"]. Returns risk_probability/risk_label/threshold,
        or an error payload on failure.

        NOTE: only the FIRST row's probability and label are returned,
        even when multiple records are supplied.
        """
        debug = bool(data.get("debug", False))
        explain = bool(data.get("explain", False))

        rows = data.get("inputs") or []
        if isinstance(rows, dict):
            rows = [rows]
        if not isinstance(rows, list) or not rows:
            return {"error": "inputs must be a non-empty list of records", "threshold": self.threshold}

        df = _coerce_dataframe(rows)

        # Assembled up front so it can accompany both error and success paths.
        debug_info = {
            "columns": list(df.columns),
            "dtypes": {c: str(df[c].dtype) for c in df.columns},
            "threshold": self.threshold,
            "model": _introspect_model(self.model),
            "head": df.head(1).to_dict(orient="records"),
        }

        # Predict
        try:
            if hasattr(self.model, "predict_proba"):
                # Positive-class column of the (n, 2) probability matrix.
                proba = self.model.predict_proba(df)[:, 1].astype(float)
            else:
                # e.g., model exposes only raw scores via predict();
                # squash through a sigmoid to get a pseudo-probability.
                raw = self.model.predict(df).astype(float)
                proba = 1.0 / (1.0 + np.exp(-raw))
        except Exception as e:
            return {
                "error": f"model.predict failed: {e}",
                "trace": traceback.format_exc(),
                "debug": debug_info,
                "threshold": self.threshold,
            }

        p = float(proba[0])
        label = int(p >= self.threshold)

        resp: Dict[str, Any] = {
            "risk_probability": p,
            "risk_label": label,
            "threshold": self.threshold,  # echo for the UI
        }

        # Explanations
        if explain:
            # Preferred path: XGB wrapper implements top_contrib()
            if hasattr(self.model, "top_contrib"):
                try:
                    names, vals = self.model.top_contrib(df, k=5)
                    if names:
                        resp["shap"] = {"feature_names": names, "values": vals}
                except Exception as e:
                    resp["shap_error"] = f"top_contrib failed: {e}"
            # Fallback: use stored explainer_ if present
            elif self.explainer is not None:
                try:
                    shap_vals = self.explainer(df)
                    vals = shap_vals.values[0] if hasattr(shap_vals, "values") else shap_vals[0]
                    contrib = []
                    for feat in EXPLAIN_ORDER:
                        if feat in df.columns:
                            idx = list(df.columns).index(feat)
                            contrib.append({"feature": feat, "effect": float(vals[idx])})
                    resp["shap"] = {"contrib": contrib}
                except Exception as e:
                    resp["shap_error"] = f"explainer failed: {e}"

        if debug:
            resp["debug"] = debug_info

        # Optional console log (visible in Endpoint Logs); never fatal.
        try:
            print(f"[handler] prob={p:.4f} label={label}", flush=True)
        except Exception:
            pass

        return resp