amobionovo committed on
Commit
ed7e4f6
·
verified ·
1 Parent(s): 85ac9b6

Upload 4 files

Browse files
Files changed (4) hide show
  1. handler.py +302 -0
  2. logo.png +0 -0
  3. model.joblib +3 -0
  4. requirements.txt +8 -0
handler.py ADDED
@@ -0,0 +1,302 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # handler.py — Quantium insights Inference Endpoint (Residence_type canonicalized)
2
+ import os
3
+ import json
4
+ import traceback
5
+ from typing import Any, Dict, List, Tuple
6
+
7
+ import joblib
8
+ import numpy as np
9
+ import pandas as pd
10
+
11
+
12
# =========================
# Feature schema (canonical)
# =========================
# Numeric model inputs; hypertension/heart_disease are 0/1 flags that
# _coerce_dataframe coerces first and then casts to float64 with the rest.
NUMERIC_COLS = ["age", "avg_glucose_level", "bmi", "hypertension", "heart_disease"]
# Canonical Residence key uses capital R
CAT_COLS = ["gender", "ever_married", "work_type", "smoking_status", "Residence_type"]
# Full canonical column set/order used when building the input DataFrame.
ALL_CANON = NUMERIC_COLS + CAT_COLS

# For explain UI ordering (match canonical names)
# NOTE(review): currently identical to ALL_CANON's ordering.
EXPLAIN_ORDER = [
    "age", "avg_glucose_level", "bmi", "hypertension", "heart_disease",
    "gender", "ever_married", "work_type", "smoking_status", "Residence_type"
]
25
+
26
+
27
+ # =========================
28
+ # Utility: dtype coercion
29
+ # =========================
30
+ def _to_int01(x: Any) -> int:
31
+ if isinstance(x, (bool, np.bool_)):
32
+ return int(bool(x))
33
+ try:
34
+ if isinstance(x, str):
35
+ s = x.strip().lower()
36
+ if s in {"1", "true", "t", "yes", "y"}:
37
+ return 1
38
+ if s in {"0", "false", "f", "no", "n"}:
39
+ return 0
40
+ return int(float(x))
41
+ except Exception:
42
+ return 0
43
+
44
+
45
def _coerce_dataframe(rows: List[Dict[str, Any]]) -> pd.DataFrame:
    """Normalize raw request records into a model-ready DataFrame.

    - Canonical Residence key is 'Residence_type' (capital R); a lowercase
      'residence_type' key is accepted and promoted when the canonical key
      is absent.
    - Numeric columns end up float64; the 0/1 flags go through _to_int01
      first.
    - Categorical columns become plain Python strings (object dtype) with
      missing values replaced by "Unknown".
    - A lowercase 'residence_type' mirror column is appended for legacy
      models trained on the old name.
    """
    def _canonical(record: Dict[str, Any]) -> Dict[str, Any]:
        record = dict(record or {})
        # Promote the legacy lowercase key when the canonical one is missing.
        if "Residence_type" not in record and "residence_type" in record:
            record["Residence_type"] = record["residence_type"]
        return {key: record.get(key, None) for key in ALL_CANON}

    df = pd.DataFrame([_canonical(r) for r in rows], columns=ALL_CANON)

    # 0/1 flags first, then strict numeric coercion for the measurements.
    for flag in ("hypertension", "heart_disease"):
        df[flag] = df[flag].map(_to_int01)
    for meas in ("age", "avg_glucose_level", "bmi"):
        df[meas] = pd.to_numeric(df[meas], errors="coerce")

    # Final cast so every numeric column is float64.
    df[NUMERIC_COLS] = df[NUMERIC_COLS].astype("float64")

    # Categoricals: no NA, plain str values, object dtype.
    for cat in CAT_COLS:
        filled = df[cat].where(df[cat].notna(), "Unknown")
        df[cat] = filled.map(lambda v: "Unknown" if v is None else str(v)).astype(object)

    # Backward-compatible lowercase mirror of the residence column.
    df["residence_type"] = df["Residence_type"].astype(object)

    return df
86
+
87
+
88
+ # =========================
89
+ # Safety patches for OHE
90
+ # =========================
91
+ def _iter_estimators(est):
92
+ yield est
93
+ # Pipelines
94
+ if hasattr(est, "named_steps"):
95
+ for step in est.named_steps.values():
96
+ yield from _iter_estimators(step)
97
+ # ColumnTransformer
98
+ if hasattr(est, "transformers"):
99
+ for _, tr, _ in est.transformers:
100
+ yield from _iter_estimators(tr)
101
+
102
+
103
+ def _numeric_like(x) -> bool:
104
+ if x is None:
105
+ return True
106
+ if isinstance(x, (int, np.integer, float, np.floating)):
107
+ return True
108
+ if isinstance(x, str):
109
+ try:
110
+ float(x)
111
+ return True
112
+ except Exception:
113
+ return False
114
+ return False
115
+
116
+
117
def _sanitize_onehot_categories(model):
    """Force every fitted OneHotEncoder's categories_ to a single dtype.

    Mixed object arrays (numbers + strings + None/NaN) can make sklearn's
    unknown-category check call np.isnan on strings and crash. Each
    category array becomes either all-float (None/unparseable -> NaN) or
    all-string (None/NaN -> "Unknown"), and handle_unknown is set to
    "ignore". No-op when sklearn is unavailable.
    """
    try:
        from sklearn.preprocessing import OneHotEncoder  # type: ignore
    except Exception:
        OneHotEncoder = None

    if OneHotEncoder is None:
        return

    for node in _iter_estimators(model):
        if not (isinstance(node, OneHotEncoder) and hasattr(node, "categories_")):
            continue
        sanitized = []
        for cats in node.categories_:
            values = np.asarray(cats, dtype=object)
            if all(_numeric_like(v) for v in values):
                # All-numeric column: store as float, mapping None -> NaN.
                floats = []
                for v in values:
                    try:
                        floats.append(np.nan if v is None else float(v))
                    except Exception:
                        floats.append(np.nan)
                sanitized.append(np.asarray(floats, dtype=float))
            else:
                # String column: replace missing markers with "Unknown".
                strings = [
                    "Unknown" if (v is None or (isinstance(v, float) and np.isnan(v))) else str(v)
                    for v in values
                ]
                sanitized.append(np.asarray(strings, dtype=object))
        node.categories_ = sanitized
        if hasattr(node, "handle_unknown"):
            node.handle_unknown = "ignore"
146
+
147
+
148
+ def _patch_check_unknown():
149
+ """
150
+ Monkey-patch sklearn.utils._encode._check_unknown to avoid np.isnan on object/string arrays
151
+ on certain sklearn builds.
152
+ """
153
+ try:
154
+ from sklearn.utils import _encode # type: ignore
155
+ _orig = _encode._check_unknown
156
+
157
+ def _safe_check_unknown(values, known_values, return_mask=False):
158
+ try:
159
+ return _orig(values, known_values, return_mask=return_mask)
160
+ except TypeError:
161
+ vals = np.asarray(values, dtype=object)
162
+ known = np.asarray(known_values, dtype=object)
163
+ mask = np.isin(vals, known, assume_unique=False)
164
+ diff = vals[~mask]
165
+ if return_mask:
166
+ return diff, mask
167
+ return diff
168
+
169
+ _encode._check_unknown = _safe_check_unknown # type: ignore[attr-defined]
170
+ print("[handler] Patched sklearn.utils._encode._check_unknown", flush=True)
171
+ except Exception as e:
172
+ print(f"[handler] Patch for _check_unknown not applied: {e}", flush=True)
173
+
174
+
175
+ # =========================
176
+ # Model introspection (debug)
177
+ # =========================
178
+ def _introspect_model(model) -> Dict[str, Any]:
179
+ info: Dict[str, Any] = {"type": str(type(model))}
180
+ try:
181
+ if hasattr(model, "named_steps"):
182
+ info["pipeline_steps"] = list(model.named_steps.keys())
183
+ for name, step in model.named_steps.items():
184
+ if step.__class__.__name__ == "ColumnTransformer":
185
+ info["column_transformer"] = str(step)
186
+ try:
187
+ info["transformers_"] = [(n, str(t.__class__), cols) for (n, t, cols) in step.transformers]
188
+ except Exception:
189
+ pass
190
+ except Exception:
191
+ pass
192
+ try:
193
+ info["feature_names_in_"] = list(getattr(model, "feature_names_in_", []))
194
+ except Exception:
195
+ pass
196
+ return info
197
+
198
+
199
+ # =========================
200
+ # Handler
201
+ # =========================
202
class EndpointHandler:
    """Inference entry point for the hosted endpoint.

    __init__ loads model.joblib from `path` and applies the sklearn safety
    patches; __call__ coerces request records into the canonical schema,
    predicts a risk probability for the first record, and optionally
    attaches SHAP explanations and debug info to the response.
    """

    def __init__(self, path: str = "/repository") -> None:
        _patch_check_unknown()  # apply safety patch early, before the model runs

        model_path = os.path.join(path, "model.joblib")
        self.model = joblib.load(model_path)

        # Decision threshold (UI also reads this if present in response).
        try:
            self.threshold = float(os.getenv("THRESHOLD", "0.38"))
        except Exception:
            self.threshold = 0.38

        # Optional explainer (for old models); XGB wrapper may provide .top_contrib instead
        self.explainer = getattr(self.model, "explainer_", None)

        # Sanitize OneHotEncoder categories (if present)
        _sanitize_onehot_categories(self.model)

        print("[handler] Model loaded", flush=True)
        print(f"[handler] Using threshold: {self.threshold}", flush=True)

    def _predict_proba(self, df: pd.DataFrame) -> np.ndarray:
        """Return P(class=1) per row, adapting to the model's API.

        Prefers predict_proba. Otherwise squashes decision_function scores
        through a sigmoid — the original fallback's comment intended this
        but called predict() instead, sigmoiding hard 0/1 labels into
        0.5/0.73. sigmoid(predict(...)) is kept only as the last resort
        for models exposing neither method, preserving old behavior.
        """
        if hasattr(self.model, "predict_proba"):
            return self.model.predict_proba(df)[:, 1].astype(float)
        if hasattr(self.model, "decision_function"):
            raw = np.asarray(self.model.decision_function(df), dtype=float)
        else:
            # Legacy fallback: sigmoid of raw predict() output.
            raw = np.asarray(self.model.predict(df), dtype=float)
        return 1.0 / (1.0 + np.exp(-raw))

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Handle one inference request.

        data["inputs"]: a record dict or a non-empty list of record dicts.
        data["debug"] / data["explain"]: optional booleans.

        Returns probability/label for the FIRST record only (plus optional
        "shap"/"debug" keys), always echoing the threshold. Failures come
        back as {"error": ...} payloads rather than raised exceptions.
        """
        debug = bool(data.get("debug", False))
        explain = bool(data.get("explain", False))

        rows = data.get("inputs") or []
        if isinstance(rows, dict):
            rows = [rows]
        if not isinstance(rows, list) or not rows:
            return {"error": "inputs must be a non-empty list of records", "threshold": self.threshold}

        df = _coerce_dataframe(rows)

        # Assembled up front so prediction failures can still report it.
        debug_info = {
            "columns": list(df.columns),
            "dtypes": {c: str(df[c].dtype) for c in df.columns},
            "threshold": self.threshold,
            "model": _introspect_model(self.model),
            "head": df.head(1).to_dict(orient="records"),
        }

        # Predict
        try:
            proba = self._predict_proba(df)
        except Exception as e:
            return {
                "error": f"model.predict failed: {e}",
                "trace": traceback.format_exc(),
                "debug": debug_info,
                "threshold": self.threshold,
            }

        p = float(proba[0])
        label = int(p >= self.threshold)

        resp: Dict[str, Any] = {
            "risk_probability": p,
            "risk_label": label,
            "threshold": self.threshold,  # echo for the UI
        }

        # Explanations
        if explain:
            # Preferred path: XGB wrapper implements top_contrib()
            if hasattr(self.model, "top_contrib"):
                try:
                    names, vals = self.model.top_contrib(df, k=5)
                    if names:
                        resp["shap"] = {"feature_names": names, "values": vals}
                except Exception as e:
                    resp["shap_error"] = f"top_contrib failed: {e}"
            # Fallback: use stored explainer_ if present
            elif self.explainer is not None:
                try:
                    shap_vals = self.explainer(df)
                    vals = shap_vals.values[0] if hasattr(shap_vals, "values") else shap_vals[0]
                    contrib = []
                    for feat in EXPLAIN_ORDER:
                        if feat in df.columns:
                            idx = list(df.columns).index(feat)
                            contrib.append({"feature": feat, "effect": float(vals[idx])})
                    resp["shap"] = {"contrib": contrib}
                except Exception as e:
                    resp["shap_error"] = f"explainer failed: {e}"

        if debug:
            resp["debug"] = debug_info

        # Optional console log (visible in Endpoint Logs)
        try:
            print(f"[handler] prob={p:.4f} label={label}", flush=True)
        except Exception:
            pass

        return resp
logo.png ADDED
model.joblib ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b397f797a1a6106bb2ed6e21ddfeed2738491fdddde9681916cb8c9593cc4e07
3
+ size 1261562
requirements.txt ADDED
@@ -0,0 +1,8 @@
 
 
 
 
 
 
 
 
 
1
+ scikit-learn==1.5.1
2
+ imbalanced-learn==0.12.3
3
+ xgboost==2.0.3
4
+ shap==0.45.0
5
+ numpy==1.26.4
6
+ pandas==2.2.2
7
+ joblib==1.3.2
8
+ scipy==1.11.4