sathishaiuse committed on
Commit
ad430b5
·
verified ·
1 Parent(s): 427c6bb

Update predict_utils.py

Browse files
Files changed (1) hide show
  1. predict_utils.py +45 -42
predict_utils.py CHANGED
@@ -1,19 +1,17 @@
1
- # predict_utils.py
2
  import os
3
  import logging
4
  import joblib
5
  from huggingface_hub import hf_hub_download
6
 
7
- # Standard logging
8
  logging.basicConfig(level=logging.INFO)
9
  logger = logging.getLogger(__name__)
10
 
11
- # Env vars
12
  HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "sathishaiuse/wellness-classifier-model")
13
  HF_MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", "best_overall_XGBoost.joblib")
14
  HF_TOKEN = os.getenv("HF_TOKEN") or None
15
 
16
- # Local candidate paths to look for the model file
17
  LOCAL_CANDIDATES = [
18
  os.path.join("/app", HF_MODEL_FILENAME),
19
  os.path.join("/tmp", HF_MODEL_FILENAME),
@@ -22,7 +20,32 @@ LOCAL_CANDIDATES = [
22
  ]
23
 
24
  # -------------------------
25
- # Helpers: inspect, try loaders
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
26
  # -------------------------
27
  def inspect_file(path):
28
  info = {"path": path, "exists": False}
@@ -44,6 +67,8 @@ def inspect_file(path):
44
 
45
  def try_joblib_load(path):
46
  try:
 
 
47
  logger.info(f"Trying joblib.load on {path}")
48
  m = joblib.load(path)
49
  logger.info("joblib.load succeeded")
@@ -71,13 +96,11 @@ def try_xgboost_booster(path):
71
  self._is_xgb_booster = True
72
 
73
  def predict(self, X):
74
- # X -> 2D list/array
75
  import numpy as _np, xgboost as _xgb
76
  arr = _np.array(X, dtype=float)
77
  dmat = _xgb.DMatrix(arr)
78
  pred = self.booster.predict(dmat)
79
- # binary prob -> class decision
80
- if pred.ndim == 1:
81
  return (_np.where(pred >= 0.5, 1, 0)).tolist()
82
  return pred.tolist()
83
 
@@ -86,7 +109,7 @@ def try_xgboost_booster(path):
86
  arr = _np.array(X, dtype=float)
87
  dmat = _xgb.DMatrix(arr)
88
  pred = self.booster.predict(dmat)
89
- if pred.ndim == 1:
90
  return (_np.vstack([1 - pred, pred]).T).tolist()
91
  return pred.tolist()
92
 
@@ -96,7 +119,7 @@ def try_xgboost_booster(path):
96
  return ("xgboost_booster", e)
97
 
98
  # -------------------------
99
- # Core loader
100
  # -------------------------
101
  def load_model():
102
  logger.info("==== MODEL LOAD START ====")
@@ -104,7 +127,6 @@ def load_model():
104
  logger.info(f"Filename: {HF_MODEL_FILENAME}")
105
  logger.info(f"HF_TOKEN present? {bool(HF_TOKEN)}")
106
 
107
- # Try local candidates
108
  for path in LOCAL_CANDIDATES:
109
  try:
110
  info = inspect_file(path)
@@ -112,12 +134,10 @@ def load_model():
112
  if not info.get("exists"):
113
  continue
114
 
115
- # try joblib
116
  t, res = try_joblib_load(path)
117
  if t == "joblib" and not isinstance(res, Exception):
118
  return res
119
 
120
- # try xgboost booster
121
  t, res = try_xgboost_booster(path)
122
  if t == "xgboost_booster" and not isinstance(res, Exception):
123
  return res
@@ -125,7 +145,6 @@ def load_model():
125
  except Exception as e:
126
  logger.exception(f"Error while trying local candidate {path}: {e}")
127
 
128
- # Try HF hub download
129
  try:
130
  logger.info(f"Trying hf_hub_download from {HF_MODEL_REPO}/{HF_MODEL_FILENAME}")
131
  model_path = hf_hub_download(repo_id=HF_MODEL_REPO, filename=HF_MODEL_FILENAME, token=HF_TOKEN)
@@ -148,40 +167,30 @@ def load_model():
148
  return None
149
 
150
  # -------------------------
151
- # Prediction helper (robust)
152
- # Accepts: features as dict, list, or list-of-lists
153
- # Ensures sklearn pipelines that need DataFrame get a pandas.DataFrame
154
  # -------------------------
155
  def predict(model, features):
156
  """
157
- model: object returned by load_model()
158
- features: dict (preferred) mapping column_name -> value (order preserved),
159
- OR list/tuple representing feature vector in correct order,
160
- OR list-of-lists for batch.
161
- Returns: {"prediction": ..., "probability": ...} or {"error": "..."}
162
  """
163
  if model is None:
164
  return {"error": "Model not loaded"}
165
 
166
  try:
167
- # detect xgboost booster wrapper (we set attribute _is_xgb_booster)
168
- is_booster = hasattr(model, "_is_xgb_booster")
169
-
170
- # prepare input for sklearn-pipeline style models: DataFrame with column names
171
  import pandas as _pd
172
  import numpy as _np
173
 
174
- # Case A: features is a dict -> preserve key order and create single-row DataFrame
 
 
175
  if isinstance(features, dict):
176
- # ensure keys are strings (column names the pipeline expects)
177
  col_names = [str(k) for k in features.keys()]
178
  row_values = [features[k] for k in features.keys()]
179
- # Create DataFrame preserving column order
180
  df = _pd.DataFrame([row_values], columns=col_names)
181
  logger.info(f"Prepared DataFrame for prediction with columns: {col_names}")
182
 
183
  if is_booster:
184
- # booster expects numeric array
185
  arr = df.values.astype(float)
186
  preds = model.predict(arr)
187
  prob = None
@@ -194,7 +203,6 @@ def predict(model, features):
194
  pred_val = int(preds[0]) if isinstance(preds, (list, tuple)) else int(preds)
195
  return {"prediction": pred_val, "probability": prob}
196
 
197
- # sklearn-like pipeline
198
  if hasattr(model, "predict"):
199
  pred = model.predict(df)[0]
200
  prob = None
@@ -204,7 +212,6 @@ def predict(model, features):
204
  prob = float(max(p))
205
  except:
206
  prob = None
207
- # convert numpy types to native
208
  try:
209
  pred = int(pred)
210
  except:
@@ -213,9 +220,9 @@ def predict(model, features):
213
 
214
  return {"error": "Loaded model object not recognized (no predict method)"}
215
 
216
- # Case B: features is list or tuple -> single row without column names
 
217
  if isinstance(features, (list, tuple)):
218
- # single-row list
219
  arr2d = _np.array([features], dtype=float)
220
  if is_booster:
221
  preds = model.predict(arr2d)
@@ -229,9 +236,6 @@ def predict(model, features):
229
  pred_val = int(preds[0]) if isinstance(preds, (list, tuple)) else int(preds)
230
  return {"prediction": pred_val, "probability": prob}
231
 
232
- # sklearn pipeline without column names -> create DataFrame with numeric column names
233
- # but many scikit-learn ColumnTransformer setups expect string column names; this is risky.
234
- # Try passing numpy array directly to predict() if pipeline accepts it.
235
  if hasattr(model, "predict"):
236
  try:
237
  pred = model.predict(arr2d)[0]
@@ -243,8 +247,7 @@ def predict(model, features):
243
  except:
244
  prob = None
245
  return {"prediction": pred, "probability": prob}
246
- except Exception as e:
247
- # as last resort, build DataFrame with string column names "0","1",... and hope pipeline uses positional selection
248
  cols = [str(i) for i in range(arr2d.shape[1])]
249
  df = _pd.DataFrame(arr2d, columns=cols)
250
  pred = model.predict(df)[0]
@@ -257,7 +260,7 @@ def predict(model, features):
257
  prob = None
258
  return {"prediction": pred, "probability": prob}
259
 
260
- # Case C: features is list-of-lists (batch)
261
  if isinstance(features, list) and len(features) > 0 and isinstance(features[0], (list, tuple)):
262
  arr = _np.array(features, dtype=float)
263
  if is_booster:
@@ -281,8 +284,7 @@ def predict(model, features):
281
  except:
282
  prob = None
283
  return {"prediction": pred.tolist(), "probability": prob}
284
- except Exception as e:
285
- # try DataFrame fallback
286
  cols = [str(i) for i in range(arr.shape[1])]
287
  df = _pd.DataFrame(arr, columns=cols)
288
  pred = model.predict(df)
@@ -296,6 +298,7 @@ def predict(model, features):
296
  return {"prediction": pred.tolist(), "probability": prob}
297
 
298
  return {"error": "Unsupported features format. Provide dict (col->val) or list of values."}
 
299
  except Exception as e:
300
  logger.exception(f"Prediction error: {e}")
301
  return {"error": str(e)}
 
1
+ # predict_utils.py (patched to handle XGBClassifier use_label_encoder issue + robust loader)
2
  import os
3
  import logging
4
  import joblib
5
  from huggingface_hub import hf_hub_download
6
 
7
+ # Logging
8
  logging.basicConfig(level=logging.INFO)
9
  logger = logging.getLogger(__name__)
10
 
 
11
  HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "sathishaiuse/wellness-classifier-model")
12
  HF_MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", "best_overall_XGBoost.joblib")
13
  HF_TOKEN = os.getenv("HF_TOKEN") or None
14
 
 
15
  LOCAL_CANDIDATES = [
16
  os.path.join("/app", HF_MODEL_FILENAME),
17
  os.path.join("/tmp", HF_MODEL_FILENAME),
 
20
  ]
21
 
22
  # -------------------------
23
+ # Monkey-patch xgboost sklearn wrappers to add missing attributes before unpickling
24
+ # This prevents errors like: "'XGBClassifier' object has no attribute 'use_label_encoder'"
25
+ # -------------------------
26
+ def ensure_xgb_sklearn_compat():
27
+ try:
28
+ import xgboost as xgb
29
+ # XGBClassifier
30
+ clf = getattr(xgb, "XGBClassifier", None)
31
+ if clf is not None:
32
+ if not hasattr(clf, "use_label_encoder"):
33
+ setattr(clf, "use_label_encoder", False)
34
+ logger.info("Patched XGBClassifier.use_label_encoder = False")
35
+ # XGBRegressor
36
+ reg = getattr(xgb, "XGBRegressor", None)
37
+ if reg is not None:
38
+ if not hasattr(reg, "use_label_encoder"):
39
+ setattr(reg, "use_label_encoder", False)
40
+ logger.info("Patched XGBRegressor.use_label_encoder = False")
41
+ except Exception as e:
42
+ logger.debug(f"xgboost not available to patch: {e}")
43
+
44
+ # Call the patch early so joblib.load can succeed
45
+ ensure_xgb_sklearn_compat()
46
+
47
+ # -------------------------
48
+ # Helpers
49
  # -------------------------
50
  def inspect_file(path):
51
  info = {"path": path, "exists": False}
 
67
 
68
  def try_joblib_load(path):
69
  try:
70
+ # Ensure patch just before load (in case xgboost gets imported lazily)
71
+ ensure_xgb_sklearn_compat()
72
  logger.info(f"Trying joblib.load on {path}")
73
  m = joblib.load(path)
74
  logger.info("joblib.load succeeded")
 
96
  self._is_xgb_booster = True
97
 
98
  def predict(self, X):
 
99
  import numpy as _np, xgboost as _xgb
100
  arr = _np.array(X, dtype=float)
101
  dmat = _xgb.DMatrix(arr)
102
  pred = self.booster.predict(dmat)
103
+ if hasattr(pred, "ndim") and pred.ndim == 1:
 
104
  return (_np.where(pred >= 0.5, 1, 0)).tolist()
105
  return pred.tolist()
106
 
 
109
  arr = _np.array(X, dtype=float)
110
  dmat = _xgb.DMatrix(arr)
111
  pred = self.booster.predict(dmat)
112
+ if hasattr(pred, "ndim") and pred.ndim == 1:
113
  return (_np.vstack([1 - pred, pred]).T).tolist()
114
  return pred.tolist()
115
 
 
119
  return ("xgboost_booster", e)
120
 
121
  # -------------------------
122
+ # Loader
123
  # -------------------------
124
  def load_model():
125
  logger.info("==== MODEL LOAD START ====")
 
127
  logger.info(f"Filename: {HF_MODEL_FILENAME}")
128
  logger.info(f"HF_TOKEN present? {bool(HF_TOKEN)}")
129
 
 
130
  for path in LOCAL_CANDIDATES:
131
  try:
132
  info = inspect_file(path)
 
134
  if not info.get("exists"):
135
  continue
136
 
 
137
  t, res = try_joblib_load(path)
138
  if t == "joblib" and not isinstance(res, Exception):
139
  return res
140
 
 
141
  t, res = try_xgboost_booster(path)
142
  if t == "xgboost_booster" and not isinstance(res, Exception):
143
  return res
 
145
  except Exception as e:
146
  logger.exception(f"Error while trying local candidate {path}: {e}")
147
 
 
148
  try:
149
  logger.info(f"Trying hf_hub_download from {HF_MODEL_REPO}/{HF_MODEL_FILENAME}")
150
  model_path = hf_hub_download(repo_id=HF_MODEL_REPO, filename=HF_MODEL_FILENAME, token=HF_TOKEN)
 
167
  return None
168
 
169
  # -------------------------
170
+ # Robust predict (creates DataFrame when model expects column names)
 
 
171
  # -------------------------
172
  def predict(model, features):
173
  """
174
+ Accepts dict (col->val), list, or list-of-lists.
175
+ Returns dict with prediction and probability, or error.
 
 
 
176
  """
177
  if model is None:
178
  return {"error": "Model not loaded"}
179
 
180
  try:
 
 
 
 
181
  import pandas as _pd
182
  import numpy as _np
183
 
184
+ is_booster = hasattr(model, "_is_xgb_booster")
185
+
186
+ # dict -> DataFrame with columns in order of keys
187
  if isinstance(features, dict):
 
188
  col_names = [str(k) for k in features.keys()]
189
  row_values = [features[k] for k in features.keys()]
 
190
  df = _pd.DataFrame([row_values], columns=col_names)
191
  logger.info(f"Prepared DataFrame for prediction with columns: {col_names}")
192
 
193
  if is_booster:
 
194
  arr = df.values.astype(float)
195
  preds = model.predict(arr)
196
  prob = None
 
203
  pred_val = int(preds[0]) if isinstance(preds, (list, tuple)) else int(preds)
204
  return {"prediction": pred_val, "probability": prob}
205
 
 
206
  if hasattr(model, "predict"):
207
  pred = model.predict(df)[0]
208
  prob = None
 
212
  prob = float(max(p))
213
  except:
214
  prob = None
 
215
  try:
216
  pred = int(pred)
217
  except:
 
220
 
221
  return {"error": "Loaded model object not recognized (no predict method)"}
222
 
223
+ # list -> numpy array single row
224
+ import numpy as _np
225
  if isinstance(features, (list, tuple)):
 
226
  arr2d = _np.array([features], dtype=float)
227
  if is_booster:
228
  preds = model.predict(arr2d)
 
236
  pred_val = int(preds[0]) if isinstance(preds, (list, tuple)) else int(preds)
237
  return {"prediction": pred_val, "probability": prob}
238
 
 
 
 
239
  if hasattr(model, "predict"):
240
  try:
241
  pred = model.predict(arr2d)[0]
 
247
  except:
248
  prob = None
249
  return {"prediction": pred, "probability": prob}
250
+ except Exception:
 
251
  cols = [str(i) for i in range(arr2d.shape[1])]
252
  df = _pd.DataFrame(arr2d, columns=cols)
253
  pred = model.predict(df)[0]
 
260
  prob = None
261
  return {"prediction": pred, "probability": prob}
262
 
263
+ # batch
264
  if isinstance(features, list) and len(features) > 0 and isinstance(features[0], (list, tuple)):
265
  arr = _np.array(features, dtype=float)
266
  if is_booster:
 
284
  except:
285
  prob = None
286
  return {"prediction": pred.tolist(), "probability": prob}
287
+ except Exception:
 
288
  cols = [str(i) for i in range(arr.shape[1])]
289
  df = _pd.DataFrame(arr, columns=cols)
290
  pred = model.predict(df)
 
298
  return {"prediction": pred.tolist(), "probability": prob}
299
 
300
  return {"error": "Unsupported features format. Provide dict (col->val) or list of values."}
301
+
302
  except Exception as e:
303
  logger.exception(f"Prediction error: {e}")
304
  return {"error": str(e)}