sathishaiuse committed on
Commit
427c6bb
·
verified ·
1 Parent(s): ae80e55

Update predict_utils.py

Browse files
Files changed (1) hide show
  1. predict_utils.py +260 -43
predict_utils.py CHANGED
@@ -1,21 +1,19 @@
 
1
  import os
2
- import joblib
3
  import logging
 
4
  from huggingface_hub import hf_hub_download
5
 
6
- # -----------------------------------------------------------
7
- # Logging Setup
8
- # -----------------------------------------------------------
9
  logging.basicConfig(level=logging.INFO)
10
  logger = logging.getLogger(__name__)
11
 
12
- # -----------------------------------------------------------
13
- # Environment Variables
14
- # -----------------------------------------------------------
15
  HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "sathishaiuse/wellness-classifier-model")
16
  HF_MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", "best_overall_XGBoost.joblib")
17
  HF_TOKEN = os.getenv("HF_TOKEN") or None
18
 
 
19
  LOCAL_CANDIDATES = [
20
  os.path.join("/app", HF_MODEL_FILENAME),
21
  os.path.join("/tmp", HF_MODEL_FILENAME),
@@ -23,62 +21,281 @@ LOCAL_CANDIDATES = [
23
  HF_MODEL_FILENAME
24
  ]
25
 
26
- # -----------------------------------------------------------
27
- # Model Loader
28
- # -----------------------------------------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
29
def load_model():
    """Return the classifier, reading it from disk or from the Hugging Face Hub.

    Each path in LOCAL_CANDIDATES is tried first; when none yields a model the
    file is fetched with hf_hub_download and loaded from the returned path.
    Failures are logged, never raised. Returns None when every attempt fails.
    """
    logger.info("==== MODEL LOAD START ====")
    logger.info(f"Repo: {HF_MODEL_REPO}")
    logger.info(f"Filename: {HF_MODEL_FILENAME}")
    logger.info(f"HF_TOKEN present? {bool(HF_TOKEN)}")

    # Step 1: look for a copy of the model on the local filesystem.
    for candidate in LOCAL_CANDIDATES:
        try:
            if not os.path.exists(candidate):
                continue
            logger.info(f"Attempting to load local model: {candidate}")
            local_model = joblib.load(candidate)
            logger.info("Model loaded successfully from local file.")
            return local_model
        except Exception as err:
            logger.exception(f"Failed reading local model at {candidate}: {err}")

    # Step 2: fall back to downloading the file from the Hub.
    try:
        logger.info(f"Trying hf_hub_download from {HF_MODEL_REPO}/{HF_MODEL_FILENAME}")
        model_path = hf_hub_download(
            repo_id=HF_MODEL_REPO,
            filename=HF_MODEL_FILENAME,
            token=HF_TOKEN,
        )
        logger.info(f"Downloaded model to: {model_path}")
        hub_model = joblib.load(model_path)
        logger.info("Model loaded successfully from HF Hub.")
        return hub_model
    except Exception as err:
        logger.exception(f"hf_hub_download failed: {err}")

    # Step 3: nothing worked; report it and let the caller handle None.
    logger.error("❌ Model could NOT be loaded from local or HF Hub.")
    return None
64
 
 
 
 
 
 
65
 
66
- # -----------------------------------------------------------
67
- # Prediction Function
68
- # -----------------------------------------------------------
69
def predict(model, features: dict):
    """Run a single-row prediction and return plain Python values.

    Args:
        model: object exposing ``predict`` and ``predict_proba``; ``None`` is
            reported as an error instead of raising.
        features: mapping whose values, in insertion order, form the one
            feature row handed to the model.

    Returns:
        ``{"prediction": label, "probability": top_probability}`` on success,
        or ``{"error": message}`` when the model is missing or a call fails.
    """
    if model is None:
        return {"error": "Model not loaded"}

    try:
        # Build the single-row batch once and reuse it for both calls.
        row = [list(features.values())]
        probabilities = model.predict_proba(row)[0]
        prediction = model.predict(row)[0]

        # numpy scalars are unwrapped so the result contains native Python
        # values; non-numpy labels are passed through untouched.
        if hasattr(prediction, "item"):
            prediction = prediction.item()

        # max() works for ndarray rows and plain lists alike.
        return {
            "prediction": prediction,
            "probability": float(max(probabilities)),
        }
    except Exception as e:
        logger.exception(f"Prediction error: {e}")
        return {"error": str(e)}
 
1
+ # predict_utils.py
2
  import os
 
3
  import logging
4
+ import joblib
5
  from huggingface_hub import hf_hub_download
6
 
7
+ # Standard logging
 
 
8
  logging.basicConfig(level=logging.INFO)
9
  logger = logging.getLogger(__name__)
10
 
11
+ # Env vars
 
 
12
  HF_MODEL_REPO = os.getenv("HF_MODEL_REPO", "sathishaiuse/wellness-classifier-model")
13
  HF_MODEL_FILENAME = os.getenv("HF_MODEL_FILENAME", "best_overall_XGBoost.joblib")
14
  HF_TOKEN = os.getenv("HF_TOKEN") or None
15
 
16
+ # Local candidate paths to look for the model file
17
  LOCAL_CANDIDATES = [
18
  os.path.join("/app", HF_MODEL_FILENAME),
19
  os.path.join("/tmp", HF_MODEL_FILENAME),
 
21
  HF_MODEL_FILENAME
22
  ]
23
 
24
+ # -------------------------
25
+ # Helpers: inspect, try loaders
26
+ # -------------------------
27
def inspect_file(path, head_size=1024):
    """Collect basic diagnostics about a file without raising.

    Args:
        path: filesystem path to examine.
        head_size: number of leading bytes to read (default 1024).

    Returns:
        A dict that always contains ``path`` and ``exists``. For an existing
        path it also carries ``size`` (from os.path.getsize), ``head_bytes``
        (the raw leading bytes) and ``head_text`` (those bytes decoded as
        UTF-8 with undecodable bytes replaced). If any step fails, the error
        text is stored under ``inspect_error`` and whatever was gathered so
        far is returned.
    """
    info = {"path": path, "exists": False}
    try:
        info["exists"] = os.path.exists(path)
        if not info["exists"]:
            return info

        info["size"] = os.path.getsize(path)
        with open(path, "rb") as handle:
            head = handle.read(head_size)
        info["head_bytes"] = head
        # errors="replace" never raises, so no guard is needed around decode.
        info["head_text"] = head.decode("utf-8", errors="replace")
    except (OSError, TypeError, ValueError) as exc:
        # Best-effort diagnostics: record the problem instead of propagating.
        info["inspect_error"] = str(exc)
    return info
44
+
45
def try_joblib_load(path):
    """Attempt to deserialize ``path`` with joblib.

    Returns:
        ``("joblib", obj)`` on success, or ``("joblib", exc)`` where ``exc`` is
        the exception raised by joblib.load. Callers tell the two apart with
        ``isinstance(result, Exception)``; nothing is raised from here.
    """
    # SECURITY: joblib.load unpickles the file, and unpickling can execute
    # arbitrary code. Only point this at model files from a trusted source.
    logger.info("Trying joblib.load on %s", path)
    try:
        loaded = joblib.load(path)
    except Exception as exc:
        logger.exception("joblib.load failed: %s", exc)
        return ("joblib", exc)

    logger.info("joblib.load succeeded")
    return ("joblib", loaded)
54
+
55
class _BoosterWrapper:
    """Expose ``predict`` / ``predict_proba`` on top of a raw xgboost Booster.

    The ``_is_xgb_booster`` attribute marks instances so callers can detect
    that they are holding a wrapped Booster.
    """

    def __init__(self, booster):
        self.booster = booster
        self._is_xgb_booster = True

    def _raw_scores(self, X):
        """Run the booster on X (one row or a 2-D batch) and return its output."""
        import numpy as _np
        import xgboost as _xgb

        # atleast_2d lets a single flat feature vector be treated as one row.
        arr = _np.atleast_2d(_np.asarray(X, dtype=float))
        return self.booster.predict(_xgb.DMatrix(arr))

    def predict(self, X):
        """Return one class label per row as a plain list."""
        scores = self._raw_scores(X)
        if scores.ndim == 1:
            # 1-D output is treated as the positive-class probability.
            # NOTE(review): a booster that already emits class labels
            # (e.g. a softmax objective) would be mangled by this threshold
            # -- confirm the training objective.
            return (scores >= 0.5).astype(int).tolist()
        # 2-D output: one column per class, so the label is the best column.
        return scores.argmax(axis=1).tolist()

    def predict_proba(self, X):
        """Return per-class probabilities for each row as a list of lists."""
        import numpy as _np

        scores = self._raw_scores(X)
        if scores.ndim == 1:
            return (_np.vstack([1 - scores, scores]).T).tolist()
        return scores.tolist()


def try_xgboost_booster(path):
    """Attempt to load ``path`` as a native xgboost model.

    Returns:
        ``("xgboost_import", exc)`` when xgboost cannot be imported,
        ``("xgboost_booster", exc)`` when Booster.load_model fails, or
        ``("xgboost_booster", wrapper)`` where ``wrapper`` offers ``predict``
        and ``predict_proba`` and carries ``_is_xgb_booster = True``.
        Nothing is raised from here.
    """
    try:
        import xgboost as xgb
    except Exception as exc:
        logger.exception("xgboost import failed: %s", exc)
        return ("xgboost_import", exc)

    try:
        logger.info("Trying xgboost.Booster().load_model on %s", path)
        booster = xgb.Booster()
        booster.load_model(path)
    except Exception as exc:
        logger.exception("xgboost.Booster.load_model failed: %s", exc)
        return ("xgboost_booster", exc)

    logger.info("xgboost.Booster.load_model succeeded")
    return ("xgboost_booster", _BoosterWrapper(booster))
97
+
98
+ # -------------------------
99
+ # Core loader
100
+ # -------------------------
101
def _load_from_path(path):
    """Try every known loader on ``path``; return the first model, else None."""
    for loader in (try_joblib_load, try_xgboost_booster):
        # Each loader returns (tag, result); result is an Exception on failure.
        _tag, result = loader(path)
        if not isinstance(result, Exception):
            return result
    return None


def load_model():
    """Locate and load the model, preferring local copies over a download.

    Every existing path in LOCAL_CANDIDATES is tried with joblib and then the
    native xgboost loader. If none produces a model, the file is fetched with
    hf_hub_download and the same loaders are applied to the downloaded path.

    Returns:
        The loaded model object, or None when the download fails or no loader
        can read the file. Errors are logged rather than raised.
    """
    logger.info("==== MODEL LOAD START ====")
    logger.info("Repo: %s", HF_MODEL_REPO)
    logger.info("Filename: %s", HF_MODEL_FILENAME)
    logger.info("HF_TOKEN present? %s", bool(HF_TOKEN))

    # Try local candidates
    for path in LOCAL_CANDIDATES:
        try:
            info = inspect_file(path)
            logger.info("Inspecting local candidate: %s", info)
            if not info.get("exists"):
                continue
            model = _load_from_path(path)
            if model is not None:
                return model
        except Exception as exc:
            # Defensive: a bad candidate must not stop the remaining ones.
            logger.exception("Error while trying local candidate %s: %s", path, exc)

    # Try HF hub download; only the download itself is guarded here because
    # the inspection and loader helpers report their own failures.
    try:
        logger.info("Trying hf_hub_download from %s/%s", HF_MODEL_REPO, HF_MODEL_FILENAME)
        model_path = hf_hub_download(
            repo_id=HF_MODEL_REPO,
            filename=HF_MODEL_FILENAME,
            token=HF_TOKEN,
        )
    except Exception as exc:
        logger.exception("hf_hub_download failed: %s", exc)
        return None

    logger.info("Downloaded model to: %s", model_path)
    logger.info("Inspecting downloaded file: %s", inspect_file(model_path))

    model = _load_from_path(model_path)
    if model is None:
        logger.error("Tried joblib and xgboost loader on downloaded file but both failed.")
    return model
149
 
150
+ # -------------------------
151
+ # Prediction helper (robust)
152
+ # Accepts: features as dict, list, or list-of-lists
153
+ # Ensures sklearn pipelines that need DataFrame get a pandas.DataFrame
154
+ # -------------------------
155
def _positive_class_prob(row):
    """Return ``row[1]`` as a float, or None when that entry is unusable."""
    try:
        return float(row[1])
    except (TypeError, ValueError, IndexError, KeyError):
        return None


def _top_class_prob(row):
    """Return the largest value of ``row`` as a float, or None on failure."""
    try:
        return float(max(row))
    except (TypeError, ValueError):
        return None


def _predict_with_probs(model, data, pick_prob):
    """Return ``(predictions, per_row_probabilities)`` for ``data``.

    The probability list is None when the model has no ``predict_proba``.
    """
    preds = model.predict(data)
    if not hasattr(model, "predict_proba"):
        return preds, None
    return preds, [pick_prob(row) for row in model.predict_proba(data)]


def _predict_array(model, arr, pick_prob):
    """Predict on a 2-D array, retrying with a DataFrame if the model rejects it."""
    try:
        return _predict_with_probs(model, arr, pick_prob)
    except Exception as exc:
        import pandas as _pd

        # Last resort for pipelines that insist on named columns: label the
        # columns "0", "1", ... and try once more. A second failure propagates.
        logger.warning("Model rejected ndarray input (%s); retrying with a DataFrame", exc)
        cols = [str(i) for i in range(arr.shape[1])]
        return _predict_with_probs(model, _pd.DataFrame(arr, columns=cols), pick_prob)


def predict(model, features):
    """Run ``model`` on ``features`` and return a plain result dict.

    Args:
        model: object returned by load_model(); None yields an error dict.
        features: one of
            * dict mapping column name -> value (key order is preserved and
              used as the DataFrame column order),
            * list/tuple holding one feature vector in the expected order,
            * list/tuple of lists/tuples for a batch of rows.

    Returns:
        ``{"prediction": ..., "probability": ...}`` or ``{"error": "..."}``.
        For a single row both values are scalars (probability is None when it
        is unavailable). For a batch, ``prediction`` is a list of labels and
        ``probability`` is a list with one entry per row (or None). For a
        wrapped xgboost booster the probability is the second class column;
        for other models it is the largest class probability.
    """
    if model is None:
        return {"error": "Model not loaded"}

    try:
        import numpy as _np

        is_dict = isinstance(features, dict)
        if not is_dict and not isinstance(features, (list, tuple)):
            return {"error": "Unsupported features format. Provide dict (col->val) or list of values."}

        # Wrapped boosters are tagged with _is_xgb_booster by the loader.
        is_booster = hasattr(model, "_is_xgb_booster")
        if not is_booster and not hasattr(model, "predict"):
            return {"error": "Loaded model object not recognized (no predict method)"}
        pick_prob = _positive_class_prob if is_booster else _top_class_prob

        # Case A: dict -> single-row DataFrame with the dict keys as columns.
        if is_dict:
            import pandas as _pd

            col_names = [str(key) for key in features]
            frame = _pd.DataFrame([list(features.values())], columns=col_names)
            logger.info(f"Prepared DataFrame for prediction with columns: {col_names}")

            # The booster wrapper needs a numeric array; pipelines get the frame.
            data = frame.values.astype(float) if is_booster else frame
            preds, probs = _predict_with_probs(model, data, pick_prob)
            label = preds[0] if _np.ndim(preds) > 0 else preds
            if is_booster:
                label = int(label)
            else:
                # Convert numpy integer types to native int where possible.
                try:
                    label = int(label)
                except (TypeError, ValueError):
                    pass
            return {"prediction": label, "probability": probs[0] if probs else None}

        # Batch detection must come before the single-row case, because a
        # list of lists is itself a list.
        is_batch = len(features) > 0 and isinstance(features[0], (list, tuple))
        arr = _np.array(features if is_batch else [features], dtype=float)

        if is_booster:
            preds, probs = _predict_with_probs(model, arr, pick_prob)
        else:
            preds, probs = _predict_array(model, arr, pick_prob)

        # Case C: batch -> list of labels plus one probability per row.
        if is_batch:
            labels = preds.tolist() if hasattr(preds, "tolist") else list(preds)
            return {"prediction": labels, "probability": probs}

        # Case B: single unnamed row.
        label = preds[0] if _np.ndim(preds) > 0 else preds
        if is_booster:
            label = int(label)
        return {"prediction": label, "probability": probs[0] if probs else None}
    except Exception as e:
        logger.exception(f"Prediction error: {e}")
        return {"error": str(e)}