"""
Logistic Regression — PRIMARY model per user decision.

Inference logic extracted VERBATIM from MurshidUIPipeline.ipynb (cells 18-19):

    logreg_model = joblib.load(f"{BASE_PATH}/murshid_logreg_pipeline_manual_oof_pcatuned.joblib")
    logreg_thr = np.load(f"{BASE_PATH}/murshid_logreg_thresholds_manual_oof_pcatuned.npy")

    proba = logreg_model.predict_proba(X_user)

    if isinstance(proba, list):
        proba = np.column_stack([p[:, 1] for p in proba])
    elif proba.ndim == 3:
        proba = proba[:, :, 1]

    proba = proba.reshape(-1)

    pred_logreg = (proba >= logreg_thr).astype(int)
    conf_logreg = proba * 100
    gap_logreg = proba - logreg_thr

The original notebook file is NOT modified.
"""
|
|
| from __future__ import annotations |
|
|
| import json |
| from pathlib import Path |
|
|
| import joblib |
| import numpy as np |
|
|
| from app.config import settings |
|
|
|
|
| class LogisticRegressionModel: |
| """ |
| Wraps the trained Logistic Regression pipeline + per-label thresholds. |
| File structure (from notebook cell 18): |
| logreg_model → sklearn Pipeline (PCA-tuned + OneVsRestClassifier(LogReg)) |
| logreg_thr → np.ndarray shape (n_techniques,) per-label thresholds |
| """ |
|
|
| def __init__(self, models_dir: Path | None = None) -> None: |
| base = Path(models_dir or settings.murshid_models_dir).resolve() |
|
|
| logreg_path = base / settings.logreg_joblib |
| thr_path = base / settings.logreg_thresholds_npy |
| labels_path = base / settings.label_columns_json |
|
|
| for p in (logreg_path, thr_path, labels_path): |
| if not p.is_file(): |
| raise FileNotFoundError(f"Missing model file: {p}") |
|
|
| |
| self._model = joblib.load(logreg_path) |
| self._thr = np.load(thr_path) |
|
|
| with open(labels_path, encoding="utf-8") as f: |
| self.technique_names: list[str] = json.load(f) |
|
|
| n = len(self.technique_names) |
| if self._thr.shape[0] != n: |
| raise ValueError( |
| f"LogReg thresholds length {self._thr.shape[0]} != {n} labels" |
| ) |
|
|
| |
|
|
| def predict(self, embedding_1d: np.ndarray) -> list[dict]: |
| """ |
| Run LogReg inference exactly as in notebook cell 19. |
| |
| Returns list of dicts sorted by confidence_percent desc: |
| technique_id, predicted, confidence_percent, proba, threshold, gap |
| """ |
| X_user = embedding_1d.reshape(1, -1) |
|
|
| |
| proba = self._model.predict_proba(X_user) |
|
|
| if isinstance(proba, list): |
| proba = np.column_stack([p[:, 1] for p in proba]) |
| elif proba.ndim == 3: |
| proba = proba[:, :, 1] |
|
|
| proba = proba.reshape(-1) |
|
|
| pred_logreg = (proba >= self._thr).astype(int) |
| conf_logreg = proba * 100 |
| gap_logreg = proba - self._thr |
| |
|
|
| results = [ |
| { |
| "technique_id": self.technique_names[i], |
| "predicted": bool(pred_logreg[i]), |
| "confidence_percent": round(float(conf_logreg[i]), 2), |
| "proba": round(float(proba[i]), 4), |
| "threshold": round(float(self._thr[i]), 4), |
| "gap": round(float(gap_logreg[i]), 4), |
| } |
| for i in range(len(self.technique_names)) |
| ] |
|
|
| |
| return sorted( |
| results, |
| key=lambda r: (r["predicted"], r["confidence_percent"]), |
| reverse=True, |
| ) |
|
|