""" Logistic Regression — PRIMARY model per user decision. Inference logic extracted VERBATIM from MurshidUIPipeline.ipynb (cell 18-19): logreg_model = joblib.load(f"{BASE_PATH}/murshid_logreg_pipeline_manual_oof_pcatuned.joblib") logreg_thr = np.load(f"{BASE_PATH}/murshid_logreg_thresholds_manual_oof_pcatuned.npy") proba = logreg_model.predict_proba(X_user) if isinstance(proba, list): proba = np.column_stack([p[:, 1] for p in proba]) elif proba.ndim == 3: proba = proba[:, :, 1] proba = proba.reshape(-1) pred_logreg = (proba >= logreg_thr).astype(int) conf_logreg = proba * 100 gap_logreg = proba - logreg_thr Original notebook file is NOT modified. """ from __future__ import annotations import json from pathlib import Path import joblib import numpy as np from app.config import settings class LogisticRegressionModel: """ Wraps the trained Logistic Regression pipeline + per-label thresholds. File structure (from notebook cell 18): logreg_model → sklearn Pipeline (PCA-tuned + OneVsRestClassifier(LogReg)) logreg_thr → np.ndarray shape (n_techniques,) per-label thresholds """ def __init__(self, models_dir: Path | None = None) -> None: base = Path(models_dir or settings.murshid_models_dir).resolve() logreg_path = base / settings.logreg_joblib thr_path = base / settings.logreg_thresholds_npy labels_path = base / settings.label_columns_json for p in (logreg_path, thr_path, labels_path): if not p.is_file(): raise FileNotFoundError(f"Missing model file: {p}") # --- notebook cell 18: load model + thresholds --- self._model = joblib.load(logreg_path) # logreg_model self._thr = np.load(thr_path) # logreg_thr with open(labels_path, encoding="utf-8") as f: self.technique_names: list[str] = json.load(f) n = len(self.technique_names) if self._thr.shape[0] != n: raise ValueError( f"LogReg thresholds length {self._thr.shape[0]} != {n} labels" ) # ------------------------------------------------------------------ def predict(self, embedding_1d: np.ndarray) -> list[dict]: """ Run LogReg inference exactly as in notebook cell 19. Returns list of dicts sorted by confidence_percent desc: technique_id, predicted, confidence_percent, proba, threshold, gap """ X_user = embedding_1d.reshape(1, -1) # --- verbatim from notebook cell 19 --- proba = self._model.predict_proba(X_user) if isinstance(proba, list): proba = np.column_stack([p[:, 1] for p in proba]) elif proba.ndim == 3: proba = proba[:, :, 1] proba = proba.reshape(-1) pred_logreg = (proba >= self._thr).astype(int) conf_logreg = proba * 100 gap_logreg = proba - self._thr # --- end verbatim --- results = [ { "technique_id": self.technique_names[i], "predicted": bool(pred_logreg[i]), "confidence_percent": round(float(conf_logreg[i]), 2), "proba": round(float(proba[i]), 4), "threshold": round(float(self._thr[i]), 4), "gap": round(float(gap_logreg[i]), 4), } for i in range(len(self.technique_names)) ] # sort: predicted first, then by confidence desc (notebook sort logic) return sorted( results, key=lambda r: (r["predicted"], r["confidence_percent"]), reverse=True, )