murshid / murshid_backend /app /ml /logistic_model.py
devorbit's picture
Initial deployment - secrets removed
26e1c2e
"""
Logistic Regression — PRIMARY model per user decision.
Inference logic extracted VERBATIM from MurshidUIPipeline.ipynb (cell 18-19):
logreg_model = joblib.load(f"{BASE_PATH}/murshid_logreg_pipeline_manual_oof_pcatuned.joblib")
logreg_thr = np.load(f"{BASE_PATH}/murshid_logreg_thresholds_manual_oof_pcatuned.npy")
proba = logreg_model.predict_proba(X_user)
if isinstance(proba, list):
    proba = np.column_stack([p[:, 1] for p in proba])
elif proba.ndim == 3:
    proba = proba[:, :, 1]
proba = proba.reshape(-1)
pred_logreg = (proba >= logreg_thr).astype(int)
conf_logreg = proba * 100
gap_logreg = proba - logreg_thr
Original notebook file is NOT modified.
"""
from __future__ import annotations
import json
from pathlib import Path
import joblib
import numpy as np
from app.config import settings
class LogisticRegressionModel:
    """
    Wrap the trained Logistic Regression pipeline and its per-label thresholds.

    Artifacts (as described for notebook cell 18):
        logreg_model → sklearn Pipeline (PCA-tuned + OneVsRestClassifier(LogReg))
        logreg_thr   → np.ndarray of shape (n_techniques,), per-label thresholds

    Attributes:
        technique_names: label names read from the label-columns JSON file;
            position ``i`` corresponds to threshold ``i`` and probability ``i``.
    """

    def __init__(self, models_dir: Path | None = None) -> None:
        """
        Load the model, its thresholds and the label names from disk.

        Args:
            models_dir: Directory containing the artifacts. When omitted (or
                otherwise falsy), ``settings.murshid_models_dir`` is used.

        Raises:
            FileNotFoundError: If any of the three artifact files is missing.
            ValueError: If the number of thresholds differs from the number
                of labels.
        """
        base = Path(models_dir or settings.murshid_models_dir).resolve()
        logreg_path = base / settings.logreg_joblib
        thr_path = base / settings.logreg_thresholds_npy
        labels_path = base / settings.label_columns_json

        # Fail early, naming the exact file, before any (slow) loading starts.
        for p in (logreg_path, thr_path, labels_path):
            if not p.is_file():
                raise FileNotFoundError(f"Missing model file: {p}")

        # --- notebook cell 18: load model + thresholds ---
        # NOTE(security): joblib.load unpickles the file and can therefore run
        # arbitrary code; only point this at trusted model artifacts.
        self._model = joblib.load(logreg_path)  # logreg_model
        self._thr = np.load(thr_path)  # logreg_thr

        with open(labels_path, encoding="utf-8") as f:
            self.technique_names: list[str] = json.load(f)

        # Thresholds are applied positionally, so the counts must agree.
        n = len(self.technique_names)
        if self._thr.shape[0] != n:
            raise ValueError(
                f"LogReg thresholds length {self._thr.shape[0]} != {n} labels"
            )

    # ------------------------------------------------------------------

    def predict(self, embedding_1d: np.ndarray) -> list[dict]:
        """
        Run LogReg inference on a single embedding (logic of notebook cell 19).

        Args:
            embedding_1d: One embedding vector. Any array-like is accepted; it
                is reshaped into a single row before being passed to the model.

        Returns:
            One dict per label with the keys ``technique_id``, ``predicted``,
            ``confidence_percent``, ``proba``, ``threshold`` and ``gap``.
            Predicted labels come first; within each group entries are ordered
            by ``confidence_percent`` descending.

        Raises:
            ValueError: If the model does not return exactly one probability
                per configured label.
        """
        # asarray lets callers pass plain sequences as well as ndarrays.
        X_user = np.asarray(embedding_1d).reshape(1, -1)

        # --- notebook cell 19: normalise predict_proba output to a flat vector ---
        proba = self._model.predict_proba(X_user)
        if isinstance(proba, list):
            # One (n_samples, 2) array per label: keep the second column of each.
            proba = np.column_stack([p[:, 1] for p in proba])
        elif proba.ndim == 3:
            # Stacked per-label outputs: keep index 1 of the last axis.
            proba = proba[:, :, 1]
        proba = proba.reshape(-1)

        # Without this check a length mismatch either broadcasts silently
        # (single label, several probabilities) or fails with an opaque
        # IndexError / broadcasting error further down.
        n_labels = len(self.technique_names)
        if proba.shape[0] != n_labels:
            raise ValueError(
                f"Model returned {proba.shape[0]} probabilities "
                f"but {n_labels} labels are configured"
            )

        pred_logreg = (proba >= self._thr).astype(int)
        conf_logreg = proba * 100
        gap_logreg = proba - self._thr
        # --- end notebook logic ---

        results = [
            {
                "technique_id": name,
                "predicted": bool(hit),
                "confidence_percent": round(float(conf), 2),
                "proba": round(float(p), 4),
                "threshold": round(float(thr), 4),
                "gap": round(float(gap), 4),
            }
            for name, hit, conf, p, thr, gap in zip(
                self.technique_names,
                pred_logreg,
                conf_logreg,
                proba,
                self._thr,
                gap_logreg,
            )
        ]

        # sort: predicted first, then by confidence desc (notebook sort logic)
        return sorted(
            results,
            key=lambda r: (r["predicted"], r["confidence_percent"]),
            reverse=True,
        )