Spaces:

devorbit
/

murshid

Paused

App Files Files Community

murshid / murshid_backend /app /ml /logistic_model.py

devorbit

Initial deployment - secrets removed

26e1c2e about 2 months ago

raw

history blame contribute delete

3.79 kB

	"""
	Logistic Regression — PRIMARY model per user decision.

	Inference logic extracted VERBATIM from MurshidUIPipeline.ipynb (cell 18-19):

	logreg_model = joblib.load(f"{BASE_PATH}/murshid_logreg_pipeline_manual_oof_pcatuned.joblib")
	logreg_thr = np.load(f"{BASE_PATH}/murshid_logreg_thresholds_manual_oof_pcatuned.npy")

	proba = logreg_model.predict_proba(X_user)

	if isinstance(proba, list):
	    proba = np.column_stack([p[:, 1] for p in proba])
	elif proba.ndim == 3:
	    proba = proba[:, :, 1]

	proba = proba.reshape(-1)

	pred_logreg = (proba >= logreg_thr).astype(int)
	conf_logreg = proba * 100
	gap_logreg = proba - logreg_thr

	Original notebook file is NOT modified.
	"""

	from __future__ import annotations

	import json
	from pathlib import Path

	import joblib
	import numpy as np

	from app.config import settings


	class LogisticRegressionModel:
	"""
	Wraps the trained Logistic Regression pipeline + per-label thresholds.
	File structure (from notebook cell 18):
	logreg_model → sklearn Pipeline (PCA-tuned + OneVsRestClassifier(LogReg))
	logreg_thr → np.ndarray shape (n_techniques,) per-label thresholds
	"""

	def __init__(self, models_dir: Path \| None = None) -> None:
	base = Path(models_dir or settings.murshid_models_dir).resolve()

	logreg_path = base / settings.logreg_joblib
	thr_path = base / settings.logreg_thresholds_npy
	labels_path = base / settings.label_columns_json

	for p in (logreg_path, thr_path, labels_path):
	if not p.is_file():
	raise FileNotFoundError(f"Missing model file: {p}")

	# --- notebook cell 18: load model + thresholds ---
	self._model = joblib.load(logreg_path) # logreg_model
	self._thr = np.load(thr_path) # logreg_thr

	with open(labels_path, encoding="utf-8") as f:
	self.technique_names: list[str] = json.load(f)

	n = len(self.technique_names)
	if self._thr.shape[0] != n:
	raise ValueError(
	f"LogReg thresholds length {self._thr.shape[0]} != {n} labels"
	)

	# ------------------------------------------------------------------

	def predict(self, embedding_1d: np.ndarray) -> list[dict]:
	"""
	Run LogReg inference exactly as in notebook cell 19.

	Returns list of dicts sorted by confidence_percent desc:
	technique_id, predicted, confidence_percent, proba, threshold, gap
	"""
	X_user = embedding_1d.reshape(1, -1)

	# --- verbatim from notebook cell 19 ---
	proba = self._model.predict_proba(X_user)

	if isinstance(proba, list):
	proba = np.column_stack([p[:, 1] for p in proba])
	elif proba.ndim == 3:
	proba = proba[:, :, 1]

	proba = proba.reshape(-1)

	pred_logreg = (proba >= self._thr).astype(int)
	conf_logreg = proba * 100
	gap_logreg = proba - self._thr
	# --- end verbatim ---

	results = [
	{
	"technique_id": self.technique_names[i],
	"predicted": bool(pred_logreg[i]),
	"confidence_percent": round(float(conf_logreg[i]), 2),
	"proba": round(float(proba[i]), 4),
	"threshold": round(float(self._thr[i]), 4),
	"gap": round(float(gap_logreg[i]), 4),
	}
	for i in range(len(self.technique_names))
	]

	# sort: predicted first, then by confidence desc (notebook sort logic)
	return sorted(
	results,
	key=lambda r: (r["predicted"], r["confidence_percent"]),
	reverse=True,
	)