File size: 3,787 Bytes
26e1c2e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
"""
Logistic Regression — PRIMARY model per user decision.

Inference logic extracted VERBATIM from MurshidUIPipeline.ipynb (cell 18-19):

    logreg_model = joblib.load(f"{BASE_PATH}/murshid_logreg_pipeline_manual_oof_pcatuned.joblib")
    logreg_thr   = np.load(f"{BASE_PATH}/murshid_logreg_thresholds_manual_oof_pcatuned.npy")

    proba = logreg_model.predict_proba(X_user)

    if isinstance(proba, list):
        proba = np.column_stack([p[:, 1] for p in proba])
    elif proba.ndim == 3:
        proba = proba[:, :, 1]

    proba = proba.reshape(-1)

    pred_logreg = (proba >= logreg_thr).astype(int)
    conf_logreg = proba * 100
    gap_logreg  = proba - logreg_thr

Original notebook file is NOT modified.
"""

from __future__ import annotations

import json
from pathlib import Path

import joblib
import numpy as np

from app.config import settings


class LogisticRegressionModel:
    """
    Wraps the trained Logistic Regression pipeline + per-label thresholds.
    File structure (from notebook cell 18):
        logreg_model  → sklearn Pipeline (PCA-tuned + OneVsRestClassifier(LogReg))
        logreg_thr    → np.ndarray shape (n_techniques,)  per-label thresholds
    """

    def __init__(self, models_dir: Path | None = None) -> None:
        base = Path(models_dir or settings.murshid_models_dir).resolve()

        logreg_path  = base / settings.logreg_joblib
        thr_path     = base / settings.logreg_thresholds_npy
        labels_path  = base / settings.label_columns_json

        for p in (logreg_path, thr_path, labels_path):
            if not p.is_file():
                raise FileNotFoundError(f"Missing model file: {p}")

        # --- notebook cell 18: load model + thresholds ---
        self._model    = joblib.load(logreg_path)   # logreg_model
        self._thr      = np.load(thr_path)           # logreg_thr

        with open(labels_path, encoding="utf-8") as f:
            self.technique_names: list[str] = json.load(f)

        n = len(self.technique_names)
        if self._thr.shape[0] != n:
            raise ValueError(
                f"LogReg thresholds length {self._thr.shape[0]} != {n} labels"
            )

    # ------------------------------------------------------------------

    def predict(self, embedding_1d: np.ndarray) -> list[dict]:
        """
        Run LogReg inference exactly as in notebook cell 19.

        Returns list of dicts sorted by confidence_percent desc:
            technique_id, predicted, confidence_percent, proba, threshold, gap
        """
        X_user = embedding_1d.reshape(1, -1)

        # --- verbatim from notebook cell 19 ---
        proba = self._model.predict_proba(X_user)

        if isinstance(proba, list):
            proba = np.column_stack([p[:, 1] for p in proba])
        elif proba.ndim == 3:
            proba = proba[:, :, 1]

        proba = proba.reshape(-1)

        pred_logreg = (proba >= self._thr).astype(int)
        conf_logreg = proba * 100
        gap_logreg  = proba - self._thr
        # --- end verbatim ---

        results = [
            {
                "technique_id":       self.technique_names[i],
                "predicted":          bool(pred_logreg[i]),
                "confidence_percent": round(float(conf_logreg[i]), 2),
                "proba":              round(float(proba[i]), 4),
                "threshold":          round(float(self._thr[i]), 4),
                "gap":                round(float(gap_logreg[i]), 4),
            }
            for i in range(len(self.technique_names))
        ]

        # sort: predicted first, then by confidence desc  (notebook sort logic)
        return sorted(
            results,
            key=lambda r: (r["predicted"], r["confidence_percent"]),
            reverse=True,
        )