File size: 15,084 Bytes
5a264f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
8414ee3
5a264f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0f28ff9
5a264f5
0f28ff9
5a264f5
0f28ff9
5a264f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
"""
ml_engine.py
────────────
Wraps the psychometric tabular model and the RoBERTa text model into a
single predict() function that returns a fused 5-class stress result.

Models are loaded by init_models() and cached in module-level globals.
If neither model can be loaded the module operates in DEMO mode, returning
rule-based heuristic predictions so the web app can run without GPU weights.
"""

from __future__ import annotations

import os
import re
import logging
import warnings
import numpy as np
import pandas as pd

warnings.filterwarnings("ignore")
logger = logging.getLogger(__name__)

# ── Label architecture ───────────────────────────────────────────────────────
# Severity assigned to each psychometric class. predict() weights these by the
# class probabilities to obtain an expected severity between 0 and 1.
PSYCHO_SCORE = {"High": 1.00, "Medium": 0.50, "Low": 0.00}

# Severity assigned to each text class; combined with probabilities the same way.
TEXT_SCORE = {
    "Normal": 0.00,
    "Stress": 0.45,
    "Personality disorder": 0.60,
    "Bipolar": 0.65,
    "Anxiety": 0.70,
    "Depression": 0.80,
    "Suicidal": 1.00,
}

# Edges of the five half-open fused-score bins. The last edge sits just above
# 1.0 so that a score of exactly 1.0 still lands inside the final bin.
FUSED_BINS   = [0.0, 0.2, 0.4, 0.6, 0.8, 1.001]
FUSED_LABELS = ["Minimal", "Mild", "Moderate", "Severe", "Critical"]
# Alphabetical class order.
# NOTE(review): assumes the text model's output indices follow this order —
# confirm against the label encoding used at training time.
TEXT_CLASSES = sorted(TEXT_SCORE.keys())
PSYCHO_CLASSES_DEFAULT = ["High", "Low", "Medium"]   # sorted order as LabelEncoder would produce


def _score_to_fused(score: float) -> str:
    """Map a fused severity score onto one of the FUSED_LABELS.

    Bins are half-open ``[lo, hi)`` intervals taken from consecutive
    FUSED_BINS edges. Scores outside the covered range are clamped: anything
    below the first edge is reported as the mildest label, anything at or
    above the last edge as the most severe one.

    Parameters
    ----------
    score : fused severity, nominally in [0, 1]

    Returns
    -------
    The matching entry of FUSED_LABELS.
    """
    # Without this guard a negative score matches no bin and would fall
    # through to the "most severe" fallback at the bottom.
    if score < FUSED_BINS[0]:
        return FUSED_LABELS[0]
    for lo, hi, lbl in zip(FUSED_BINS[:-1], FUSED_BINS[1:], FUSED_LABELS):
        if lo <= score < hi:
            return lbl
    # Reached for scores at/above the last edge (and for NaN).
    return FUSED_LABELS[-1]


# ── Model artefacts (lazy-loaded) ────────────────────────────────────────────
# Everything below starts unset; _load_psycho() and _load_roberta() rebind
# these names once the corresponding files have been read.

# Psychometric pipeline: classifier, the two scalers, encoders, feature lists.
_psycho_model = _base_scaler = _final_scaler = None
_le_dict = _le_target = None
_selected_cols = _poly = _top_num = None
# File name of the classifier that was picked up ("" until one is loaded).
_loaded_model_name = ""

# Text pipeline: RoBERTa classifier and its tokenizer.
_roberta_model = _tokenizer = None

# Flips to True if weights are missing.
DEMO_MODE = False


def _load_psycho(model_dir: str) -> bool:
    """Load the psychometric preprocessing artefacts and one classifier.

    The seven preprocessing pickles are read first, then the first classifier
    file (in preference order) that exists in ``model_dir`` is loaded and its
    file name remembered in ``_loaded_model_name``.

    Parameters
    ----------
    model_dir : directory containing the ``*.pkl`` artefacts

    Returns
    -------
    True when a classifier was loaded, False when none was found or any step
    raised (the failure is logged as a warning, never propagated).
    """
    global _psycho_model, _base_scaler, _final_scaler, _le_dict, _le_target
    global _selected_cols, _poly, _top_num, _loaded_model_name

    # Classifier file names, most preferred first.
    model_files = (
        "stacking_ensemble_best_model.pkl",
        "lightgbm_best_model.pkl",
        "catboost_best_model.pkl",
        "xgboost_best_model.pkl",
        "random_forest_best_model.pkl",
        "mlp_sklearn_best_model.pkl",
    )

    try:
        import joblib

        def read(filename):
            # joblib unpickles the file, so model_dir must hold trusted artefacts.
            return joblib.load(os.path.join(model_dir, filename))

        _base_scaler = read("base_scaler.pkl")
        _final_scaler = read("final_scaler.pkl")
        _le_dict = read("le_dict.pkl")
        _le_target = read("le_target.pkl")
        _selected_cols = read("selected_cols.pkl")
        _poly = read("poly.pkl")
        _top_num = read("top_num.pkl")

        chosen = next(
            (name for name in model_files
             if os.path.exists(os.path.join(model_dir, name))),
            None,
        )
        if chosen is None:
            logger.warning("Psychometric model pkl not found in %s", model_dir)
            return False

        _psycho_model = read(chosen)
        _loaded_model_name = chosen
        logger.info("Loaded psychometric model: %s", chosen)
        return True
    except Exception as exc:
        logger.warning("Failed to load psychometric model: %s", exc)
        return False


def _load_roberta(ckpt_path: str) -> bool:
    """Load the fine-tuned RoBERTa classifier and its tokenizer.

    The classifier is a ``roberta-base`` encoder followed by dropout and a
    7-way linear head; the weights are restored from ``ckpt_path`` with
    ``load_state_dict``. The module-level ``_roberta_model`` and
    ``_tokenizer`` are only rebound once *both* have loaded, so callers never
    observe a model without a tokenizer.

    Parameters
    ----------
    ckpt_path : path to the saved state dict

    Returns
    -------
    True on success, False when the checkpoint is missing or any step raised
    (the failure is logged as a warning, never propagated).
    """
    global _roberta_model, _tokenizer

    # Bail out before importing torch/transformers: without a checkpoint the
    # backbone would be instantiated only for load_state_dict to fail.
    if not ckpt_path or not os.path.isfile(ckpt_path):
        logger.warning(
            "Failed to load RoBERTa model: %s",
            f"checkpoint not found at {ckpt_path!r}",
        )
        return False

    try:
        import torch
        import torch.nn as nn
        from transformers import AutoTokenizer, AutoModel

        MODEL_NAME = "roberta-base"

        class RobertaClassifier(nn.Module):
            """roberta-base encoder + dropout + linear head over 7 classes."""

            def __init__(self):
                super().__init__()
                self.roberta = AutoModel.from_pretrained(MODEL_NAME)
                self.dropout = nn.Dropout(0.3)
                # 7 outputs: must match the head stored in the checkpoint.
                self.fc      = nn.Linear(self.roberta.config.hidden_size, 7)
                # Freeze the encoder except for its last three layers.
                for p in self.roberta.parameters():
                    p.requires_grad = False
                for layer in self.roberta.encoder.layer[-3:]:
                    for p in layer.parameters():
                        p.requires_grad = True

            def forward(self, input_ids, attention_mask):
                out     = self.roberta(input_ids=input_ids,
                                       attention_mask=attention_mask)
                # Representation of the first token is fed to the head.
                cls_out = out.last_hidden_state[:, 0]
                return self.fc(self.dropout(cls_out))

        device = "cuda" if torch.cuda.is_available() else "cpu"
        model  = RobertaClassifier().to(device)
        # NOTE(security): torch.load unpickles the file — only point this at
        # checkpoints from a trusted source.
        model.load_state_dict(torch.load(ckpt_path, map_location=device))
        model.eval()
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    except Exception as exc:
        logger.warning("Failed to load RoBERTa model: %s", exc)
        return False

    # Publish both artefacts together so a tokenizer failure cannot leave a
    # usable-looking model behind.
    _roberta_model = model
    _tokenizer     = tokenizer
    logger.info("Loaded RoBERTa model from %s", ckpt_path)
    return True


def init_models(psycho_model_dir: str, roberta_ckpt: str) -> None:
    """Load both models and decide whether the module runs in DEMO mode.

    Relative paths are resolved against the directory two levels above this
    file (the project root, where app.py lives). ``DEMO_MODE`` is recomputed
    on every call: it is True only when neither model could be loaded, so a
    later successful call switches demo mode off again.

    Parameters
    ----------
    psycho_model_dir : directory with the psychometric ``*.pkl`` artefacts
    roberta_ckpt     : path to the RoBERTa state-dict checkpoint
    """
    global DEMO_MODE
    # Resolve relative paths from the project root (where app.py lives)
    _root = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
    if psycho_model_dir and not os.path.isabs(psycho_model_dir):
        psycho_model_dir = os.path.join(_root, psycho_model_dir)
    if roberta_ckpt and not os.path.isabs(roberta_ckpt):
        roberta_ckpt = os.path.join(_root, roberta_ckpt)

    ok_p = _load_psycho(psycho_model_dir)
    ok_r = _load_roberta(roberta_ckpt)

    # Assign unconditionally so a stale True from an earlier failed call is
    # cleared once at least one model is available.
    DEMO_MODE = not (ok_p or ok_r)
    if DEMO_MODE:
        logger.warning("Both models unavailable — running in DEMO mode.")
    elif not ok_p:
        logger.warning("Psychometric model unavailable — text-only mode.")
    elif not ok_r:
        logger.warning("RoBERTa model unavailable — psychometric-only mode.")


# ── Feature engineering (mirrors notebook) ──────────────────────────────────

def _add_interactions(X: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``X`` extended with hand-crafted interaction columns.

    Each derived column is only added when every column it is built from is
    present; the input frame itself is never modified.
    """
    eps = 1e-6  # keeps the two ratio features finite when the divisor is 0

    # (new column, required inputs, formula) — applied in this order.
    recipes = (
        ("sleep_score", ("Sleep_Duration", "Sleep_Quality"),
         lambda f: f["Sleep_Duration"] * f["Sleep_Quality"]),
        ("screen_sleep_ratio", ("Sleep_Quality", "Screen_Time"),
         lambda f: f["Screen_Time"] / (f["Sleep_Quality"] + eps)),
        ("activity_work_ratio", ("Physical_Activity", "Work_Hours"),
         lambda f: f["Physical_Activity"] / (f["Work_Hours"] + eps)),
        ("social_travel", ("Social_Interactions", "Travel_Time"),
         lambda f: f["Social_Interactions"] * f["Travel_Time"]),
        ("daily_burden", ("Work_Hours", "Travel_Time", "Screen_Time"),
         lambda f: f["Work_Hours"] + f["Travel_Time"] + f["Screen_Time"]),
        ("cardio_risk",
         ("Blood_Pressure", "Cholesterol_Level", "Blood_Sugar_Level"),
         lambda f: (f["Blood_Pressure"] + f["Cholesterol_Level"]
                    + f["Blood_Sugar_Level"])),
        ("stimulant_load", ("Caffeine_Intake", "Alcohol_Intake"),
         lambda f: f["Caffeine_Intake"] + 2 * f["Alcohol_Intake"]),
        ("recovery_index", ("Physical_Activity", "Sleep_Duration"),
         lambda f: f["Physical_Activity"] + f["Sleep_Duration"]),
    )

    out = X.copy()
    for new_col, needed, formula in recipes:
        if all(col in out.columns for col in needed):
            out[new_col] = formula(out)
    return out


def _preprocess_psycho(raw_df: pd.DataFrame) -> np.ndarray:
    """Run the psychometric feature pipeline and return class probabilities.

    Steps: drop the target column if present, label-encode the columns listed
    in ``_le_dict``, align columns to the base scaler's recorded feature names
    (missing ones are filled with 0.0), add interaction and polynomial
    features, keep ``_selected_cols`` and, for the model files that need it,
    apply ``_final_scaler`` before calling ``predict_proba``.

    Returns
    -------
    The array produced by ``_psycho_model.predict_proba``
    (presumably shape (N, 3) in ``_le_target.classes_`` order — verify
    against the trained model).
    """
    frame = raw_df.copy()
    if "Stress_Detection" in frame.columns:
        frame = frame.drop(columns=["Stress_Detection"])

    # Encode categorical columns with the encoders saved at training time.
    for name, encoder in _le_dict.items():
        if name not in frame.columns:
            continue
        frame[name] = encoder.transform(frame[name])

    # Reproduce the training column layout when the scaler recorded one.
    if hasattr(_base_scaler, "feature_names_in_"):
        expected = _base_scaler.feature_names_in_
        for name in expected:
            if name not in frame.columns:
                frame[name] = 0.0
        frame = frame[expected]

    enriched = _add_interactions(frame)
    poly_values = _poly.transform(enriched[_top_num])
    poly_frame = pd.DataFrame(
        poly_values,
        columns=[f"poly_{idx}" for idx in range(poly_values.shape[1])],
    )
    combined = pd.concat(
        [enriched.reset_index(drop=True), poly_frame], axis=1
    )
    features = combined[_selected_cols].copy()

    # Only these model files get the final scaler applied to their input.
    scaled_models = (
        "mlp_sklearn_best_model.pkl",
        "svm-rbf_best_model.pkl",
        "logreg_best_model.pkl",
    )
    if _loaded_model_name in scaled_models:
        features = pd.DataFrame(
            _final_scaler.transform(features), columns=_selected_cols
        )

    return _psycho_model.predict_proba(features)


def _get_text_proba(text: str) -> np.ndarray:
    """Return the text model's class probabilities for a single note.

    The note is lower-cased and stripped of everything except ASCII letters
    and whitespace, tokenised to a fixed length of 128, and pushed through
    ``_roberta_model`` without gradient tracking. The softmax of the first
    (only) row of logits is returned as a NumPy array
    (presumably 7 entries in TEXT_CLASSES order — verify against training).
    """
    import torch
    import torch.nn.functional as F

    # Run on whatever device the model's weights live on.
    target = next(_roberta_model.parameters()).device

    cleaned = re.sub(r"[^a-zA-Z\s]", "", text.lower())
    encoded = _tokenizer(
        cleaned,
        padding="max_length",
        truncation=True,
        max_length=128,
        return_tensors="pt",
    )
    input_ids = encoded["input_ids"].to(target)
    attention = encoded["attention_mask"].to(target)

    with torch.no_grad():
        raw_scores = _roberta_model(input_ids, attention)

    probabilities = F.softmax(raw_scores, dim=-1)
    return probabilities.cpu().numpy()[0]


# ── Public inference entry point ─────────────────────────────────────────────

def predict(
    psychometric_row: dict | None = None,
    text_note: str | None         = None,
    psycho_weight: float          = 0.5,
) -> dict:
    """
    Run stress prediction.

    Each available modality yields an expected severity (class probabilities
    weighted by PSYCHO_SCORE / TEXT_SCORE). When both are available they are
    blended with ``psycho_weight``; when only one is, its score is used as-is.
    A failure inside one branch is logged and that modality is treated as
    unavailable rather than aborting the call.

    In DEMO mode the call is delegated to ``_demo_predict`` and
    ``psycho_weight`` is not used.

    Parameters
    ----------
    psychometric_row : dict mapping feature-name → value (or None)
    text_note        : raw text string (or None)
    psycho_weight    : weight given to psychometric score [0..1];
                       values outside that range are clamped

    Returns
    -------
    dict with keys:
        psycho_label, psycho_score, text_label, text_score,
        fused_label, fused_score, modality_used
    """
    if DEMO_MODE:
        return _demo_predict(psychometric_row, text_note)

    # Keep the blend a convex combination; an out-of-range weight would push
    # the fused score outside [0, 1] and into the wrong severity bin.
    psycho_weight = min(1.0, max(0.0, float(psycho_weight)))

    psycho_score = None
    psycho_label = None
    text_score   = None
    text_label   = None

    # ── Psychometric branch ─────────────────────────────────────────────────
    if psychometric_row and _psycho_model is not None:
        try:
            df_row = pd.DataFrame([psychometric_row])
            proba  = _preprocess_psycho(df_row)[0]          # shape (3,)
            classes = list(_le_target.classes_)
            psycho_score = sum(PSYCHO_SCORE[c] * p
                               for c, p in zip(classes, proba))
            psycho_label = classes[int(np.argmax(proba))]
        except Exception:
            # Best-effort: fall back to the other modality, keep the traceback.
            logger.exception("Psychometric inference error")
            psycho_score = None
            psycho_label = None

    # ── Text branch ─────────────────────────────────────────────────────────
    if text_note and text_note.strip() and _roberta_model is not None:
        try:
            proba      = _get_text_proba(text_note)          # shape (7,)
            text_score = sum(TEXT_SCORE[c] * p
                             for c, p in zip(TEXT_CLASSES, proba))
            text_label = TEXT_CLASSES[int(np.argmax(proba))]
        except Exception:
            logger.exception("Text inference error")
            text_score = None
            text_label = None

    # ── Fusion ───────────────────────────────────────────────────────────────
    tw = 1.0 - psycho_weight

    if psycho_score is not None and text_score is not None:
        fused_score   = psycho_weight * psycho_score + tw * text_score
        modality_used = "both"
    elif psycho_score is not None:
        fused_score   = float(psycho_score)
        modality_used = "psycho"
    elif text_score is not None:
        fused_score   = float(text_score)
        modality_used = "text"
    else:
        fused_score   = 0.0
        modality_used = "none"

    fused_label = _score_to_fused(fused_score)

    return {
        "psycho_label":  psycho_label,
        "psycho_score":  round(float(psycho_score), 4) if psycho_score is not None else None,
        "text_label":    text_label,
        "text_score":    round(float(text_score), 4) if text_score is not None else None,
        "fused_label":   fused_label,
        "fused_score":   round(float(fused_score), 4),
        "modality_used": modality_used,
    }


# ── Demo / fallback (no weights needed) ─────────────────────────────────────

def _demo_predict(psychometric_row, text_note) -> dict:
    """Rule-based heuristic demo prediction — no ML weights required.

    Mirrors the result contract of ``predict``: a modality that was not
    supplied reports ``None`` for its label and score, and ``modality_used``
    is "both", "psycho", "text" or "none" accordingly. When both modalities
    are present they are averaged with equal weight.

    Parameters
    ----------
    psychometric_row : dict of questionnaire answers (or None/empty)
    text_note        : free-text note (or None/blank)

    Returns
    -------
    dict with keys:
        psycho_label, psycho_score, text_label, text_score,
        fused_label, fused_score, modality_used
    """

    def _num(key, default):
        # Missing, None or non-numeric answers fall back to the neutral
        # default instead of crashing the demo.
        try:
            return float(psychometric_row.get(key, default))
        except (TypeError, ValueError):
            return float(default)

    # ── Psychometric heuristic ──────────────────────────────────────────────
    psycho_score = None
    psycho_label = None
    if psychometric_row:
        sh  = _num("Sleep_Duration", 7)
        pa  = _num("Physical_Activity", 3)
        wh  = _num("Work_Hours", 8)
        caf = _num("Caffeine_Intake", 2)
        alc = _num("Alcohol_Intake", 0)
        slq = _num("Sleep_Quality", 3)

        score = 0.3                                # baseline: mild
        score += max(0, (8 - sh) * 0.04)          # less sleep → more stress
        score += max(0, (wh - 8) * 0.03)          # overwork
        score -= pa * 0.02                         # activity lowers stress
        score += caf * 0.01
        score += alc * 0.02
        score -= slq * 0.015
        psycho_score = float(np.clip(score, 0, 1))
        psycho_label = ("High" if psycho_score >= 0.6
                        else "Medium" if psycho_score >= 0.3 else "Low")

    # ── Text heuristic ──────────────────────────────────────────────────────
    # keyword stem → (text class, severity). The class is stated explicitly
    # because TEXT_CLASSES is alphabetical, not ordered by severity, so a
    # score cannot be turned into a class by indexing into it.
    KEYWORDS = {
        "suicid":    ("Suicidal",   0.95),
        "depress":   ("Depression", 0.78),
        "anxiet":    ("Anxiety",    0.68),
        "panic":     ("Anxiety",    0.65),
        "overwhelm": ("Stress",     0.55),
        "exhaust":   ("Stress",     0.50),
        "stress":    ("Stress",     0.45),
        "tired":     ("Stress",     0.40),
        "fine":      ("Normal",     0.10),
        "happy":     ("Normal",     0.05),
    }

    text_score = None
    text_label = None
    if text_note and text_note.strip():
        t = text_note.lower()
        hits = [entry for kw, entry in KEYWORDS.items() if kw in t]
        if hits:
            # The most severe matching keyword decides; calm keywords only
            # win when nothing more serious was mentioned.
            text_label, ts = max(hits, key=lambda entry: entry[1])
        else:
            text_label, ts = "Normal", 0.3
        text_score = float(np.clip(ts, 0, 1))

    # ── Fusion (same branching as predict) ──────────────────────────────────
    if psycho_score is not None and text_score is not None:
        fused = 0.5 * psycho_score + 0.5 * text_score
        modality = "both"
    elif psycho_score is not None:
        fused = psycho_score
        modality = "psycho"
    elif text_score is not None:
        fused = text_score
        modality = "text"
    else:
        fused = 0.0
        modality = "none"

    return {
        "psycho_label":  psycho_label,
        "psycho_score":  round(psycho_score, 4) if psycho_score is not None else None,
        "text_label":    text_label,
        "text_score":    round(text_score, 4) if text_score is not None else None,
        "fused_label":   _score_to_fused(fused),
        "fused_score":   round(fused, 4),
        "modality_used": modality,
    }