File size: 7,849 Bytes
53b92fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
85d3bd6
53b92fc
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
"""
utils/scenario_engine_ng.py  โ€”  Version 2.1
Upgrades:
โ€ข Reads feature_metadata.json for range/type validation
โ€ข Clips values automatically to safe ranges
โ€ข Adds structured error handling & clean audit output
"""

import re
import json
import pandas as pd
from typing import List, Dict, Any, Tuple, Optional
from utils.models import load_model  # your existing loader


# ------------------------------------------------------------
# ๐Ÿ”– Load Feature Metadata
# ------------------------------------------------------------
def _load_metadata(path: str = "data/feature_metadata.json") -> Dict[str, Any]:
    try:
        with open(path, "r") as f:
            return json.load(f)
    except FileNotFoundError:
        print("โš ๏ธ Metadata file not found, proceeding without validation.")
        return {}
    except Exception as e:
        print(f"โš ๏ธ Could not read metadata: {e}")
        return {}


# Module-level cache of feature metadata, e.g. {"age": {"min": 0, "max": 120}}.
# Empty dict when data/feature_metadata.json is absent or unreadable.
FEATURE_META = _load_metadata()


# ------------------------------------------------------------
# ๐Ÿ”ข Regex helpers for numeric parsing
# ------------------------------------------------------------
# Percent values such as "+10%" or "-2.5 %"; group(1) captures the number.
PCT_RE = re.compile(r"^\s*([+-]?\d+(\.\d+)?)\s*%\s*$")
# Plain signed integers/decimals such as "3", "-5", "1.2".
NUM_RE = re.compile(r"^\s*([+-]?\d+(\.\d+)?)\s*$")


def _parse_value(v: Any) -> Tuple[str, float]:
    """Classify a raw value as percent or absolute.

    '+10%' -> ('percent', 0.10); '-5' -> ('absolute', -5.0).
    Free-form text falls back to the first numeric token it contains.
    Raises ValueError when no number can be extracted at all.
    """
    text = str(v).strip().lower()

    pct_match = PCT_RE.match(text)
    if pct_match is not None:
        return ("percent", float(pct_match.group(1)) / 100.0)

    num_match = NUM_RE.match(text)
    if num_match is not None:
        return ("absolute", float(num_match.group(1)))

    # Last resort: pull the first numeric token out of arbitrary text.
    tokens = re.findall(r"[-+]?\d*\.\d+|[-+]?\d+", text)
    if tokens:
        return ("absolute", float(tokens[0]))

    raise ValueError(f"Unsupported value format: {v!r}")


# ------------------------------------------------------------
# ๐Ÿงฎ Validation Helpers
# ------------------------------------------------------------
def _ensure_numeric(df: pd.DataFrame, col: str):
    if col not in df.columns:
        raise KeyError(f"Column '{col}' not in dataset.")
    if not pd.api.types.is_numeric_dtype(df[col]):
        raise TypeError(f"Column '{col}' must be numeric; got dtype {df[col].dtype}.")


def _subset(df: pd.DataFrame, where: Optional[str]) -> pd.Index:
    if not where:
        return df.index
    try:
        return df.query(where).index
    except Exception as e:
        raise ValueError(f"Invalid filter: {where!r} โ†’ {e}")


def _apply_metadata_limits(df: pd.DataFrame, col: str):
    """Clip df[col] in place to the [min, max] range declared in FEATURE_META.

    No-op when the column has no metadata entry or the entry lacks either
    bound. Mutates *df* in place and always returns it (previously the
    no-metadata path returned None while the clip path returned the frame,
    which made the return value unreliable for chaining).
    """
    meta = FEATURE_META.get(col)
    if not meta or "min" not in meta or "max" not in meta:
        return df
    before = df[col].copy()
    df[col] = df[col].clip(lower=meta["min"], upper=meta["max"])
    if not before.equals(df[col]):
        # Announce the safety clamp so scenario results are explainable.
        print(f"๐Ÿ”’ '{col}' clipped to range [{meta['min']}, {meta['max']}]")
    return df


# ------------------------------------------------------------
# โš™๏ธ Apply a Single Operation
# ------------------------------------------------------------
def _apply_op(df: pd.DataFrame, op: Dict[str, Any]) -> Dict[str, Any]:
    """
    Apply a single operation {'op','col','value','where?','min?','max?'} in place.

    Supported ops:
      - 'scale': multiply (percent value means factor 1+pct; absolute is the factor)
      - 'shift': add (percent value means +pct of each row's current value)
      - 'set':   overwrite with an absolute number
      - 'clip':  bound values between op['min'] and op['max']

    Returns a small audit dict describing the change.
    Raises ValueError / KeyError / TypeError on malformed operations.
    """
    kind = op.get("op")
    col = op.get("col")
    val = op.get("value")
    where = op.get("where")

    if kind not in {"scale", "shift", "set", "clip"}:
        raise ValueError(f"Unsupported op '{kind}'.")
    if not col:
        raise ValueError("Missing 'col' in operation.")

    # Validate the column before evaluating the filter so a missing/non-numeric
    # column is reported as such even when the filter expression is also bad.
    _ensure_numeric(df, col)
    idx = _subset(df, where)

    # ----- Clip -----
    if kind == "clip":
        min_v = op.get("min")
        max_v = op.get("max")
        before = df.loc[idx, col].copy()
        df.loc[idx, col] = df.loc[idx, col].clip(lower=min_v, upper=max_v)
        _apply_metadata_limits(df, col)
        after = df.loc[idx, col]
        return {
            "op": kind,
            "col": col,
            "where": where,
            "count": len(idx),
            "min": min_v,
            "max": max_v,
            # Guard the empty subset: .mean() of an empty Series is NaN,
            # which previously leaked into the audit as float('nan').
            "delta_mean": float(after.mean() - before.mean()) if len(after) else None,
        }

    # ----- Scale / Shift / Set -----
    mode, num = _parse_value(val)
    before = df.loc[idx, col].copy()

    if kind == "scale":
        factor = (1.0 + num) if mode == "percent" else float(num)
        # Cast back so an integer column keeps its dtype after scaling.
        df.loc[idx, col] = (df.loc[idx, col].astype(float) * factor).astype(df[col].dtype)

    elif kind == "shift":
        # A percent shift is relative to each row's current value.
        shift = num if mode == "absolute" else df.loc[idx, col] * num
        df.loc[idx, col] += shift

    elif kind == "set":
        if mode != "absolute":
            raise ValueError("For 'set', provide a numeric value (e.g., 3.2), not a percent.")
        df.loc[idx, col] = float(num)

    # Apply metadata clipping (safety net)
    _apply_metadata_limits(df, col)

    after = df.loc[idx, col]
    return {
        "op": kind,
        "col": col,
        "where": where,
        "value": val,
        "count": len(idx),
        "before_mean": float(before.mean()) if len(before) else None,
        "after_mean": float(after.mean()) if len(after) else None,
        "delta_mean": float(after.mean() - before.mean()) if len(after) else None,
    }


# ------------------------------------------------------------
# ๐Ÿ“Š Model Feature Utilities
# ------------------------------------------------------------
def _expected_features(model, df: pd.DataFrame, target: str = "churn") -> List[str]:
    """Return model features using .feature_names_in_ or numeric fallback."""
    names = getattr(model, "feature_names_in_", None)
    if names is not None and len(names):
        return list(names)
    numeric_cols = list(df.select_dtypes(include="number").columns)
    bad = {target, "userid", "user_id", "id", "label"}
    return [c for c in numeric_cols if c not in bad]


# ------------------------------------------------------------
# ๐Ÿš€ Public Simulation API
# ------------------------------------------------------------
def simulate_plan(
    plan: List[Dict[str, Any]],
    data_path: str = "data/data_randomforest.csv",
    model_path: str = "app_best.joblib",
    target_col: str = "churn",
) -> Dict[str, Any]:
    """
    Run a what-if scenario: apply *plan* operations to the dataset, then
    re-score churn with the trained model and report the rate change.

    Returns a dict with 'summary', the mutated 'df', an 'audit' trail and,
    on success, 'metrics' plus the feature list fed to the model. Errors
    are reported inside 'summary' rather than raised.
    """
    frame = pd.read_csv(data_path)
    estimator = load_model(model_path)
    feature_cols = _expected_features(estimator, frame, target=target_col)

    # --- Baseline churn rate before any changes ---
    try:
        baseline_probs = estimator.predict_proba(frame[feature_cols])[:, 1]
        baseline_rate = float(baseline_probs.mean() * 100)
    except Exception as e:
        return {"summary": f"โš ๏ธ Baseline prediction error: {e}", "df": frame}

    # --- Apply each operation, collecting one audit entry per op ---
    audit: List[Dict[str, Any]] = []
    try:
        for position, operation in enumerate(plan, 1):
            entry = _apply_op(frame, operation)
            entry["index"] = position
            audit.append(entry)
    except Exception as e:
        return {"summary": f"โš ๏ธ Plan application error: {e}", "df": frame, "audit": audit}

    # --- Re-score with the mutated data ---
    try:
        scenario_probs = estimator.predict_proba(frame[feature_cols])[:, 1]
        scenario_rate = float(scenario_probs.mean() * 100)
    except Exception as e:
        return {"summary": f"โš ๏ธ Post-change prediction error: {e}", "df": frame, "audit": audit}

    # --- Human-readable summary ---
    delta = scenario_rate - baseline_rate
    if delta < 0:
        dir_emoji = "๐Ÿ“‰"
    elif delta > 0:
        dir_emoji = "๐Ÿ“ˆ"
    else:
        dir_emoji = "โž–"
    summary = (
        f"{dir_emoji} Churn changed from {baseline_rate:.2f}% โ†’ {scenario_rate:.2f}% "
        f"({delta:+.2f} pts) after applying {len(plan)} operation(s)."
    )

    return {
        "summary": summary,
        "df": frame,
        "audit": audit,
        "metrics": {
            "baseline_churn_rate": baseline_rate,
            "new_churn_rate": scenario_rate,
            "delta_churn_rate": delta,
        },
        "model_features_used": feature_cols,
    }