Spaces:
Sleeping
Sleeping
File size: 7,849 Bytes
53b92fc 85d3bd6 53b92fc | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 | """
utils/scenario_engine_ng.py โ Version 2.1
Upgrades:
โข Reads feature_metadata.json for range/type validation
โข Clips values automatically to safe ranges
โข Adds structured error handling & clean audit output
"""
import re
import json
import pandas as pd
from typing import List, Dict, Any, Tuple, Optional
from utils.models import load_model # your existing loader
# ------------------------------------------------------------
# ๐ Load Feature Metadata
# ------------------------------------------------------------
def _load_metadata(path: str = "data/feature_metadata.json") -> Dict[str, Any]:
try:
with open(path, "r") as f:
return json.load(f)
except FileNotFoundError:
print("โ ๏ธ Metadata file not found, proceeding without validation.")
return {}
except Exception as e:
print(f"โ ๏ธ Could not read metadata: {e}")
return {}
# Module-level cache of per-column metadata (min/max ranges), loaded once at
# import time; {} when data/feature_metadata.json is missing or unreadable.
FEATURE_META = _load_metadata()
# ------------------------------------------------------------
# ๐ข Regex helpers for numeric parsing
# ------------------------------------------------------------
PCT_RE = re.compile(r"^\s*([+-]?\d+(\.\d+)?)\s*%\s*$")
NUM_RE = re.compile(r"^\s*([+-]?\d+(\.\d+)?)\s*$")
def _parse_value(v: Any) -> Tuple[str, float]:
"""Parse a string like '+10%', '-5', or '1.2' โ ('percent'|'absolute', number)"""
s = str(v).strip().lower()
if m := PCT_RE.match(s):
return ("percent", float(m.group(1)) / 100.0)
if m := NUM_RE.match(s):
return ("absolute", float(m.group(1)))
# last-resort float extraction
nums = re.findall(r"[-+]?\d*\.\d+|[-+]?\d+", s)
if nums:
return ("absolute", float(nums[0]))
raise ValueError(f"Unsupported value format: {v!r}")
# ------------------------------------------------------------
# ๐งฎ Validation Helpers
# ------------------------------------------------------------
def _ensure_numeric(df: pd.DataFrame, col: str):
if col not in df.columns:
raise KeyError(f"Column '{col}' not in dataset.")
if not pd.api.types.is_numeric_dtype(df[col]):
raise TypeError(f"Column '{col}' must be numeric; got dtype {df[col].dtype}.")
def _subset(df: pd.DataFrame, where: Optional[str]) -> pd.Index:
if not where:
return df.index
try:
return df.query(where).index
except Exception as e:
raise ValueError(f"Invalid filter: {where!r} โ {e}")
def _apply_metadata_limits(df: pd.DataFrame, col: str):
"""Clip column values based on metadata min/max."""
meta = FEATURE_META.get(col)
if not meta or "min" not in meta or "max" not in meta:
return
before = df[col].copy()
df[col] = df[col].clip(lower=meta["min"], upper=meta["max"])
if not before.equals(df[col]):
print(f"๐ '{col}' clipped to range [{meta['min']}, {meta['max']}]")
return df
# ------------------------------------------------------------
# โ๏ธ Apply a Single Operation
# ------------------------------------------------------------
def _apply_op(df: pd.DataFrame, op: Dict[str, Any]) -> Dict[str, Any]:
    """
    Apply a single operation {'op','col','value','where?','min?','max?'} to `df` in place.

    Supported ops:
      - 'scale': multiply the selection ('+10%' -> factor 1.1; bare number = factor)
      - 'shift': add an absolute amount, or a per-row percent of the current value
      - 'set'  : overwrite the selection with an absolute numeric value
      - 'clip' : bound the selection to [min, max]
    After every op the whole column is re-clipped against FEATURE_META as a
    safety net.

    Returns a small audit dict describing the change.
    Raises ValueError/KeyError/TypeError on bad ops, columns, filters, or values.
    """
    kind = op.get("op")
    col = op.get("col")
    val = op.get("value")
    where = op.get("where")
    if kind not in {"scale", "shift", "set", "clip"}:
        raise ValueError(f"Unsupported op '{kind}'.")
    if not col:
        raise ValueError("Missing 'col' in operation.")
    idx = _subset(df, where)
    _ensure_numeric(df, col)
    # ----- Clip -----
    if kind == "clip":
        min_v = op.get("min")
        max_v = op.get("max")
        before = df.loc[idx, col].copy()
        df.loc[idx, col] = df.loc[idx, col].clip(lower=min_v, upper=max_v)
        _apply_metadata_limits(df, col)
        return {
            "op": kind,
            "col": col,
            "where": where,
            "count": len(idx),
            "min": min_v,
            "max": max_v,
            # Fix: guard the empty selection — mean() of an empty Series is
            # NaN; the scale/shift/set branch already reports None instead.
            "delta_mean": float(df.loc[idx, col].mean() - before.mean()) if len(idx) else None,
        }
    # ----- Scale / Shift / Set -----
    mode, num = _parse_value(val)
    before = df.loc[idx, col].copy()
    if kind == "scale":
        factor = (1.0 + num) if mode == "percent" else float(num)
        # NOTE(review): casting back to the original dtype truncates when an
        # integer column is scaled by a fractional factor — presumably meant
        # to preserve the column's schema; confirm with data owners.
        df.loc[idx, col] = (df.loc[idx, col].astype(float) * factor).astype(df[col].dtype)
    elif kind == "shift":
        # Absolute shift adds a constant; percent shift adds pct of each row's value.
        shift = num if mode == "absolute" else df.loc[idx, col] * num
        df.loc[idx, col] += shift
    elif kind == "set":
        if mode != "absolute":
            raise ValueError("For 'set', provide a numeric value (e.g., 3.2), not a percent.")
        df.loc[idx, col] = float(num)
    # Apply metadata clipping (safety net)
    _apply_metadata_limits(df, col)
    after = df.loc[idx, col]
    return {
        "op": kind,
        "col": col,
        "where": where,
        "value": val,
        "count": len(idx),
        "before_mean": float(before.mean()) if len(before) else None,
        "after_mean": float(after.mean()) if len(after) else None,
        "delta_mean": float(after.mean() - before.mean()) if len(after) else None,
    }
# ------------------------------------------------------------
# ๐ Model Feature Utilities
# ------------------------------------------------------------
def _expected_features(model, df: pd.DataFrame, target: str = "churn") -> List[str]:
"""Return model features using .feature_names_in_ or numeric fallback."""
names = getattr(model, "feature_names_in_", None)
if names is not None and len(names):
return list(names)
numeric_cols = list(df.select_dtypes(include="number").columns)
bad = {target, "userid", "user_id", "id", "label"}
return [c for c in numeric_cols if c not in bad]
# ------------------------------------------------------------
# ๐ Public Simulation API
# ------------------------------------------------------------
def simulate_plan(
    plan: List[Dict[str, Any]],
    data_path: str = "data/data_randomforest.csv",
    model_path: str = "app_best.joblib",
    target_col: str = "churn",
) -> Dict[str, Any]:
    """
    Apply a list of generic operations to the dataset, then recompute churn with the trained model.

    Parameters
    ----------
    plan : list of op dicts understood by _apply_op
        (each: {'op','col','value','where?','min?','max?'}).
    data_path : CSV file loaded into the working DataFrame.
    model_path : path handed to utils.models.load_model.
    target_col : label column excluded from the numeric-feature fallback.

    Returns
    -------
    dict with 'summary' and 'df' always present; on success also 'audit',
    'metrics', and 'model_features_used'. Errors never propagate: each phase
    returns early with a warning summary plus whatever partial state exists.
    """
    df = pd.read_csv(data_path)
    model = load_model(model_path)
    feats = _expected_features(model, df, target=target_col)
    # --- Baseline ---
    # predict_proba[:, 1] is the positive-class probability; the mean over
    # all rows, times 100, gives the dataset-level churn rate in percent.
    try:
        X0 = df[feats]
        base_prob = model.predict_proba(X0)[:, 1]
        base_rate = float(base_prob.mean() * 100)
    except Exception as e:
        return {"summary": f"โ ๏ธ Baseline prediction error: {e}", "df": df}
    # --- Apply Ops ---
    # Ops mutate df in place; audit keeps one record per applied op, so a
    # failure mid-plan still reports the ops that did succeed (and df keeps
    # their partial effects).
    audit = []
    try:
        for i, op in enumerate(plan, 1):
            res = _apply_op(df, op)
            res["index"] = i  # 1-based position of the op within the plan
            audit.append(res)
    except Exception as e:
        return {"summary": f"โ ๏ธ Plan application error: {e}", "df": df, "audit": audit}
    # --- Post-change Predictions ---
    # Same feature set as the baseline, evaluated on the mutated DataFrame.
    try:
        X1 = df[feats]
        new_prob = model.predict_proba(X1)[:, 1]
        new_rate = float(new_prob.mean() * 100)
    except Exception as e:
        return {"summary": f"โ ๏ธ Post-change prediction error: {e}", "df": df, "audit": audit}
    # --- Summary ---
    # Both rates are percentages, so delta is in percentage points.
    delta = new_rate - base_rate
    dir_emoji = "๐" if delta < 0 else "๐" if delta > 0 else "โ"
    summary = (
        f"{dir_emoji} Churn changed from {base_rate:.2f}% โ {new_rate:.2f}% "
        f"({delta:+.2f} pts) after applying {len(plan)} operation(s)."
    )
    return {
        "summary": summary,
        "df": df,
        "audit": audit,
        "metrics": {
            "baseline_churn_rate": base_rate,
            "new_churn_rate": new_rate,
            "delta_churn_rate": delta,
        },
        "model_features_used": feats,
    }
|