GoshawkVortexAI commited on
Commit
0fd33e0
Β·
verified Β·
1 Parent(s): 96d72eb

Create model_backend.py

Browse files
Files changed (1) hide show
  1. model_backend.py +163 -0
model_backend.py ADDED
@@ -0,0 +1,163 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ model_backend.py β€” Gradient boosting abstraction for LightGBM / sklearn HGBM.
3
+
4
+ LightGBM (preferred):
5
+ pip install lightgbm
6
+ Set USE_LIGHTGBM = True below.
7
+
8
+ Fallback: sklearn HistGradientBoostingClassifier.
9
+ Same algorithm family, native NaN support, comparable speed.
10
+ Feature importances use permutation importance (val set).
11
+
12
+ Interface is identical regardless of backend:
13
+ .fit() β†’ trains + calibrates
14
+ .predict_win_prob() β†’ P(win) per row
15
+ .feature_importances_ β†’ normalized importance array
16
+ """
17
+
18
+ import numpy as np
19
+
20
+ try:
21
+ import lightgbm as lgb
22
+ _LGBM_AVAILABLE = True
23
+ except ImportError:
24
+ _LGBM_AVAILABLE = False
25
+
26
+ from sklearn.ensemble import HistGradientBoostingClassifier
27
+ from sklearn.calibration import CalibratedClassifierCV
28
+ from sklearn.inspection import permutation_importance
29
+
30
+ USE_LIGHTGBM = False # Set True after: pip install lightgbm
31
+
32
+
33
def _build_lgbm(p: dict):
    """Construct an un-fitted LGBMClassifier from the shared param dict *p*.

    Keys mirror the sklearn HGBM names used by ``_build_hgbm`` so one config
    dict drives either backend; missing keys fall back to the same defaults.
    """
    return lgb.LGBMClassifier(
        n_estimators=p.get("n_estimators", 400),
        learning_rate=p.get("learning_rate", 0.03),
        max_depth=p.get("max_depth", 5),
        min_child_samples=p.get("min_samples_leaf", 40),
        reg_lambda=p.get("l2_regularization", 2.0),
        # Use the sklearn-API name. `feature_fraction` is the Booster-param
        # alias of colsample_bytree; passing it through **kwargs triggers
        # alias-conflict warnings and undefined precedence in LightGBM.
        colsample_bytree=p.get("max_features", 0.70),
        subsample=0.80,
        subsample_freq=1,
        n_jobs=-1,
        random_state=p.get("random_state", 42),
        verbosity=-1,
        objective="binary",
        metric="binary_logloss",
        # NOTE(review): LightGBM early stopping requires an eval_set at
        # fit() time and raises without one — callers must supply X_val
        # (ModelBackend.fit only adds eval_set when X_val is not None).
        early_stopping_rounds=p.get("early_stopping_rounds", 30),
    )
50
+
51
+
52
def _build_hgbm(p: dict):
    """Construct an un-fitted HistGradientBoostingClassifier from *p*.

    Unknown keys in *p* are ignored; absent keys fall back to the same
    defaults used by the LightGBM builder, so a single param dict can
    drive either backend.
    """
    defaults = {
        "n_estimators": 400,
        "learning_rate": 0.03,
        "max_depth": 5,
        "min_samples_leaf": 40,
        "l2_regularization": 2.0,
        "max_features": 0.70,
        "validation_fraction": 0.15,
        "n_iter_no_change": 30,
        "random_state": 42,
    }
    cfg = {key: p.get(key, fallback) for key, fallback in defaults.items()}
    return HistGradientBoostingClassifier(
        max_iter=cfg["n_estimators"],
        learning_rate=cfg["learning_rate"],
        max_depth=cfg["max_depth"],
        min_samples_leaf=cfg["min_samples_leaf"],
        l2_regularization=cfg["l2_regularization"],
        max_features=cfg["max_features"],
        early_stopping=True,
        validation_fraction=cfg["validation_fraction"],
        n_iter_no_change=cfg["n_iter_no_change"],
        random_state=cfg["random_state"],
        verbose=0,
    )
66
+
67
+
68
class ModelBackend:
    """
    Unified binary classifier over LightGBM / sklearn HGBM.

    After fit():
        .predict_proba(X)     -> (N, 2) array
        .predict_win_prob(X)  -> (N,) array of P(win)
        .feature_importances_ -> (n_features,) normalized importances
        .n_iter_              -> actual boosting rounds used
    """

    def __init__(self, params: dict, calibrate: bool = True):
        self.params = params
        self.calibrate = calibrate
        self._base = None    # raw fitted booster
        self._model = None   # booster, or calibrated wrapper when calibration ran
        self.feature_importances_: np.ndarray = np.array([])
        self.n_iter_: int = 0
        self._backend_name = "lightgbm" if (USE_LIGHTGBM and _LGBM_AVAILABLE) else "hgbm"

    @property
    def backend_name(self) -> str:
        """Active backend: ``"lightgbm"`` or ``"hgbm"``."""
        return self._backend_name

    def fit(
        self,
        X_train: np.ndarray,
        y_train: np.ndarray,
        X_val: np.ndarray = None,
        y_val: np.ndarray = None,
        sample_weight: np.ndarray = None,
    ) -> "ModelBackend":
        """Train the booster, optionally calibrate on (X_val, y_val).

        Returns self for chaining. Calibration only runs when a validation
        set with both classes and at least 50 rows is supplied.
        """
        sw = sample_weight

        if self._backend_name == "lightgbm":
            self._base = _build_lgbm(self.params)
            kw = {}
            if X_val is not None:
                kw["eval_set"] = [(X_val, y_val)]
            if sw is not None:
                kw["sample_weight"] = sw
            self._base.fit(X_train, y_train, **kw)
            # best_iteration_ is None when early stopping never triggered;
            # int(None) would raise, so fall back to the configured rounds.
            best_it = getattr(self._base, "best_iteration_", None)
            self.n_iter_ = int(best_it) if best_it else self.params.get("n_estimators", 400)
        else:
            self._base = _build_hgbm(self.params)
            kw = {}
            if sw is not None:
                kw["sample_weight"] = sw
            self._base.fit(X_train, y_train, **kw)
            self.n_iter_ = int(getattr(self._base, "n_iter_", self.params.get("n_estimators", 400)))

        # Isotonic calibration on the val set (improves probability reliability).
        # The fitted booster must NOT be refit here: with an integer cv,
        # CalibratedClassifierCV clones and retrains the estimator on the
        # small validation slice only, silently discarding the training data.
        if (self.calibrate and X_val is not None and y_val is not None and
                len(X_val) >= 50 and len(np.unique(y_val)) == 2):
            try:
                # sklearn >= 1.6: freeze the fitted booster so it is reused as-is.
                from sklearn.frozen import FrozenEstimator
                cal = CalibratedClassifierCV(FrozenEstimator(self._base), method="isotonic")
            except ImportError:
                # Older sklearn: cv="prefit" calibrates the already-fitted model.
                cal = CalibratedClassifierCV(self._base, method="isotonic", cv="prefit")
            cal.fit(X_val, y_val)
            self._model = cal
        else:
            self._model = self._base

        # Feature importances (native for LightGBM, permutation for HGBM)
        self._compute_importances(X_val, y_val)
        return self

    def _compute_importances(self, X_val: np.ndarray = None, y_val: np.ndarray = None):
        """Populate .feature_importances_ (normalized to sum to 1)."""
        base = self._base
        if base is None:
            return

        # LightGBM exposes feature_importances_ directly
        if hasattr(base, "feature_importances_"):
            imp = np.array(base.feature_importances_, dtype=np.float64)
        # HGBM: permutation importance on the val set (needs labels too)
        elif X_val is not None and y_val is not None and len(X_val) >= 20:
            result = permutation_importance(
                base, X_val, y_val,
                n_repeats=5,
                random_state=42,
                n_jobs=-1,
            )
            # Negative means can occur by chance; clamp so weights stay valid.
            imp = np.maximum(result.importances_mean, 0.0)
        else:
            # Fallback: uniform importances
            n_feat = getattr(base, "n_features_in_", 1)
            imp = np.ones(n_feat, dtype=np.float64)

        # Normalize to sum to 1 (leave untouched if all-zero)
        total = imp.sum()
        self.feature_importances_ = imp / total if total > 0 else imp

    def predict_proba(self, X: np.ndarray) -> np.ndarray:
        """Return an (N, 2) array of class probabilities.

        Raises RuntimeError if called before .fit().
        """
        if self._model is None:
            raise RuntimeError("Call .fit() before .predict_proba().")
        return self._model.predict_proba(X)

    def predict_win_prob(self, X: np.ndarray) -> np.ndarray:
        """Return 1-D array of P(win) for each row."""
        return self.predict_proba(X)[:, 1]