Create feature_builder.py
Browse files- feature_builder.py +128 -0
feature_builder.py
ADDED
|
@@ -0,0 +1,128 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
"""
|
| 2 |
+
feature_builder.py — Converts raw rule-engine output dicts into a clean
feature vector for the ML model. Single responsibility: no model logic here.

Design decisions:
- All bool features are cast to int (0/1) — LGBM handles bools natively, but
  this keeps the matrix dtype homogeneous.
- Engineered interaction terms are computed here, not in the regime/volume
  modules, to keep those modules free of ML concerns.
- Returns a dict (for inference) or a row for training (a pandas Series);
  a (1, n_features) NumPy array is also available for inference.
- FEATURE_COLUMNS from ml_config defines the canonical order — any column
  listed there that this module does not build raises KeyError immediately
  rather than silently producing NaN. Keys that are absent from the input
  dicts do not raise: they fall back to fixed defaults.
|
| 13 |
+
"""
|
| 14 |
+
|
| 15 |
+
import math
|
| 16 |
+
from typing import Dict, Any
|
| 17 |
+
|
| 18 |
+
import numpy as np
|
| 19 |
+
import pandas as pd
|
| 20 |
+
|
| 21 |
+
from ml_config import FEATURE_COLUMNS
|
| 22 |
+
|
| 23 |
+
|
| 24 |
+
def _read_float(source: Dict[str, Any], key: str, default: float = 0.0) -> float:
    """Fetch ``key`` from ``source`` (or ``default``) and coerce it to float."""
    return float(source.get(key, default))


def _read_int(source: Dict[str, Any], key: str) -> int:
    """Fetch ``key`` from ``source`` (0 when absent) and coerce it to int."""
    return int(source.get(key, 0))


def _read_flag(source: Dict[str, Any], key: str) -> int:
    """Fetch ``key`` from ``source`` (False when absent) as a 0/1 int."""
    return int(bool(source.get(key, False)))


def build_feature_dict(
    regime_data: Dict[str, Any],
    volume_data: Dict[str, Any],
    scores: Dict[str, Any],
) -> Dict[str, float]:
    """
    Assemble the model's feature dict from the three rule-engine outputs.

    Each input value is passed through ``float``/``int`` (flags through
    ``int(bool(...))``). Keys missing from an input dict fall back to a
    default: 1.0 for ``vol_ratio``, 0 / 0.0 / False for everything else.

    Args:
        regime_data: Regime-module output (ADX/DI, ATR, volatility flags,
            ``dist_atr``, ``regime_confidence``).
        volume_data: Volume-module output (absorption, breakout, OBV,
            delta, spike and climax fields).
        scores: Score dict holding the ``*_score`` entries.

    Returns:
        A new dict containing exactly the keys in FEATURE_COLUMNS, in that
        order.

    Raises:
        KeyError: If FEATURE_COLUMNS names a feature this function does not
            build.
    """
    feats: Dict[str, Any] = {}

    # --- Directional movement ---
    feats["adx"] = _read_float(regime_data, "adx")
    plus = _read_float(regime_data, "di_plus")
    minus = _read_float(regime_data, "di_minus")
    feats["di_plus"] = plus
    feats["di_minus"] = minus
    feats["di_diff"] = plus - minus
    # The 1e-9 epsilon keeps the ratio defined when both DI values are zero.
    feats["di_ratio"] = plus / (plus + minus + 1e-9)

    # --- Volatility ---
    feats["atr_pct"] = _read_float(regime_data, "atr_pct")
    feats["vol_ratio"] = _read_float(regime_data, "vol_ratio", 1.0)
    for flag_name in ("vol_compressed", "vol_expanding", "vol_expanding_from_base"):
        feats[flag_name] = _read_flag(regime_data, flag_name)

    # --- Volume behaviour ---
    feats["absorption"] = _read_flag(volume_data, "absorption")
    feats["failed_breakout"] = _read_flag(volume_data, "failed_breakout")
    feats["recent_failed_count"] = _read_int(volume_data, "recent_failed_count")
    feats["obv_slope_norm"] = _read_float(volume_data, "obv_slope_norm")
    feats["delta_sign"] = _read_int(volume_data, "delta_sign")
    feats["spike"] = _read_flag(volume_data, "spike")
    feats["climax"] = _read_flag(volume_data, "climax")

    # --- Distance feature (signed and magnitude) ---
    feats["dist_atr"] = _read_float(regime_data, "dist_atr")
    feats["dist_atr_abs"] = abs(feats["dist_atr"])

    # --- Rule-engine confidence and scores ---
    feats["regime_confidence"] = _read_float(regime_data, "regime_confidence")
    for score_name in (
        "regime_score",
        "volume_score",
        "structure_score",
        "confidence_score",
        "total_score",
    ):
        feats[score_name] = _read_float(scores, score_name)

    # --- Interaction terms: products so the model needs less depth ---
    feats["adx_x_regime"] = feats["adx"] * feats["regime_score"]
    feats["vol_x_obv"] = feats["vol_ratio"] * feats["obv_slope_norm"]
    feats["score_x_conf"] = feats["total_score"] * feats["regime_confidence"]

    # Fail loudly if the config expects a column that was not built above.
    absent = set(FEATURE_COLUMNS) - set(feats)
    if absent:
        raise KeyError(f"Missing features: {absent}")

    # Re-key in canonical order; built features not in the config are dropped.
    return {name: feats[name] for name in FEATURE_COLUMNS}
|
| 107 |
+
|
| 108 |
+
|
| 109 |
+
def feature_dict_to_row(feat: Dict[str, float]) -> pd.Series:
    """
    Turn a feature dict into a pandas Series indexed by FEATURE_COLUMNS.

    Entries are taken in FEATURE_COLUMNS order; keys of ``feat`` that are not
    listed there are ignored. Raises KeyError if a listed column is missing
    from ``feat``.
    """
    ordered: Dict[str, float] = {}
    for column in FEATURE_COLUMNS:
        ordered[column] = feat[column]
    return pd.Series(ordered)
|
| 112 |
+
|
| 113 |
+
|
| 114 |
+
def feature_dict_to_matrix(feat: Dict[str, float]) -> np.ndarray:
    """
    Pack one feature dict into a float64 array of shape (1, n_features).

    Columns follow FEATURE_COLUMNS order, which makes the result usable as a
    single-row inference input. Raises KeyError if a listed column is missing
    from ``feat``.
    """
    ordered_values = [feat[column] for column in FEATURE_COLUMNS]
    # Wrap the row in an outer list so the result is 2-D with a single row.
    return np.array([ordered_values], dtype=np.float64)
|
| 120 |
+
|
| 121 |
+
|
| 122 |
+
def validate_features(feat: Dict[str, float]) -> bool:
    """
    Return True if every canonical feature is present and finite.

    A feature is invalid when it is missing from ``feat``, is None, is
    NaN or +/-inf, or cannot be interpreted as a real number at all.

    The finiteness check is applied to every value, not only to built-in
    ``float`` instances, so NaN/inf carried by other numeric types (for
    example ``numpy.float32`` scalars, which do not subclass ``float``) is
    rejected as well.

    Args:
        feat: Feature dict, normally produced by ``build_feature_dict``.

    Returns:
        True when all FEATURE_COLUMNS entries are finite numbers, otherwise
        False. Never raises for bad values.
    """
    for name in FEATURE_COLUMNS:
        value = feat.get(name)
        if value is None:
            return False
        try:
            if not math.isfinite(value):
                return False
        except (TypeError, ValueError, OverflowError):
            # Not convertible to a float (e.g. a string), or an int too
            # large for one: it cannot go into the float64 model matrix.
            return False
    return True
|