GoshawkVortexAI committed on
Commit
f952974
·
verified ·
1 Parent(s): 47584e0

Create feature_builder.py

Browse files
Files changed (1) hide show
  1. feature_builder.py +128 -0
feature_builder.py ADDED
@@ -0,0 +1,128 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ feature_builder.py — Converts raw rule-engine output dicts into a clean
3
+ feature vector for the ML model. Single responsibility: no model logic here.
4
+
5
+ Design decisions:
6
+ - All bool features cast to int (0/1) — LGBM handles natively but this
7
+ keeps the matrix dtype homogeneous.
8
+ - Engineered interaction terms computed here, not in regime/volume modules,
9
+ to keep those modules free of ML concerns.
10
+ - Returns a dict (for inference) or DataFrame row (for training).
11
+ - FEATURE_COLUMNS from ml_config defines the canonical order — any missing
12
+ feature raises KeyError immediately rather than silently producing NaN.
13
+ """
14
+
15
+ import math
16
+ from typing import Dict, Any
17
+
18
+ import numpy as np
19
+ import pandas as pd
20
+
21
+ from ml_config import FEATURE_COLUMNS
22
+
23
+
24
def build_feature_dict(
    regime_data: Dict[str, Any],
    volume_data: Dict[str, Any],
    scores: Dict[str, Any],
) -> Dict[str, float]:
    """
    Assemble the model feature mapping from the three rule-engine output dicts.

    Every input is looked up with ``.get`` and coerced with ``float``, ``int``
    or ``bool``, so a key that is absent from its source dict falls back to a
    default (1.0 for ``vol_ratio``; 0.0, 0 or False otherwise) instead of
    raising.

    Args:
        regime_data: source of the ADX/DI, ATR, volatility, distance and
            regime-confidence values.
        volume_data: source of the volume flags, counts and OBV slope.
        scores: source of the five ``*_score`` values.

    Returns:
        A dict holding exactly the keys in FEATURE_COLUMNS, in that order.

    Raises:
        KeyError: if FEATURE_COLUMNS names a feature that is not computed here.
    """

    def _num(source: Dict[str, Any], key: str, default: float = 0.0) -> float:
        # Numeric input coerced to a plain Python float.
        return float(source.get(key, default))

    def _flag(source: Dict[str, Any], key: str) -> int:
        # Truthy/falsy input collapsed to 1/0.
        return int(bool(source.get(key, False)))

    def _count(source: Dict[str, Any], key: str) -> int:
        # Integer-valued input coerced to a plain Python int.
        return int(source.get(key, 0))

    # Trend strength and directional movement.
    trend_strength = _num(regime_data, "adx")
    plus_di = _num(regime_data, "di_plus")
    minus_di = _num(regime_data, "di_minus")
    # The 1e-9 term keeps the denominator non-zero when both DI values are 0.
    di_total = plus_di + minus_di + 1e-9
    di_spread = plus_di - minus_di
    plus_share = plus_di / di_total

    # Volatility state.
    atr_fraction = _num(regime_data, "atr_pct")
    volatility_ratio = _num(regime_data, "vol_ratio", 1.0)
    is_compressed = _flag(regime_data, "vol_compressed")
    is_expanding = _flag(regime_data, "vol_expanding")
    is_expanding_from_base = _flag(regime_data, "vol_expanding_from_base")

    # Volume behaviour.
    has_absorption = _flag(volume_data, "absorption")
    has_failed_breakout = _flag(volume_data, "failed_breakout")
    failed_count = _count(volume_data, "recent_failed_count")
    obv_slope = _num(volume_data, "obv_slope_norm")
    delta_direction = _count(volume_data, "delta_sign")
    has_spike = _flag(volume_data, "spike")
    has_climax = _flag(volume_data, "climax")

    # Signed distance and its magnitude.
    distance = _num(regime_data, "dist_atr")
    distance_magnitude = abs(distance)

    # Rule-engine confidence and scores.
    regime_conf = _num(regime_data, "regime_confidence")
    score_regime = _num(scores, "regime_score")
    score_volume = _num(scores, "volume_score")
    score_structure = _num(scores, "structure_score")
    score_confidence = _num(scores, "confidence_score")
    score_total = _num(scores, "total_score")

    features = {
        "adx": trend_strength,
        "di_plus": plus_di,
        "di_minus": minus_di,
        "di_diff": di_spread,
        "di_ratio": plus_share,
        "atr_pct": atr_fraction,
        "vol_ratio": volatility_ratio,
        "vol_compressed": is_compressed,
        "vol_expanding": is_expanding,
        "vol_expanding_from_base": is_expanding_from_base,
        "absorption": has_absorption,
        "failed_breakout": has_failed_breakout,
        "recent_failed_count": failed_count,
        "obv_slope_norm": obv_slope,
        "delta_sign": delta_direction,
        "spike": has_spike,
        "climax": has_climax,
        "dist_atr": distance,
        "dist_atr_abs": distance_magnitude,
        "regime_confidence": regime_conf,
        "regime_score": score_regime,
        "volume_score": score_volume,
        "structure_score": score_structure,
        "confidence_score": score_confidence,
        "total_score": score_total,
        # Multiplicative interaction terms.
        "adx_x_regime": trend_strength * score_regime,
        "vol_x_obv": volatility_ratio * obv_slope,
        "score_x_conf": score_total * regime_conf,
    }

    # Fail loudly if the configured column list asks for something not built.
    missing = set(FEATURE_COLUMNS) - set(features)
    if missing:
        raise KeyError(f"Missing features: {missing}")

    # Emit the features in the configured column order.
    return {name: features[name] for name in FEATURE_COLUMNS}
107
+
108
+
109
def feature_dict_to_row(feat: Dict[str, float]) -> pd.Series:
    """
    Wrap a feature dict in a pandas Series indexed in FEATURE_COLUMNS order.

    Raises:
        KeyError: if ``feat`` lacks any name listed in FEATURE_COLUMNS.
    """
    ordered = {name: feat[name] for name in FEATURE_COLUMNS}
    return pd.Series(ordered)
112
+
113
+
114
def feature_dict_to_matrix(feat: Dict[str, float]) -> np.ndarray:
    """
    Turn one feature dict into a float64 array holding a single row.

    Values are laid out in FEATURE_COLUMNS order, giving an array of shape
    (1, len(FEATURE_COLUMNS)).

    Raises:
        KeyError: if ``feat`` lacks any name listed in FEATURE_COLUMNS.
    """
    row = [feat[name] for name in FEATURE_COLUMNS]
    return np.array([row], dtype=np.float64)
120
+
121
+
122
def validate_features(feat: Dict[str, float]) -> bool:
    """
    Return True only if every FEATURE_COLUMNS entry is present and finite.

    A feature fails validation when it is absent or None, when it is NaN or
    +/-inf, or when it cannot be interpreted as a real number at all. The
    finiteness check is applied to every value rather than only to built-in
    ``float`` instances, so a non-finite value held in another numeric type
    is rejected as well.

    Args:
        feat: mapping of feature name to value.

    Returns:
        True if all features listed in FEATURE_COLUMNS pass; False otherwise.
    """
    for name in FEATURE_COLUMNS:
        value = feat.get(name)
        if value is None:
            return False
        try:
            finite = math.isfinite(value)
        except (TypeError, ValueError, OverflowError):
            # Not a real number, or not representable as a float.
            return False
        if not finite:
            return False
    return True