Spaces:

GoshawkVortexAI
/

Goshawk_Hedge_Pro

Sleeping

App Files Files Community

GoshawkVortexAI committed on Feb 27

Commit

f952974

verified ·

1 Parent(s): 47584e0

Create feature_builder.py

Browse files

Files changed (1) hide show

feature_builder.py +128 -0

feature_builder.py ADDED Viewed

	@@ -0,0 +1,128 @@

+"""
+feature_builder.py — Converts raw rule-engine output dicts into a clean
+feature vector for the ML model. Single responsibility: no model logic here.
+Design decisions:
+- All bool features cast to int (0/1) — LGBM handles natively but this
+  keeps the matrix dtype homogeneous.
+- Engineered interaction terms computed here, not in regime/volume modules,
+  to keep those modules free of ML concerns.
+- Returns a dict (for inference) or DataFrame row (for training).
+- FEATURE_COLUMNS from ml_config defines the canonical order — any missing
+  feature raises KeyError immediately rather than silently producing NaN.
+"""
+import math
+from typing import Dict, Any
+import numpy as np
+import pandas as pd
+from ml_config import FEATURE_COLUMNS
def build_feature_dict(
    regime_data: Dict[str, Any],
    volume_data: Dict[str, Any],
    scores: Dict[str, Any],
) -> Dict[str, float]:
    """
    Assemble the model feature mapping from the three rule-engine outputs.

    Numeric inputs are coerced with ``float``/``int`` and flag inputs with
    ``int(bool(...))``, so the result holds plain Python numbers. A key that
    is absent from an input dict falls back to a default (0, 0.0 or False;
    1.0 for ``vol_ratio``) rather than raising.

    Args:
        regime_data: Source of the ADX/DI, ATR, volatility, distance and
            regime-confidence values.
        volume_data: Source of the volume flags, OBV slope and delta sign.
        scores: Source of the regime/volume/structure/confidence/total scores.

    Returns:
        A dict containing exactly the FEATURE_COLUMNS keys, in that order.

    Raises:
        KeyError: If FEATURE_COLUMNS names a feature that is not produced here.
    """
    def as_float(source, key, default=0.0):
        return float(source.get(key, default))

    def as_flag(source, key):
        return int(bool(source.get(key, False)))

    def as_int(source, key):
        return int(source.get(key, 0))

    # Directional-movement inputs and the values derived from them.
    trend_strength = as_float(regime_data, "adx")
    plus_di = as_float(regime_data, "di_plus")
    minus_di = as_float(regime_data, "di_minus")
    # The epsilon keeps the division defined when both DI values are zero.
    di_total = plus_di + minus_di + 1e-9
    di_spread = plus_di - minus_di
    di_share = plus_di / di_total

    # Volatility inputs.
    atr_fraction = as_float(regime_data, "atr_pct")
    volatility_ratio = as_float(regime_data, "vol_ratio", 1.0)
    is_compressed = as_flag(regime_data, "vol_compressed")
    is_expanding = as_flag(regime_data, "vol_expanding")
    is_expanding_from_base = as_flag(regime_data, "vol_expanding_from_base")

    # Volume inputs.
    has_absorption = as_flag(volume_data, "absorption")
    has_failed_breakout = as_flag(volume_data, "failed_breakout")
    failed_count = as_int(volume_data, "recent_failed_count")
    obv_slope = as_float(volume_data, "obv_slope_norm")
    delta_direction = as_int(volume_data, "delta_sign")
    has_spike = as_flag(volume_data, "spike")
    has_climax = as_flag(volume_data, "climax")

    # Distance input and its magnitude.
    distance = as_float(regime_data, "dist_atr")
    distance_magnitude = abs(distance)

    # Confidence and rule-engine scores.
    regime_conf = as_float(regime_data, "regime_confidence")
    score_regime = as_float(scores, "regime_score")
    score_volume = as_float(scores, "volume_score")
    score_structure = as_float(scores, "structure_score")
    score_confidence = as_float(scores, "confidence_score")
    score_total = as_float(scores, "total_score")

    # Interaction terms: pairwise products of base features.
    features = {
        "adx": trend_strength,
        "di_plus": plus_di,
        "di_minus": minus_di,
        "di_diff": di_spread,
        "di_ratio": di_share,
        "atr_pct": atr_fraction,
        "vol_ratio": volatility_ratio,
        "vol_compressed": is_compressed,
        "vol_expanding": is_expanding,
        "vol_expanding_from_base": is_expanding_from_base,
        "absorption": has_absorption,
        "failed_breakout": has_failed_breakout,
        "recent_failed_count": failed_count,
        "obv_slope_norm": obv_slope,
        "delta_sign": delta_direction,
        "spike": has_spike,
        "climax": has_climax,
        "dist_atr": distance,
        "dist_atr_abs": distance_magnitude,
        "regime_confidence": regime_conf,
        "regime_score": score_regime,
        "volume_score": score_volume,
        "structure_score": score_structure,
        "confidence_score": score_confidence,
        "total_score": score_total,
        "adx_x_regime": trend_strength * score_regime,
        "vol_x_obv": volatility_ratio * obv_slope,
        "score_x_conf": score_total * regime_conf,
    }

    # Fail loudly if the configured columns ask for something not built above.
    missing = set(FEATURE_COLUMNS) - set(features)
    if missing:
        raise KeyError(f"Missing features: {missing}")

    # Emit only the configured columns, in the configured order.
    ordered = {}
    for column in FEATURE_COLUMNS:
        ordered[column] = features[column]
    return ordered
def feature_dict_to_row(feat: Dict[str, float]) -> pd.Series:
    """
    Turn a feature dict into a pandas Series indexed by FEATURE_COLUMNS.

    Only the FEATURE_COLUMNS keys are copied, in that order; a key missing
    from ``feat`` raises KeyError.
    """
    ordered = {}
    for column in FEATURE_COLUMNS:
        ordered[column] = feat[column]
    return pd.Series(ordered)
def feature_dict_to_matrix(feat: Dict[str, float]) -> np.ndarray:
    """
    Pack one feature dict into a float64 array with a single row.

    Values are laid out in FEATURE_COLUMNS order, giving a
    (1, n_features) array; a key missing from ``feat`` raises KeyError.
    """
    row = []
    for column in FEATURE_COLUMNS:
        row.append(feat[column])
    return np.array([row], dtype=np.float64)
def validate_features(feat: Dict[str, float]) -> bool:
    """
    Return True if every FEATURE_COLUMNS entry is present and finite.

    A feature is rejected when it is missing or None, when it is NaN or
    infinite, or when it cannot be interpreted as a real number at all.

    The finiteness test is applied to every value, not just instances of
    the builtin ``float``. This catches NaN/inf held in other numeric types
    (for example NumPy scalars that do not subclass ``float``), which an
    ``isinstance(v, float)`` guard lets through.
    """
    for k in FEATURE_COLUMNS:
        v = feat.get(k)
        if v is None:
            return False
        try:
            if not math.isfinite(v):
                return False
        except (TypeError, OverflowError):
            # TypeError: the value is not a real number (e.g. a string).
            # OverflowError: an int too large to be represented as a float.
            return False
    return True