# aurora-brain / feature_engine.py
# v2.0: SMC features + fix missing columns for BTC prediction (commit c7e6a9e)
"""
╔══════════════════════════════════════════════════════════════╗
║ AURORA BRAIN — Feature Engine ║
║ ║
║ Genera 200+ features por vela a partir de datos OHLCV + ║
║ derivados + macro + sentimiento. ║
║ Usa librería `ta` (technical analysis) compatible con ║
║ pandas 2.x en HuggingFace. ║
╚══════════════════════════════════════════════════════════════╝
"""
import os
import argparse
import logging
import warnings
import numpy as np
import pandas as pd
import ta as talib
warnings.filterwarnings("ignore", category=FutureWarning)
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("AuroraBrain.Features")
DATA_DIR = os.path.join(os.path.dirname(__file__), "data")
# ═══════════════════════════════════════════════════════════
# A. MICROESTRUCTURA DE PRECIO (~50 features)
# ═══════════════════════════════════════════════════════════
def features_microestructura(df: pd.DataFrame) -> pd.DataFrame:
    """A. Price-microstructure features (~50 columns, all prefixed ``f_``).

    Covers candle anatomy (body/wick ratios), simple reversal patterns
    (doji, engulfing, hammer, shooting star), ATR-based volatility,
    short-horizon returns and a 5-candle direction bit-pattern.

    Mutates ``df`` in place and returns it; requires open/high/low/close columns.
    """
    o, h, l, c = df["open"], df["high"], df["low"], df["close"]
    # Full candle range; zero ranges become NaN so the ratios below never divide by 0.
    rng = (h - l).replace(0, np.nan)
    df["f_body_ratio"] = (c - o).abs() / rng
    # Wick sizes relative to range; max/min of (close, open) is the body's top/bottom.
    df["f_upper_wick_ratio"] = (h - pd.concat([c, o], axis=1).max(axis=1)) / rng
    df["f_lower_wick_ratio"] = (pd.concat([c, o], axis=1).min(axis=1) - l) / rng
    # Where the close sits inside the bar: 0 = at the low, 1 = at the high.
    df["f_price_position"] = (c - l) / rng
    df["f_is_bull"] = (c > o).astype(int)
    df["f_is_doji"] = (df["f_body_ratio"] < 0.1).astype(int)
    atr14 = talib.volatility.AverageTrueRange(h, l, c, window=14).average_true_range()
    df["f_body_atr_ratio"] = (c - o).abs() / atr14.replace(0, np.nan)
    # Opening gap vs previous close, in percent.
    df["f_gap_pct"] = (o - c.shift(1)) / c.shift(1) * 100
    # Run lengths: cumcount inside groups delimited by direction flips,
    # then masked (* bull / * bear) so the count survives only on matching bars.
    bull = (c > o).astype(int)
    df["f_consec_bull"] = bull.groupby((bull != bull.shift()).cumsum()).cumcount() + 1
    df["f_consec_bull"] = df["f_consec_bull"] * bull
    bear = (c < o).astype(int)
    df["f_consec_bear"] = bear.groupby((bear != bear.shift()).cumsum()).cumcount() + 1
    df["f_consec_bear"] = df["f_consec_bear"] * bear
    # Two-bar engulfing: current body fully covers the previous, opposite-colored body.
    df["f_engulfing_bull"] = ((c > o) & (c.shift(1) < o.shift(1)) &
                             (c > o.shift(1)) & (o < c.shift(1))).astype(int)
    df["f_engulfing_bear"] = ((c < o) & (c.shift(1) > o.shift(1)) &
                             (c < o.shift(1)) & (o > c.shift(1))).astype(int)
    # Single-bar reversal shapes built from the wick/body ratios above.
    df["f_hammer"] = ((df["f_lower_wick_ratio"] > 0.6) & (df["f_body_ratio"] < 0.3) &
                      (df["f_upper_wick_ratio"] < 0.1)).astype(int)
    df["f_shooting_star"] = ((df["f_upper_wick_ratio"] > 0.6) & (df["f_body_ratio"] < 0.3) &
                            (df["f_lower_wick_ratio"] < 0.1)).astype(int)
    df["f_atr_14"] = atr14
    df["f_atr_pct"] = atr14 / c * 100
    df["f_atr_roc_5"] = atr14.pct_change(5) * 100
    df["f_atr_roc_14"] = atr14.pct_change(14) * 100
    df["f_range_ratio_20"] = rng / rng.rolling(20).mean()
    # Percent returns over several short horizons.
    for period in [1, 3, 5, 10, 20]:
        df[f"f_return_{period}"] = c.pct_change(period) * 100
    df["f_hl_pct"] = rng / c * 100
    # Distance (%) from the rolling 20-bar extremes (always <= 0 vs the high,
    # >= 0 vs the low, since the current bar is included in the window).
    df["f_dist_high_20"] = (c - h.rolling(20).max()) / c * 100
    df["f_dist_low_20"] = (c - l.rolling(20).min()) / c * 100
    # Encode the last 5 candle directions as a 5-bit integer (bit i set = bar i ago bullish).
    pattern = pd.Series(0, index=df.index, dtype=int)
    for i in range(5):
        pattern += bull.shift(i).fillna(0).astype(int) * (2 ** i)
    df["f_candle_pattern_5"] = pattern
    logger.info(" ✅ A. Microestructura generada")
    return df
# ═══════════════════════════════════════════════════════════
# B. MOMENTUM Y TENDENCIA (~40 features)
# ═══════════════════════════════════════════════════════════
def features_momentum(df: pd.DataFrame) -> pd.DataFrame:
    """B. Momentum & trend features (~40): RSI, MACD, ADX, EMA stack,
    Bollinger Bands, Stochastic and swing-structure counts.

    Mutates ``df`` in place and returns it; indicator math comes from the
    ``ta`` library (imported as ``talib``).
    """
    c, h, l = df["close"], df["high"], df["low"]
    # RSI
    df["f_rsi_14"] = talib.momentum.RSIIndicator(c, window=14).rsi()
    df["f_rsi_7"] = talib.momentum.RSIIndicator(c, window=7).rsi()
    df["f_rsi_roc_5"] = df["f_rsi_14"].diff(5)
    df["f_rsi_roc_14"] = df["f_rsi_14"].diff(14)
    # NOTE(review): "f_return_14" is never created upstream (return periods are
    # 1/3/5/10/20 in features_microestructura), so the fallback expression
    # always runs — confirm whether a 14-bar return column was intended.
    df["f_rsi_price_div_14"] = df.get("f_return_14", c.pct_change(14)*100) - df["f_rsi_roc_14"]
    # MACD
    macd_ind = talib.trend.MACD(c, window_slow=26, window_fast=12, window_sign=9)
    df["f_macd_hist"] = macd_ind.macd_diff()
    df["f_macd_hist_roc"] = df["f_macd_hist"].diff(3)
    # Histogram sign flips = MACD line crossing its signal line.
    df["f_macd_cross_bull"] = ((df["f_macd_hist"] > 0) & (df["f_macd_hist"].shift(1) <= 0)).astype(int)
    df["f_macd_cross_bear"] = ((df["f_macd_hist"] < 0) & (df["f_macd_hist"].shift(1) >= 0)).astype(int)
    # ADX
    adx_ind = talib.trend.ADXIndicator(h, l, c, window=14)
    df["f_adx_14"] = adx_ind.adx()
    df["f_plus_di"] = adx_ind.adx_pos()
    df["f_minus_di"] = adx_ind.adx_neg()
    df["f_di_ratio"] = df["f_plus_di"] / df["f_minus_di"].replace(0, np.nan)
    adx_28 = talib.trend.ADXIndicator(h, l, c, window=28)
    df["f_adx_28"] = adx_28.adx()
    # EMAs: level, % distance from price and 5-bar slope for each length.
    for length in [9, 21, 55, 200]:
        ema = talib.trend.EMAIndicator(c, window=length).ema_indicator()
        df[f"f_ema_{length}"] = ema
        df[f"f_dist_ema_{length}"] = (c - ema) / ema * 100
        df[f"f_ema_{length}_slope"] = ema.diff(5) / ema * 100
    # Full EMA fan alignment (9 > 21 > 55 > 200, or the reverse for bearish).
    df["f_ema_order_bull"] = (
        (df["f_ema_9"] > df["f_ema_21"]) &
        (df["f_ema_21"] > df["f_ema_55"]) &
        (df["f_ema_55"] > df["f_ema_200"])
    ).astype(int)
    df["f_ema_order_bear"] = (
        (df["f_ema_9"] < df["f_ema_21"]) &
        (df["f_ema_21"] < df["f_ema_55"]) &
        (df["f_ema_55"] < df["f_ema_200"])
    ).astype(int)
    # Bollinger Bands
    bb = talib.volatility.BollingerBands(c, window=20, window_dev=2)
    df["f_bb_upper"] = bb.bollinger_hband()
    df["f_bb_mid"] = bb.bollinger_mavg()
    df["f_bb_lower"] = bb.bollinger_lband()
    df["f_bb_width"] = (df["f_bb_upper"] - df["f_bb_lower"]) / df["f_bb_mid"] * 100
    # Position of close inside the band [0..1]; NaN when bands collapse.
    df["f_bb_position"] = (c - df["f_bb_lower"]) / (df["f_bb_upper"] - df["f_bb_lower"]).replace(0, np.nan)
    # Stochastic
    stoch = talib.momentum.StochasticOscillator(h, l, c, window=14, smooth_window=3)
    df["f_stoch_k"] = stoch.stoch()
    df["f_stoch_d"] = stoch.stoch_signal()
    # HH/HL/LH/LL counts: compare the 10-bar swing extremes against the window
    # 10 bars earlier, then count occurrences over the last 20 bars.
    swing_h = h.rolling(10).max()
    swing_l = l.rolling(10).min()
    prev_sh = swing_h.shift(10)
    prev_sl = swing_l.shift(10)
    df["f_hh_count_20"] = (swing_h > prev_sh).rolling(20).sum().fillna(0)
    df["f_hl_count_20"] = (swing_l > prev_sl).rolling(20).sum().fillna(0)
    df["f_ll_count_20"] = (swing_l < prev_sl).rolling(20).sum().fillna(0)
    df["f_lh_count_20"] = (swing_h < prev_sh).rolling(20).sum().fillna(0)
    logger.info(" ✅ B. Momentum generado")
    return df
# ═══════════════════════════════════════════════════════════
# C. VOLUMEN Y FLUJO (~30 features)
# ═══════════════════════════════════════════════════════════
def features_volumen(df: pd.DataFrame) -> pd.DataFrame:
    """C. Volume & flow features (~30): relative volume, OBV, rolling-VWAP
    deviation, taker buy-ratio dynamics and above/below-midpoint volume balance.

    Mutates ``df`` in place and returns it. ``quote_volume``, ``taker_buy_ratio``
    and ``trades`` columns are optional; sensible fallbacks are used when absent.
    """
    vol = df["volume"]
    close = df["close"]
    quote_vol = df.get("quote_volume", vol * close)
    taker_ratio = df.get("taker_buy_ratio", pd.Series(0.5, index=df.index))
    # Relative volume vs its own rolling averages.
    df["f_vol_sma_20"] = vol.rolling(20).mean()
    df["f_vol_ratio_20"] = vol / df["f_vol_sma_20"].replace(0, np.nan)
    df["f_vol_ratio_5"] = vol / vol.rolling(5).mean().replace(0, np.nan)
    df["f_vol_accel"] = df["f_vol_ratio_20"].diff(3)
    df["f_vol_spike"] = (df["f_vol_ratio_20"] > 2.0).astype(int)
    # On-balance volume and its normalized 10-bar slope.
    obv_series = talib.volume.OnBalanceVolumeIndicator(close, vol).on_balance_volume()
    df["f_obv"] = obv_series
    df["f_obv_slope_10"] = obv_series.diff(10) / obv_series.abs().replace(0, np.nan) * 100
    # Rolling 20-bar VWAP approximation (quote volume / base volume).
    rolling_vwap = quote_vol.rolling(20).sum() / vol.rolling(20).sum().replace(0, np.nan)
    df["f_vwap_dev"] = (close - rolling_vwap) / rolling_vwap * 100
    # Taker buy-ratio level and dynamics.
    df["f_tbr"] = taker_ratio
    df["f_tbr_sma_10"] = taker_ratio.rolling(10).mean()
    df["f_tbr_roc_5"] = taker_ratio.diff(5)
    df["f_qvol_roc_5"] = quote_vol.pct_change(5) * 100
    if "trades" in df.columns:
        trade_count = df["trades"]
        df["f_trades_ratio_20"] = trade_count / trade_count.rolling(20).mean().replace(0, np.nan)
        df["f_avg_trade_size"] = quote_vol / trade_count.replace(0, np.nan)
    # Share of 20-bar volume traded with closes above the bar midpoint.
    midpoint = (df["high"] + df["low"]) / 2
    above = ((close > midpoint) * vol).rolling(20).sum()
    below = ((close <= midpoint) * vol).rolling(20).sum()
    df["f_vol_above_mid"] = above
    df["f_vol_below_mid"] = below
    df["f_vol_balance"] = above / (above + below).replace(0, np.nan)
    logger.info(" ✅ C. Volumen generado")
    return df
# ═══════════════════════════════════════════════════════════
# D. CROSS-ASSET INTELLIGENCE (~30 features)
# ═══════════════════════════════════════════════════════════
def features_cross_asset(df: pd.DataFrame, df_btc: pd.DataFrame = None,
                         df_macro: pd.DataFrame = None) -> pd.DataFrame:
    """D. Cross-asset features (~30): correlation/beta/lead vs BTC plus macro series.

    Fix: the BTC-itself / missing-BTC-data branch previously emitted only the
    two correlation columns, leaving ``f_beta_btc_30``, ``f_lead_btc_{1,3,6}``
    and ``f_spread_btc`` absent for BTC — so BTC's feature matrix differed from
    every other symbol's. It now fills the full column set with neutral
    defaults (beta 1.0 vs itself, zero lead-correlation and zero spread).

    Mutates ``df`` in place and returns it.
    """
    c = df["close"]
    symbol = df["symbol"].iloc[0] if "symbol" in df.columns else "UNKNOWN"
    if df_btc is not None and symbol != "BTCUSDT":
        btc_c = df_btc["close"].reindex(df.index, method="ffill")
        df["f_corr_btc_30"] = c.rolling(30).corr(btc_c)
        df["f_corr_btc_90"] = c.rolling(90).corr(btc_c)
        btc_ret = btc_c.pct_change()
        sym_ret = c.pct_change()
        # Rolling beta = cov(sym, btc) / var(btc) over 30 bars.
        cov = sym_ret.rolling(30).cov(btc_ret)
        var = btc_ret.rolling(30).var()
        df["f_beta_btc_30"] = cov / var.replace(0, np.nan)
        # Lead/lag: correlate the symbol's lagged returns with current BTC returns.
        for lag in [1, 3, 6]:
            df[f"f_lead_btc_{lag}"] = sym_ret.shift(lag).rolling(10).corr(btc_ret)
        df["f_spread_btc"] = (c / btc_c).pct_change(5) * 100
    else:
        # BTC itself (or no BTC data available): emit the same column set with
        # neutral defaults so all symbols share an identical feature schema.
        df["f_corr_btc_30"] = 1.0
        df["f_corr_btc_90"] = 1.0
        df["f_beta_btc_30"] = 1.0
        for lag in [1, 3, 6]:
            df[f"f_lead_btc_{lag}"] = 0.0
        df["f_spread_btc"] = 0.0
    if df_macro is not None and not df_macro.empty:
        # Forward-fill each macro series onto the kline index, plus a 5-step ROC.
        for col in df_macro.columns:
            macro_series = df_macro[col].reindex(df.index, method="ffill")
            df[f"f_{col}"] = macro_series
            df[f"f_{col}_roc_5d"] = macro_series.pct_change(5) * 100
    logger.info(" ✅ D. Cross-asset generado")
    return df
# ═══════════════════════════════════════════════════════════
# E. ON-CHAIN Y DERIVADOS (~30 features)
# ═══════════════════════════════════════════════════════════
def features_onchain(df: pd.DataFrame, df_funding: pd.DataFrame = None,
                     df_ls: pd.DataFrame = None, df_oi: pd.DataFrame = None) -> pd.DataFrame:
    """E. Derivatives features (~30): funding rate, long/short ratio and
    open-interest dynamics. Each group is skipped entirely when its source
    frame is missing or empty. Mutates ``df`` in place and returns it.
    """
    if df_funding is not None and not df_funding.empty:
        funding = df_funding["fundingRate"].reindex(df.index, method="ffill")
        df["f_funding_rate"] = funding
        df["f_funding_rate_sma_10"] = funding.rolling(10).mean()
        # Rates beyond ±0.1% per interval flag crowded positioning.
        df["f_funding_rate_extreme_pos"] = (funding > 0.001).astype(int)
        df["f_funding_rate_extreme_neg"] = (funding < -0.001).astype(int)
        df["f_funding_rate_roc"] = funding.diff(3)
    if df_ls is not None and not df_ls.empty:
        ratio = df_ls["longShortRatio"].reindex(df.index, method="ffill")
        df["f_ls_ratio"] = ratio
        df["f_ls_ratio_sma_10"] = ratio.rolling(10).mean()
        df["f_ls_ratio_roc"] = ratio.pct_change(5) * 100
        if "longAccount" in df_ls.columns:
            df["f_long_pct"] = df_ls["longAccount"].reindex(df.index, method="ffill")
    if df_oi is not None and not df_oi.empty:
        open_interest = df_oi["sumOpenInterestValue"].reindex(df.index, method="ffill")
        df["f_oi_value"] = open_interest
        df["f_oi_roc_5"] = open_interest.pct_change(5) * 100
        df["f_oi_roc_24"] = open_interest.pct_change(24) * 100
        if "f_return_5" in df.columns:
            # Price rising while OI falls (or vice versa) reads as divergence.
            df["f_oi_price_div"] = df["f_return_5"] - df["f_oi_roc_5"]
    logger.info(" ✅ E. On-chain generado")
    return df
# ═══════════════════════════════════════════════════════════
# F. SENTIMIENTO Y MACRO (~20 features)
# ═══════════════════════════════════════════════════════════
def features_sentimiento(df: pd.DataFrame, df_fg: pd.DataFrame = None) -> pd.DataFrame:
    """F. Sentiment and calendar features (~20).

    Adds Fear & Greed index features (when ``df_fg`` is provided) plus
    calendar/seasonality features derived from a DatetimeIndex.

    Fix: the halving timestamp is aligned to the index's tz-awareness before
    subtraction — pandas raises TypeError when subtracting a tz-aware
    Timestamp from a tz-naive DatetimeIndex, which previously broke this step
    for tz-naive kline data.

    Mutates ``df`` in place and returns it.
    """
    if df_fg is not None and not df_fg.empty:
        fg = df_fg["fear_greed"].reindex(df.index, method="ffill")
        df["f_fear_greed"] = fg
        df["f_fear_greed_roc_5"] = fg.diff(5)
        # Standard Fear & Greed bands: <25 extreme fear, >75 extreme greed.
        df["f_fear_greed_extreme_fear"] = (fg < 25).astype(int)
        df["f_fear_greed_extreme_greed"] = (fg > 75).astype(int)
    if isinstance(df.index, pd.DatetimeIndex):
        df["f_hour_of_day"] = df.index.hour
        df["f_day_of_week"] = df.index.dayofweek
        df["f_is_weekend"] = (df.index.dayofweek >= 5).astype(int)
        df["f_month"] = df.index.month
        df["f_is_q4"] = (df.index.month >= 10).astype(int)
        # 2024-04-20 = most recent BTC halving. Match the index's tz-awareness
        # so the subtraction is valid for both naive and aware indexes.
        last_halving = pd.Timestamp("2024-04-20", tz="UTC")
        if df.index.tz is None:
            last_halving = last_halving.tz_localize(None)
        df["f_days_since_halving"] = (df.index - last_halving).days
    logger.info(" ✅ F. Sentimiento generado")
    return df
# ═══════════════════════════════════════════════════════════
# ORQUESTADOR PRINCIPAL
# ═══════════════════════════════════════════════════════════
def features_smc(df, swing_len=10, displ_mult=1.5, atr_len=14):
    """G. Smart Money Concepts features — same patterns as the SMC v9 Pine Scripts.

    Derives market-structure bias (HH/HL/LH/LL counts), displacement candles,
    break-of-structure (BOS), fair value gaps (FVG), liquidity sweeps and a
    confluence score combining all five signal families.

    Parameters
    ----------
    swing_len : int
        Rolling window used for swing highs/lows.
    displ_mult : float
        ATR multiple a candle body must exceed to count as displacement.
    atr_len : int
        ATR period.

    Mutates ``df`` in place and returns it.
    """
    h, l, c, o = df["high"], df["low"], df["close"], df["open"]
    atr = talib.volatility.AverageTrueRange(h, l, c, window=atr_len).average_true_range()
    # Market structure: swing extremes vs the window one swing_len earlier.
    sh = h.rolling(swing_len).max()
    sl = l.rolling(swing_len).min()
    prev_sh = sh.shift(swing_len)
    prev_sl = sl.shift(swing_len)
    hh = (sh > prev_sh).astype(int)
    hl = (sl > prev_sl).astype(int)
    lh = (sh < prev_sh).astype(int)
    ll_sig = (sl < prev_sl).astype(int)
    df["f_smc_hh_20"] = hh.rolling(20).sum()
    df["f_smc_hl_20"] = hl.rolling(20).sum()
    df["f_smc_lh_20"] = lh.rolling(20).sum()
    df["f_smc_ll_20"] = ll_sig.rolling(20).sum()
    # Net structure bias: bullish structure counts minus bearish ones.
    df["f_smc_bias_score"] = (df["f_smc_hh_20"] + df["f_smc_hl_20"]) - (df["f_smc_lh_20"] + df["f_smc_ll_20"])
    df["f_smc_bias_bull"] = (df["f_smc_bias_score"] > 2).astype(int)
    df["f_smc_bias_bear"] = (df["f_smc_bias_score"] < -2).astype(int)
    # Displacement: candle body larger than displ_mult × ATR.
    body_size = (c - o).abs()
    df["f_smc_displacement"] = (body_size > atr * displ_mult).astype(int)
    df["f_smc_displacement_bull"] = ((c > o) & (body_size > atr * displ_mult)).astype(int)
    df["f_smc_displacement_bear"] = ((c < o) & (body_size > atr * displ_mult)).astype(int)
    df["f_smc_body_atr_mult"] = body_size / atr.replace(0, np.nan)
    # Break of structure: candle opens inside and closes beyond the prior swing.
    prev_swing_h = h.rolling(swing_len).max().shift(1)
    prev_swing_l = l.rolling(swing_len).min().shift(1)
    df["f_smc_bos_bull"] = ((c > prev_swing_h) & (o < prev_swing_h)).astype(int)
    df["f_smc_bos_bear"] = ((c < prev_swing_l) & (o > prev_swing_l)).astype(int)
    df["f_smc_bos_bull_5"] = df["f_smc_bos_bull"].rolling(5).sum()
    df["f_smc_bos_bear_5"] = df["f_smc_bos_bear"].rolling(5).sum()
    # Fair value gap (3-candle imbalance): gap between bar i-2 and bar i.
    bull_fvg = (h.shift(2) < l).astype(int)
    bear_fvg = (l.shift(2) > h).astype(int)
    df["f_smc_fvg_bull"] = bull_fvg
    df["f_smc_fvg_bear"] = bear_fvg
    # Gap size as a percentage of price, floored at 0 when no gap exists.
    df["f_smc_fvg_bull_size"] = ((l - h.shift(2)) / c * 100).clip(lower=0)
    df["f_smc_fvg_bear_size"] = ((l.shift(2) - h) / c * 100).clip(lower=0)
    df["f_smc_fvg_bull_count_10"] = bull_fvg.rolling(10).sum()
    df["f_smc_fvg_bear_count_10"] = bear_fvg.rolling(10).sum()
    # Liquidity sweep: wick pierces the prior swing but the close recovers past it.
    prev_low = l.rolling(swing_len).min().shift(1)
    prev_high = h.rolling(swing_len).max().shift(1)
    df["f_smc_sweep_bull"] = ((l < prev_low) & (c > prev_low)).astype(int)
    df["f_smc_sweep_bear"] = ((h > prev_high) & (c < prev_high)).astype(int)
    # Confluence: one point per signal family observed within its recent
    # lookback window (rolling max turns "seen recently" into 0/1); max 5.
    bull_score = (
        df["f_smc_bias_bull"] +
        df["f_smc_displacement_bull"].rolling(5).max().fillna(0) +
        df["f_smc_bos_bull"].rolling(5).max().fillna(0) +
        df["f_smc_fvg_bull"].rolling(3).max().fillna(0) +
        df["f_smc_sweep_bull"].rolling(10).max().fillna(0)
    )
    df["f_smc_confluence_bull"] = bull_score
    bear_score = (
        df["f_smc_bias_bear"] +
        df["f_smc_displacement_bear"].rolling(5).max().fillna(0) +
        df["f_smc_bos_bear"].rolling(5).max().fillna(0) +
        df["f_smc_fvg_bear"].rolling(3).max().fillna(0) +
        df["f_smc_sweep_bear"].rolling(10).max().fillna(0)
    )
    df["f_smc_confluence_bear"] = bear_score
    df["f_smc_confluence_net"] = bull_score - bear_score
    # Full setup: all five components of one side present at once.
    df["f_smc_full_setup_bull"] = (bull_score >= 5).astype(int)
    df["f_smc_full_setup_bear"] = (bear_score >= 5).astype(int)
    logger.info(" G. Smart Money Concepts generado")
    return df
def generate_features(symbol: str, timeframe: str = "4h",
                      btc_symbol: str = "BTCUSDT") -> pd.DataFrame:
    """Orchestrate the full feature pipeline for one symbol/timeframe.

    Loads klines plus auxiliary datasets from DATA_DIR, runs feature groups
    A-G in order, replaces infinities with NaN in every ``f_``-prefixed
    column and drops the indicator warm-up window. Returns an empty
    DataFrame when the klines file is missing.
    """
    logger.info("🔧 Generando features para %s %s...", symbol, timeframe)
    klines_path = os.path.join(DATA_DIR, f"klines_{symbol}_{timeframe}.parquet")
    if not os.path.exists(klines_path):
        logger.error("❌ No se encontró %s", klines_path)
        return pd.DataFrame()
    df = pd.read_parquet(klines_path)
    logger.info(" 📊 %d velas cargadas", len(df))
    # BTC reference frame is only needed for non-BTC symbols.
    df_btc = None
    if symbol != btc_symbol:
        btc_path = os.path.join(DATA_DIR, f"klines_{btc_symbol}_{timeframe}.parquet")
        if os.path.exists(btc_path):
            df_btc = pd.read_parquet(btc_path)
    # Auxiliary datasets; each loader returns an empty frame when absent.
    df_funding = _load_parquet(f"funding_{symbol}.parquet")
    df_ls = _load_parquet(f"longshort_{symbol}.parquet")
    df_oi = _load_parquet(f"oi_{symbol}.parquet")
    df_macro = _load_parquet("macro.parquet")
    df_fg = _load_parquet("fear_greed.parquet")
    # Feature groups A-G, applied in sequence.
    df = features_microestructura(df)
    df = features_momentum(df)
    df = features_volumen(df)
    df = features_cross_asset(df, df_btc=df_btc, df_macro=df_macro)
    df = features_onchain(df, df_funding=df_funding, df_ls=df_ls, df_oi=df_oi)
    df = features_sentimiento(df, df_fg=df_fg)
    df = features_smc(df)
    feature_cols = [col for col in df.columns if col.startswith("f_")]
    df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], np.nan)
    # Drop warm-up rows where the slowest indicators (e.g. EMA-200) are NaN.
    warmup = 210
    df = df.iloc[warmup:]
    nan_pct = df[feature_cols].isna().mean().mean() * 100
    logger.info(" 📊 %d velas × %d features (NaN: %.1f%%)", len(df), len(feature_cols), nan_pct)
    return df
def _load_parquet(filename: str) -> pd.DataFrame:
    """Read a parquet file from DATA_DIR; return an empty frame when absent."""
    candidate = os.path.join(DATA_DIR, filename)
    if not os.path.exists(candidate):
        return pd.DataFrame()
    return pd.read_parquet(candidate)
def get_feature_columns(df: pd.DataFrame) -> list[str]:
    """Return the alphabetically sorted feature column names (``f_`` prefix)."""
    feature_names = (name for name in df.columns if name.startswith("f_"))
    return sorted(feature_names)
def main():
    """CLI entry point: build and persist features for one symbol or for all."""
    parser = argparse.ArgumentParser(description="Aurora Brain — Feature Engine")
    parser.add_argument("--symbol", default="BTCUSDT")
    parser.add_argument("--timeframe", default="4h")
    parser.add_argument("--all", action="store_true")
    args = parser.parse_args()
    universe = [
        "BTCUSDT", "ETHUSDT", "SOLUSDT",
        "LINKUSDT", "TAOUSDT", "WLDUSDT", "VIRTUALUSDT", "FETUSDT",
        "INJUSDT", "GRTUSDT", "KITEUSDT", "THETAUSDT",
        "KAITOUSDT", "SENTUSDT", "LPTUSDT", "AWEUSDT", "TURBOUSDT",
        "SAHARAUSDT", "VANAUSDT", "NMRUSDT", "OPENUSDT", "ROBOUSDT",
        "HOLOUSDT", "RLCUSDT", "IOUSDT", "PHAUSDT", "IQUSDT",
        "AIXBTUSDT", "SAPIENUSDT", "FLUXUSDT", "ALLOUSDT", "MIRAUSDT",
    ]
    targets = universe if args.all else [args.symbol]
    for sym in targets:
        features = generate_features(sym, args.timeframe)
        if features.empty:
            continue
        out_path = os.path.join(DATA_DIR, f"features_{sym}_{args.timeframe}.parquet")
        features.to_parquet(out_path)
        logger.info("💾 Guardado: %s", out_path)


if __name__ == "__main__":
    main()