"""
╔══════════════════════════════════════════════════════════════╗
║  AURORA BRAIN — Feature Engine                               ║
║                                                              ║
║  Generates 200+ features per candle from OHLCV data +        ║
║  derivatives + macro + sentiment.                            ║
║  Uses the `ta` (technical analysis) library, compatible      ║
║  with pandas 2.x on HuggingFace.                             ║
╚══════════════════════════════════════════════════════════════╝
"""

import os
import argparse
import logging
import warnings

import numpy as np
import pandas as pd
import ta as talib

warnings.filterwarnings("ignore", category=FutureWarning)
logging.basicConfig(level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s")
logger = logging.getLogger("AuroraBrain.Features")

DATA_DIR = os.path.join(os.path.dirname(__file__), "data")


# ═══════════════════════════════════════════════════════════
# A. PRICE MICROSTRUCTURE (~50 features)
# ═══════════════════════════════════════════════════════════

def features_microestructura(df: pd.DataFrame) -> pd.DataFrame:
    """Add candle-shape, ATR, return and range features to ``df``.

    Reads the ``open``, ``high``, ``low`` and ``close`` columns, writes the
    new ``f_*`` columns into ``df`` itself (the frame is mutated in place)
    and returns that same frame.
    """

    def _run_length(flag: pd.Series) -> pd.Series:
        # Position (1-based) inside the current run of equal values, zeroed
        # wherever the flag itself is 0.
        run_id = (flag != flag.shift()).cumsum()
        return (flag.groupby(run_id).cumcount() + 1) * flag

    open_, high, low, close = df["open"], df["high"], df["low"], df["close"]
    prev_open, prev_close = open_.shift(1), close.shift(1)

    # A zero high-low range becomes NaN so the ratios below never divide by 0.
    candle_range = (high - low).replace(0, np.nan)
    body = (close - open_).abs()
    open_close = pd.concat([close, open_], axis=1)
    body_top = open_close.max(axis=1)
    body_bottom = open_close.min(axis=1)

    # --- candle anatomy -------------------------------------------------
    df["f_body_ratio"] = body / candle_range
    df["f_upper_wick_ratio"] = (high - body_top) / candle_range
    df["f_lower_wick_ratio"] = (body_bottom - low) / candle_range
    df["f_price_position"] = (close - low) / candle_range
    is_bull = (close > open_).astype(int)
    is_bear = (close < open_).astype(int)
    df["f_is_bull"] = is_bull
    df["f_is_doji"] = (df["f_body_ratio"] < 0.1).astype(int)

    atr = talib.volatility.AverageTrueRange(high, low, close, window=14).average_true_range()
    df["f_body_atr_ratio"] = body / atr.replace(0, np.nan)
    df["f_gap_pct"] = (open_ - prev_close) / prev_close * 100

    # --- streaks and two-candle patterns ---------------------------------
    df["f_consec_bull"] = _run_length(is_bull)
    df["f_consec_bear"] = _run_length(is_bear)

    df["f_engulfing_bull"] = (
        (close > open_) & (prev_close < prev_open) & (close > prev_open) & (open_ < prev_close)
    ).astype(int)
    df["f_engulfing_bear"] = (
        (close < open_) & (prev_close > prev_open) & (close < prev_open) & (open_ > prev_close)
    ).astype(int)

    small_body = df["f_body_ratio"] < 0.3
    df["f_hammer"] = (
        (df["f_lower_wick_ratio"] > 0.6) & small_body & (df["f_upper_wick_ratio"] < 0.1)
    ).astype(int)
    df["f_shooting_star"] = (
        (df["f_upper_wick_ratio"] > 0.6) & small_body & (df["f_lower_wick_ratio"] < 0.1)
    ).astype(int)

    # --- volatility -------------------------------------------------------
    df["f_atr_14"] = atr
    df["f_atr_pct"] = atr / close * 100
    df["f_atr_roc_5"] = atr.pct_change(5) * 100
    df["f_atr_roc_14"] = atr.pct_change(14) * 100
    df["f_range_ratio_20"] = candle_range / candle_range.rolling(20).mean()

    # --- returns and distance to recent extremes -------------------------
    for lookback in (1, 3, 5, 10, 20):
        df[f"f_return_{lookback}"] = close.pct_change(lookback) * 100

    df["f_hl_pct"] = candle_range / close * 100
    df["f_dist_high_20"] = (close - high.rolling(20).max()) / close * 100
    df["f_dist_low_20"] = (close - low.rolling(20).min()) / close * 100

    # Encode the bull/non-bull state of the last five candles as a 5-bit
    # integer: the current candle is bit 0, the candle four bars back bit 4.
    pattern = pd.Series(0, index=df.index, dtype=int)
    for lag, weight in enumerate((1, 2, 4, 8, 16)):
        pattern = pattern + is_bull.shift(lag).fillna(0).astype(int) * weight
    df["f_candle_pattern_5"] = pattern

    logger.info(" ✅ A. Microestructura generada")
    return df


# ═══════════════════════════════════════════════════════════
# B.
# MOMENTUM AND TREND (~40 features)
# ═══════════════════════════════════════════════════════════

def features_momentum(df: pd.DataFrame) -> pd.DataFrame:
    """Add RSI, MACD, ADX, EMA, Bollinger, stochastic and swing-count features.

    Reads ``close``, ``high`` and ``low``; writes the new ``f_*`` columns into
    ``df`` in place and returns the same frame.
    """
    close, high, low = df["close"], df["high"], df["low"]

    # --- RSI --------------------------------------------------------------
    df["f_rsi_14"] = talib.momentum.RSIIndicator(close, window=14).rsi()
    df["f_rsi_7"] = talib.momentum.RSIIndicator(close, window=7).rsi()
    rsi_14 = df["f_rsi_14"]
    df["f_rsi_roc_5"] = rsi_14.diff(5)
    df["f_rsi_roc_14"] = rsi_14.diff(14)
    # Reuse a precomputed 14-bar return when present, otherwise derive it.
    if "f_return_14" in df.columns:
        price_roc_14 = df["f_return_14"]
    else:
        price_roc_14 = close.pct_change(14) * 100
    df["f_rsi_price_div_14"] = price_roc_14 - df["f_rsi_roc_14"]

    # --- MACD -------------------------------------------------------------
    macd = talib.trend.MACD(close, window_slow=26, window_fast=12, window_sign=9)
    df["f_macd_hist"] = macd.macd_diff()
    hist = df["f_macd_hist"]
    prev_hist = hist.shift(1)
    df["f_macd_hist_roc"] = hist.diff(3)
    # A cross is the bar where the histogram changes sign.
    df["f_macd_cross_bull"] = ((hist > 0) & (prev_hist <= 0)).astype(int)
    df["f_macd_cross_bear"] = ((hist < 0) & (prev_hist >= 0)).astype(int)

    # --- ADX / directional indicators --------------------------------------
    adx_14 = talib.trend.ADXIndicator(high, low, close, window=14)
    df["f_adx_14"] = adx_14.adx()
    df["f_plus_di"] = adx_14.adx_pos()
    df["f_minus_di"] = adx_14.adx_neg()
    df["f_di_ratio"] = df["f_plus_di"] / df["f_minus_di"].replace(0, np.nan)
    df["f_adx_28"] = talib.trend.ADXIndicator(high, low, close, window=28).adx()

    # --- EMAs ---------------------------------------------------------------
    ema_lengths = (9, 21, 55, 200)
    for span in ema_lengths:
        ema = talib.trend.EMAIndicator(close, window=span).ema_indicator()
        df[f"f_ema_{span}"] = ema
        df[f"f_dist_ema_{span}"] = (close - ema) / ema * 100
        df[f"f_ema_{span}_slope"] = ema.diff(5) / ema * 100

    ema_9, ema_21, ema_55, ema_200 = (df[f"f_ema_{span}"] for span in ema_lengths)
    df["f_ema_order_bull"] = (
        (ema_9 > ema_21) & (ema_21 > ema_55) & (ema_55 > ema_200)
    ).astype(int)
    df["f_ema_order_bear"] = (
        (ema_9 < ema_21) & (ema_21 < ema_55) & (ema_55 < ema_200)
    ).astype(int)

    # --- Bollinger Bands ----------------------------------------------------
    bands = talib.volatility.BollingerBands(close, window=20, window_dev=2)
    df["f_bb_upper"] = bands.bollinger_hband()
    df["f_bb_mid"] = bands.bollinger_mavg()
    df["f_bb_lower"] = bands.bollinger_lband()
    band_span = df["f_bb_upper"] - df["f_bb_lower"]
    df["f_bb_width"] = band_span / df["f_bb_mid"] * 100
    df["f_bb_position"] = (close - df["f_bb_lower"]) / band_span.replace(0, np.nan)

    # --- Stochastic ---------------------------------------------------------
    stochastic = talib.momentum.StochasticOscillator(high, low, close, window=14, smooth_window=3)
    df["f_stoch_k"] = stochastic.stoch()
    df["f_stoch_d"] = stochastic.stoch_signal()

    # --- HH / HL / LL / LH counts -------------------------------------------
    # Compare the 10-bar rolling extreme with the same extreme 10 bars ago and
    # count how often each relation held over the last 20 bars.
    swing_high = high.rolling(10).max()
    swing_low = low.rolling(10).min()
    older_high = swing_high.shift(10)
    older_low = swing_low.shift(10)
    structure_flags = {
        "f_hh_count_20": swing_high > older_high,
        "f_hl_count_20": swing_low > older_low,
        "f_ll_count_20": swing_low < older_low,
        "f_lh_count_20": swing_high < older_high,
    }
    for column, flag in structure_flags.items():
        df[column] = flag.rolling(20).sum().fillna(0)

    logger.info(" ✅ B. Momentum generado")
    return df


# ═══════════════════════════════════════════════════════════
# C. VOLUME AND FLOW (~30 features)
# ═══════════════════════════════════════════════════════════

def features_volumen(df: pd.DataFrame) -> pd.DataFrame:
    """Add volume, OBV, VWAP-deviation and taker-flow features.

    Reads ``volume``, ``close``, ``high`` and ``low``. ``quote_volume``,
    ``taker_buy_ratio`` and ``trades`` are optional: the first falls back to
    ``volume * close``, the second to a constant 0.5, and the trade-based
    features are skipped when ``trades`` is absent. ``df`` is mutated in place
    and returned.
    """
    volume = df["volume"]
    close = df["close"]
    if "quote_volume" in df.columns:
        quote_volume = df["quote_volume"]
    else:
        quote_volume = volume * close
    if "taker_buy_ratio" in df.columns:
        taker_ratio = df["taker_buy_ratio"]
    else:
        taker_ratio = pd.Series(0.5, index=df.index)

    # --- relative volume ------------------------------------------------------
    df["f_vol_sma_20"] = volume.rolling(20).mean()
    df["f_vol_ratio_20"] = volume / df["f_vol_sma_20"].replace(0, np.nan)
    df["f_vol_ratio_5"] = volume / volume.rolling(5).mean().replace(0, np.nan)
    df["f_vol_accel"] = df["f_vol_ratio_20"].diff(3)
    df["f_vol_spike"] = (df["f_vol_ratio_20"] > 2.0).astype(int)

    # --- on-balance volume ----------------------------------------------------
    obv = talib.volume.OnBalanceVolumeIndicator(close, volume).on_balance_volume()
    df["f_obv"] = obv
    df["f_obv_slope_10"] = obv.diff(10) / obv.abs().replace(0, np.nan) * 100

    # --- rolling VWAP approximation (20-bar quote volume / base volume) --------
    rolling_vwap = quote_volume.rolling(20).sum() / volume.rolling(20).sum().replace(0, np.nan)
    df["f_vwap_dev"] = (close - rolling_vwap) / rolling_vwap * 100

    # --- taker flow -------------------------------------------------------------
    df["f_tbr"] = taker_ratio
    df["f_tbr_sma_10"] = taker_ratio.rolling(10).mean()
    df["f_tbr_roc_5"] = taker_ratio.diff(5)
    df["f_qvol_roc_5"] = quote_volume.pct_change(5) * 100

    if "trades" in df.columns:
        trade_count = df["trades"]
        df["f_trades_ratio_20"] = trade_count / trade_count.rolling(20).mean().replace(0, np.nan)
        df["f_avg_trade_size"] = quote_volume / trade_count.replace(0, np.nan)

    # --- volume split by where the close sits relative to the bar midpoint ------
    bar_mid = (df["high"] + df["low"]) / 2
    df["f_vol_above_mid"] = ((close > bar_mid) * volume).rolling(20).sum()
    df["f_vol_below_mid"] = ((close <= bar_mid) * volume).rolling(20).sum()
    total_split = df["f_vol_above_mid"] + df["f_vol_below_mid"]
    df["f_vol_balance"] = df["f_vol_above_mid"] / total_split.replace(0, np.nan)

    logger.info(" ✅ C. Volumen generado")
    return df


# ═══════════════════════════════════════════════════════════
# D. CROSS-ASSET INTELLIGENCE (~30 features)
# ═══════════════════════════════════════════════════════════

def features_cross_asset(df: pd.DataFrame, df_btc: pd.DataFrame = None,
                         df_macro: pd.DataFrame = None) -> pd.DataFrame:
    """Add BTC-relative and macro features.

    Args:
        df: Candle frame with a ``close`` column and, optionally, a ``symbol``
            column whose first value identifies the asset.
        df_btc: BTC candles with a ``close`` column, forward-filled onto
            ``df.index``. When it is ``None`` or the symbol is ``"BTCUSDT"``
            only the two correlation columns are written, fixed at 1.0.
        df_macro: Optional frame of macro series; every column is
            forward-filled onto ``df.index`` and copied as ``f_<col>`` together
            with its 5-row percentage change.

    Returns:
        The same ``df`` object, mutated in place.
    """
    close = df["close"]
    symbol = df["symbol"].iloc[0] if "symbol" in df.columns else "UNKNOWN"

    use_btc_reference = df_btc is not None and symbol != "BTCUSDT"
    if not use_btc_reference:
        df["f_corr_btc_30"] = 1.0
        df["f_corr_btc_90"] = 1.0
    else:
        btc_close = df_btc["close"].reindex(df.index, method="ffill")
        df["f_corr_btc_30"] = close.rolling(30).corr(btc_close)
        df["f_corr_btc_90"] = close.rolling(90).corr(btc_close)

        btc_returns = btc_close.pct_change()
        asset_returns = close.pct_change()
        # Rolling beta = cov(asset, btc) / var(btc) over 30 bars.
        covariance = asset_returns.rolling(30).cov(btc_returns)
        btc_variance = btc_returns.rolling(30).var()
        df["f_beta_btc_30"] = covariance / btc_variance.replace(0, np.nan)

        for lag in (1, 3, 6):
            lagged_asset = asset_returns.shift(lag)
            df[f"f_lead_btc_{lag}"] = lagged_asset.rolling(10).corr(btc_returns)

        df["f_spread_btc"] = (close / btc_close).pct_change(5) * 100

    if df_macro is not None and not df_macro.empty:
        for name in df_macro.columns:
            aligned = df_macro[name].reindex(df.index, method="ffill")
            df[f"f_{name}"] = aligned
            df[f"f_{name}_roc_5d"] = aligned.pct_change(5) * 100

    logger.info(" ✅ D. Cross-asset generado")
    return df


# ═══════════════════════════════════════════════════════════
# E.
# ON-CHAIN AND DERIVATIVES (~30 features)
# ═══════════════════════════════════════════════════════════

def features_onchain(df: pd.DataFrame, df_funding: pd.DataFrame = None,
                     df_ls: pd.DataFrame = None, df_oi: pd.DataFrame = None) -> pd.DataFrame:
    """Add funding-rate, long/short-ratio and open-interest features.

    Every auxiliary frame is optional; a frame that is ``None`` or empty is
    skipped and its columns are simply not created. Each series used is
    forward-filled onto ``df.index``.

    Args:
        df: Candle frame to enrich (mutated in place).
        df_funding: Frame with a ``fundingRate`` column.
        df_ls: Frame with a ``longShortRatio`` column and, optionally,
            ``longAccount``.
        df_oi: Frame with a ``sumOpenInterestValue`` column.

    Returns:
        The same ``df`` object.
    """

    def _has_rows(frame) -> bool:
        return frame is not None and not frame.empty

    def _aligned(frame: pd.DataFrame, column: str) -> pd.Series:
        # Carry the last known value forward onto the candle index.
        return frame[column].reindex(df.index, method="ffill")

    if _has_rows(df_funding):
        funding = _aligned(df_funding, "fundingRate")
        df["f_funding_rate"] = funding
        df["f_funding_rate_sma_10"] = funding.rolling(10).mean()
        df["f_funding_rate_extreme_pos"] = (funding > 0.001).astype(int)
        df["f_funding_rate_extreme_neg"] = (funding < -0.001).astype(int)
        df["f_funding_rate_roc"] = funding.diff(3)

    if _has_rows(df_ls):
        long_short = _aligned(df_ls, "longShortRatio")
        df["f_ls_ratio"] = long_short
        df["f_ls_ratio_sma_10"] = long_short.rolling(10).mean()
        df["f_ls_ratio_roc"] = long_short.pct_change(5) * 100
        if "longAccount" in df_ls.columns:
            df["f_long_pct"] = _aligned(df_ls, "longAccount")

    if _has_rows(df_oi):
        open_interest = _aligned(df_oi, "sumOpenInterestValue")
        df["f_oi_value"] = open_interest
        df["f_oi_roc_5"] = open_interest.pct_change(5) * 100
        df["f_oi_roc_24"] = open_interest.pct_change(24) * 100
        # The divergence needs the 5-bar price return; it is only written
        # when that column already exists on df.
        if "f_return_5" in df.columns:
            df["f_oi_price_div"] = df["f_return_5"] - df["f_oi_roc_5"]

    logger.info(" ✅ E. On-chain generado")
    return df


# ═══════════════════════════════════════════════════════════
# F.
# SENTIMENT AND MACRO (~20 features)
# ═══════════════════════════════════════════════════════════

def features_sentimiento(df: pd.DataFrame, df_fg: pd.DataFrame = None) -> pd.DataFrame:
    """Add fear-and-greed and calendar features.

    Args:
        df: Candle frame to enrich (mutated in place). Calendar features are
            only written when its index is a ``DatetimeIndex``.
        df_fg: Optional frame with a ``fear_greed`` column, forward-filled onto
            ``df.index``. Skipped when ``None`` or empty.

    Returns:
        The same ``df`` object.
    """
    if df_fg is not None and not df_fg.empty:
        fear_greed = df_fg["fear_greed"].reindex(df.index, method="ffill")
        df["f_fear_greed"] = fear_greed
        df["f_fear_greed_roc_5"] = fear_greed.diff(5)
        df["f_fear_greed_extreme_fear"] = (fear_greed < 25).astype(int)
        df["f_fear_greed_extreme_greed"] = (fear_greed > 75).astype(int)

    if isinstance(df.index, pd.DatetimeIndex):
        df["f_hour_of_day"] = df.index.hour
        df["f_day_of_week"] = df.index.dayofweek
        df["f_is_weekend"] = (df.index.dayofweek >= 5).astype(int)
        df["f_month"] = df.index.month
        df["f_is_q4"] = (df.index.month >= 10).astype(int)

        last_halving = pd.Timestamp("2024-04-20", tz="UTC")
        if df.index.tz is None:
            # pandas refuses to subtract a tz-aware timestamp from a tz-naive
            # index, so drop the zone from the reference date instead.
            # NOTE(review): this treats naive candle timestamps as UTC —
            # confirm against the data loader.
            last_halving = last_halving.tz_localize(None)
        df["f_days_since_halving"] = (df.index - last_halving).days

    logger.info(" ✅ F. Sentimiento generado")
    return df


# ═══════════════════════════════════════════════════════════
# G. SMART MONEY CONCEPTS
# ═══════════════════════════════════════════════════════════

def features_smc(df, swing_len=10, displ_mult=1.5, atr_len=14):
    """Add Smart Money Concepts (SMC) features.

    The original note says these follow the same patterns as the SMC v9 Pine
    Scripts.

    Args:
        df: Candle frame with ``open``, ``high``, ``low`` and ``close``
            columns (mutated in place).
        swing_len: Window of the rolling high/low used as swing levels.
        displ_mult: A candle body larger than ``displ_mult * ATR`` is flagged
            as a displacement.
        atr_len: ATR window.

    Returns:
        The same ``df`` object with the ``f_smc_*`` columns added.
    """
    h, l, c, o = df["high"], df["low"], df["close"], df["open"]
    atr = talib.volatility.AverageTrueRange(h, l, c, window=atr_len).average_true_range()

    # Rolling extremes are computed once and reused for the structure, BOS
    # and sweep features below.
    swing_high = h.rolling(swing_len).max()
    swing_low = l.rolling(swing_len).min()

    # --- market structure: current swing vs the swing one window earlier ---
    earlier_high = swing_high.shift(swing_len)
    earlier_low = swing_low.shift(swing_len)
    hh = (swing_high > earlier_high).astype(int)
    hl = (swing_low > earlier_low).astype(int)
    lh = (swing_high < earlier_high).astype(int)
    ll_sig = (swing_low < earlier_low).astype(int)
    df["f_smc_hh_20"] = hh.rolling(20).sum()
    df["f_smc_hl_20"] = hl.rolling(20).sum()
    df["f_smc_lh_20"] = lh.rolling(20).sum()
    df["f_smc_ll_20"] = ll_sig.rolling(20).sum()
    df["f_smc_bias_score"] = (
        (df["f_smc_hh_20"] + df["f_smc_hl_20"]) - (df["f_smc_lh_20"] + df["f_smc_ll_20"])
    )
    df["f_smc_bias_bull"] = (df["f_smc_bias_score"] > 2).astype(int)
    df["f_smc_bias_bear"] = (df["f_smc_bias_score"] < -2).astype(int)

    # --- displacement: unusually large candle bodies -------------------------
    body_size = (c - o).abs()
    is_displacement = body_size > atr * displ_mult
    df["f_smc_displacement"] = is_displacement.astype(int)
    df["f_smc_displacement_bull"] = ((c > o) & is_displacement).astype(int)
    df["f_smc_displacement_bear"] = ((c < o) & is_displacement).astype(int)
    df["f_smc_body_atr_mult"] = body_size / atr.replace(0, np.nan)

    # --- BOS columns: candle opens inside and closes beyond the previous
    # bar's rolling swing level ------------------------------------------------
    prev_swing_high = swing_high.shift(1)
    prev_swing_low = swing_low.shift(1)
    df["f_smc_bos_bull"] = ((c > prev_swing_high) & (o < prev_swing_high)).astype(int)
    df["f_smc_bos_bear"] = ((c < prev_swing_low) & (o > prev_swing_low)).astype(int)
    df["f_smc_bos_bull_5"] = df["f_smc_bos_bull"].rolling(5).sum()
    df["f_smc_bos_bear_5"] = df["f_smc_bos_bear"].rolling(5).sum()

    # --- FVG columns: gap between this bar and the bar two back ----------------
    bull_fvg = (h.shift(2) < l).astype(int)
    bear_fvg = (l.shift(2) > h).astype(int)
    df["f_smc_fvg_bull"] = bull_fvg
    df["f_smc_fvg_bear"] = bear_fvg
    df["f_smc_fvg_bull_size"] = ((l - h.shift(2)) / c * 100).clip(lower=0)
    df["f_smc_fvg_bear_size"] = ((l.shift(2) - h) / c * 100).clip(lower=0)
    df["f_smc_fvg_bull_count_10"] = bull_fvg.rolling(10).sum()
    df["f_smc_fvg_bear_count_10"] = bear_fvg.rolling(10).sum()

    # --- sweep columns: wick pierces the previous swing level but the close
    # comes back inside ---------------------------------------------------------
    df["f_smc_sweep_bull"] = ((l < prev_swing_low) & (c > prev_swing_low)).astype(int)
    df["f_smc_sweep_bear"] = ((h > prev_swing_high) & (c < prev_swing_high)).astype(int)

    # --- confluence: one point per signal seen within its look-back window -----
    bull_score = (
        df["f_smc_bias_bull"]
        + df["f_smc_displacement_bull"].rolling(5).max().fillna(0)
        + df["f_smc_bos_bull"].rolling(5).max().fillna(0)
        + df["f_smc_fvg_bull"].rolling(3).max().fillna(0)
        + df["f_smc_sweep_bull"].rolling(10).max().fillna(0)
    )
    df["f_smc_confluence_bull"] = bull_score
    bear_score = (
        df["f_smc_bias_bear"]
        + df["f_smc_displacement_bear"].rolling(5).max().fillna(0)
        + df["f_smc_bos_bear"].rolling(5).max().fillna(0)
        + df["f_smc_fvg_bear"].rolling(3).max().fillna(0)
        + df["f_smc_sweep_bear"].rolling(10).max().fillna(0)
    )
    df["f_smc_confluence_bear"] = bear_score
    df["f_smc_confluence_net"] = bull_score - bear_score
    df["f_smc_full_setup_bull"] = (bull_score >= 5).astype(int)
    df["f_smc_full_setup_bear"] = (bear_score >= 5).astype(int)

    logger.info(" G. Smart Money Concepts generado")
    return df


# ═══════════════════════════════════════════════════════════
# MAIN ORCHESTRATOR
# ═══════════════════════════════════════════════════════════

def generate_features(symbol: str, timeframe: str = "4h", btc_symbol: str = "BTCUSDT") -> pd.DataFrame:
    """Load the candles for ``symbol`` and run every feature stage on them.

    Reads ``klines_<symbol>_<timeframe>.parquet`` from ``DATA_DIR`` plus the
    optional BTC, funding, long/short, open-interest, macro and fear-and-greed
    files, then applies the feature functions in sequence.

    Args:
        symbol: Trading pair whose candles are loaded.
        timeframe: Candle timeframe used in the file names.
        btc_symbol: Pair used as the BTC reference; its candles are only
            loaded when ``symbol`` differs from it.

    Returns:
        The enriched frame with the first 210 warm-up rows removed, or an
        empty ``DataFrame`` when the candle file does not exist.
    """
    logger.info("🔧 Generando features para %s %s...", symbol, timeframe)

    klines_path = os.path.join(DATA_DIR, f"klines_{symbol}_{timeframe}.parquet")
    if not os.path.exists(klines_path):
        logger.error("❌ No se encontró %s", klines_path)
        return pd.DataFrame()

    df = pd.read_parquet(klines_path)
    logger.info(" 📊 %d velas cargadas", len(df))

    df_btc = None
    if symbol != btc_symbol:
        btc_path = os.path.join(DATA_DIR, f"klines_{btc_symbol}_{timeframe}.parquet")
        if os.path.exists(btc_path):
            df_btc = pd.read_parquet(btc_path)

    df_funding = _load_parquet(f"funding_{symbol}.parquet")
    df_ls = _load_parquet(f"longshort_{symbol}.parquet")
    df_oi = _load_parquet(f"oi_{symbol}.parquet")
    df_macro = _load_parquet("macro.parquet")
    df_fg = _load_parquet("fear_greed.parquet")

    df = features_microestructura(df)
    df = features_momentum(df)
    df = features_volumen(df)
    df = features_cross_asset(df, df_btc=df_btc, df_macro=df_macro)
    df = features_onchain(df, df_funding=df_funding, df_ls=df_ls, df_oi=df_oi)
    df = features_sentimiento(df, df_fg=df_fg)
    df = features_smc(df)

    # Divisions above can yield +/-inf; store them as missing values instead.
    feature_cols = [col for col in df.columns if col.startswith("f_")]
    df[feature_cols] = df[feature_cols].replace([np.inf, -np.inf], np.nan)

    # Drop the leading rows; the longest look-back used above is the 200-bar EMA.
    warmup = 210
    df = df.iloc[warmup:]

    n_features = len(feature_cols)
    nan_pct = df[feature_cols].isna().mean().mean() * 100
    logger.info(" 📊 %d velas × %d features (NaN: %.1f%%)", len(df), n_features, nan_pct)
    return df


def _load_parquet(filename: str) -> pd.DataFrame:
    """Read ``filename`` from ``DATA_DIR``; return an empty frame if it is missing."""
    path = os.path.join(DATA_DIR, filename)
    if os.path.exists(path):
        return pd.read_parquet(path)
    return pd.DataFrame()


def get_feature_columns(df: pd.DataFrame) -> list[str]:
    """Return the sorted names of all columns starting with ``f_``."""
    return sorted(col for col in df.columns if col.startswith("f_"))


def main():
    """Command-line entry point: build and save features for one or all symbols."""
    parser = argparse.ArgumentParser(description="Aurora Brain — Feature Engine")
    parser.add_argument("--symbol", default="BTCUSDT")
    parser.add_argument("--timeframe", default="4h")
    parser.add_argument("--all", action="store_true")
    args = parser.parse_args()

    symbols = [
        "BTCUSDT", "ETHUSDT", "SOLUSDT", "LINKUSDT",
        "TAOUSDT", "WLDUSDT", "VIRTUALUSDT", "FETUSDT",
        "INJUSDT", "GRTUSDT", "KITEUSDT", "THETAUSDT",
        "KAITOUSDT", "SENTUSDT", "LPTUSDT", "AWEUSDT",
        "TURBOUSDT", "SAHARAUSDT", "VANAUSDT", "NMRUSDT",
        "OPENUSDT", "ROBOUSDT", "HOLOUSDT", "RLCUSDT",
        "IOUSDT", "PHAUSDT", "IQUSDT", "AIXBTUSDT",
        "SAPIENUSDT", "FLUXUSDT", "ALLOUSDT", "MIRAUSDT",
    ] if args.all else [args.symbol]

    for symbol in symbols:
        df = generate_features(symbol, args.timeframe)
        if not df.empty:
            path = os.path.join(DATA_DIR, f"features_{symbol}_{args.timeframe}.parquet")
            df.to_parquet(path)
            logger.info("💾 Guardado: %s", path)


if __name__ == "__main__":
    main()