"""
ML-3m-trader Feature Engineering
=================================
Vectorized computation of all technical indicators using NumPy/Pandas.
No external TA library required.
"""

import numpy as np
import pandas as pd

import config as cfg


# ---------------------------------------------------------------------------
# Individual indicator functions (all operate on NumPy arrays or Series)
# ---------------------------------------------------------------------------

def sma(series: pd.Series, period: int) -> pd.Series:
    """Simple Moving Average; NaN until a full *period*-bar window exists."""
    return series.rolling(window=period, min_periods=period).mean()


def double_moving_average_signal(close: pd.Series) -> pd.Series:
    """
    Double Moving Average crossover signal.

     1 = fast above slow (bullish)
    -1 = fast below slow (bearish)
     0 = undefined (insufficient data) or fast == slow

    The fast/slow windows come from cfg.SMA_FAST_PERIOD / cfg.SMA_SLOW_PERIOD.
    """
    fast = sma(close, cfg.SMA_FAST_PERIOD)
    slow = sma(close, cfg.SMA_SLOW_PERIOD)
    # Comparisons against NaN are False, so warm-up bars fall through to 0.
    signal = pd.Series(
        np.where(fast > slow, 1, np.where(fast < slow, -1, 0)),
        index=close.index,
    )
    # Kept explicit so the "undefined -> 0" contract is obvious to readers.
    signal[slow.isna()] = 0
    return signal


def vroc(volume: pd.Series, period: int = cfg.VROC_PERIOD) -> pd.Series:
    """
    Volume Rate of Change (percentage) versus the bar *period* steps back.

    A zero reference volume yields NaN rather than +/-inf.
    """
    prev = volume.shift(period)
    return ((volume - prev) / prev.replace(0, np.nan)) * 100.0


def synthetic_vix(close: pd.Series, period: int = cfg.VIX_PROXY_PERIOD) -> pd.Series:
    """
    Synthetic VIX proxy: annualized rolling standard deviation of
    log-returns, expressed as a percentage.
    """
    log_ret = np.log(close / close.shift(1))
    # Annualize with sqrt(bars_per_year), where
    # bars_per_year = 252 trading days * bars in a 6.5-hour session.
    bars_per_day = (6.5 * 60) / cfg.TIMEFRAME_MINUTES  # 130 for 3-minute bars
    annual_factor = np.sqrt(252 * bars_per_day)
    rolling_std = log_ret.rolling(window=period, min_periods=period).std()
    return rolling_std * annual_factor * 100.0


def momentum_strength_index(close: pd.Series,
                            period: int = cfg.MOMENTUM_SI_PERIOD) -> pd.Series:
    """
    Momentum Strength Index (MSI): measures the ratio of positive-momentum
    bars to total bars over *period*, scaled 0-100.

    Similar concept to RSI but purely count-based rather than
    magnitude-based.

    Bars whose price change is undefined (the first bar, or bars adjacent to
    a NaN close) stay NaN instead of being counted as "not up", so a value is
    produced only once *period* real price changes are available.
    """
    delta = close.diff()
    up = (delta > 0).astype(float).where(delta.notna())
    return up.rolling(window=period, min_periods=period).sum() / period * 100.0


def _wilder_smooth(values: pd.Series, period: int) -> pd.Series:
    """
    Wilder's exponential smoothing (used by ADX).

    The output is seeded with the mean of the first *period* observations,
    counted from the first non-NaN input, and then updated with
    ``prev + (x - prev) / period``.

    Parameters
    ----------
    values : pd.Series
        Input series; leading NaNs are treated as warm-up.
    period : int
        Smoothing length (must be >= 1).

    Returns
    -------
    pd.Series
        Same index and name as *values*. Entries before the seed position
        are NaN; the whole result is NaN when there are fewer than *period*
        bars from the first valid observation onward.

    Raises
    ------
    ValueError
        If *period* is smaller than 1.
    """
    if period < 1:
        raise ValueError(f"period must be >= 1, got {period}")

    raw = values.to_numpy(dtype=float)
    smoothed = np.full(raw.shape, np.nan)

    valid_positions = np.flatnonzero(~np.isnan(raw))
    if valid_positions.size == 0:
        return pd.Series(smoothed, index=values.index, name=values.name)

    first_valid = int(valid_positions[0])
    seed_pos = first_valid + period - 1
    if seed_pos >= raw.size:
        # Not enough data to form the seed window.
        return pd.Series(smoothed, index=values.index, name=values.name)

    # Seed with the MEAN so it is on the same scale as the averaging
    # recurrence below (a sum seed would be `period` times too large).
    prev = float(np.nanmean(raw[first_valid:seed_pos + 1]))
    smoothed[seed_pos] = prev

    alpha = 1.0 / period
    # Plain NumPy loop: the recurrence is inherently sequential, but avoiding
    # per-element Series.iloc access keeps it fast on long intraday series.
    for i in range(seed_pos + 1, raw.size):
        x = raw[i]
        if not np.isnan(x):
            prev += alpha * (x - prev)
        # On a NaN input the previous value is carried forward so a single
        # gap does not turn every later output into NaN.
        smoothed[i] = prev

    return pd.Series(smoothed, index=values.index, name=values.name)


def adx(high: pd.Series, low: pd.Series, close: pd.Series,
        period: int = cfg.ADX_PERIOD) -> pd.Series:
    """
    Average Directional Index via Wilder's method.

    Returns the ADX line (0-100 scale). The first ``2 * period - 2`` bars are
    NaN: ``period - 1`` to seed ATR/+DM/-DM and another ``period - 1`` to
    seed the smoothing of DX.
    """
    prev_high = high.shift(1)
    prev_low = low.shift(1)
    prev_close = close.shift(1)

    # True range: the largest of the three candidate ranges. max(axis=1)
    # skips NaN, so the first bar falls back to high - low.
    tr = pd.concat(
        [high - low, (high - prev_close).abs(), (low - prev_close).abs()],
        axis=1,
    ).max(axis=1)

    up_move = high - prev_high
    down_move = prev_low - low
    # Only the dominant (and positive) move counts as directional movement.
    plus_dm = pd.Series(
        np.where(up_move > down_move, np.maximum(up_move, 0), 0),
        index=high.index, dtype=float,
    )
    minus_dm = pd.Series(
        np.where(down_move > up_move, np.maximum(down_move, 0), 0),
        index=high.index, dtype=float,
    )

    atr = _wilder_smooth(tr, period)
    smooth_plus = _wilder_smooth(plus_dm, period)
    smooth_minus = _wilder_smooth(minus_dm, period)

    safe_atr = atr.replace(0, np.nan)
    plus_di = 100.0 * smooth_plus / safe_atr
    minus_di = 100.0 * smooth_minus / safe_atr

    di_sum = plus_di + minus_di
    dx = 100.0 * (plus_di - minus_di).abs() / di_sum.replace(0, np.nan)
    # No directional movement at all means no trend: DX is 0, not undefined.
    dx = dx.mask(di_sum == 0, 0.0)

    return _wilder_smooth(dx, period)


def time_features(dt_series: pd.Series) -> pd.DataFrame:
    """
    Extract cyclical time features from a datetime Series.

    Uses sin/cos encoding for hour (fractional, 24-hour cycle), minute
    (60-minute cycle) and day-of-week.
    """
    hour = dt_series.dt.hour + dt_series.dt.minute / 60.0
    dow = dt_series.dt.dayofweek  # 0=Monday
    minute = dt_series.dt.minute
    # NOTE(review): the day-of-week cycle length is 5, which presumably
    # assumes Monday-Friday data; Saturday/Sunday bars would encode the same
    # as Monday/Tuesday -- confirm against the data source.
    return pd.DataFrame({
        "hour_sin": np.sin(2 * np.pi * hour / 24.0),
        "hour_cos": np.cos(2 * np.pi * hour / 24.0),
        "minute_sin": np.sin(2 * np.pi * minute / 60.0),
        "minute_cos": np.cos(2 * np.pi * minute / 60.0),
        "dow_sin": np.sin(2 * np.pi * dow / 5.0),
        "dow_cos": np.cos(2 * np.pi * dow / 5.0),
    }, index=dt_series.index)


# ---------------------------------------------------------------------------
# Master feature builder
# ---------------------------------------------------------------------------

def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute all technical features from raw OHLCV data.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain columns: time, open, high, low, close, volume.
        (``open`` is not used by any indicator here, but it is listed by
        ``get_feature_columns``.) The input frame is not modified.

    Returns
    -------
    pd.DataFrame
        Original columns plus all computed features. Rows containing NaN in
        any column (indicator warm-up, undefined ratios) are dropped and the
        index is reset.

    Raises
    ------
    KeyError
        If a column needed by the indicators is missing.
    """
    required = ("time", "high", "low", "close", "volume")
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"build_features: missing required columns {missing}")

    out = df.copy()

    # Price-based
    out["sma_fast"] = sma(out["close"], cfg.SMA_FAST_PERIOD)
    out["sma_slow"] = sma(out["close"], cfg.SMA_SLOW_PERIOD)
    out["dma_signal"] = double_moving_average_signal(out["close"])

    # Volume
    out["vroc"] = vroc(out["volume"], cfg.VROC_PERIOD)

    # Volatility
    out["vix_proxy"] = synthetic_vix(out["close"], cfg.VIX_PROXY_PERIOD)

    # Momentum
    out["msi"] = momentum_strength_index(out["close"], cfg.MOMENTUM_SI_PERIOD)

    # Trend
    out["adx"] = adx(out["high"], out["low"], out["close"], cfg.ADX_PERIOD)

    # Time
    if not pd.api.types.is_datetime64_any_dtype(out["time"]):
        out["time"] = pd.to_datetime(out["time"])
    time_feats = time_features(out["time"])
    out = pd.concat([out, time_feats], axis=1)

    # Drop rows with NaN from indicator warm-up
    out = out.dropna().reset_index(drop=True)

    print(f"[INFO] Features built: {out.shape[1]} columns, {len(out):,} rows "
          f"(dropped {len(df) - len(out):,} warm-up rows)")
    return out


def get_feature_columns() -> list:
    """Return the list of feature column names used for model input."""
    return [
        "open", "high", "low", "close", "volume",
        "sma_fast", "sma_slow", "dma_signal",
        "vroc",
        "vix_proxy",
        "msi",
        "adx",
        "hour_sin", "hour_cos",
        "minute_sin", "minute_cos",
        "dow_sin", "dow_cos",
    ]