# algorembrant's picture
# Upload 61 files
# 9cb5a00 verified
"""
ML-3m-trader Feature Engineering
=================================
Vectorized computation of all technical indicators using NumPy/Pandas.
No external TA library required.
"""
import numpy as np
import pandas as pd
import config as cfg
# ---------------------------------------------------------------------------
# Individual indicator functions (all operate on NumPy arrays or Series)
# ---------------------------------------------------------------------------
def sma(series: pd.Series, period: int) -> pd.Series:
    """
    Simple Moving Average over a trailing window of *period* rows.

    A value is only produced once the window holds *period* non-NaN
    observations; earlier positions are NaN.
    """
    full_window = series.rolling(window=period, min_periods=period)
    return full_window.mean()
def double_moving_average_signal(close: pd.Series) -> pd.Series:
    """
    Crossover state of the fast and slow simple moving averages.

    The two window lengths come from ``cfg.SMA_FAST_PERIOD`` and
    ``cfg.SMA_SLOW_PERIOD``. Values in the returned Series (indexed like
    *close*):

     1 = fast average above slow average (bullish)
    -1 = fast average below slow average (bearish)
     0 = averages equal, or not yet defined (insufficient data)
    """
    fast_ma = sma(close, cfg.SMA_FAST_PERIOD)
    slow_ma = sma(close, cfg.SMA_SLOW_PERIOD)

    bullish = fast_ma > slow_ma
    bearish = fast_ma < slow_ma
    # Comparisons against NaN are False, so warm-up rows fall through to 0.
    direction = np.where(bullish, 1, np.where(bearish, -1, 0))

    signal = pd.Series(direction, index=close.index)
    # Explicitly force the slow-average warm-up region to "undefined".
    warmup_rows = slow_ma.isna()
    signal[warmup_rows] = 0
    return signal
def vroc(volume: pd.Series, period: int = cfg.VROC_PERIOD) -> pd.Series:
    """
    Volume Rate of Change, in percent, versus the value *period* rows back.

    Rows whose reference volume is zero yield NaN instead of a division by
    zero; the first *period* rows are NaN because no reference exists.
    """
    reference = volume.shift(period)
    change = volume - reference
    # A zero reference volume has no meaningful percentage change.
    safe_reference = reference.replace(0, np.nan)
    return (change / safe_reference) * 100.0
def synthetic_vix(close: pd.Series, period: int = cfg.VIX_PROXY_PERIOD) -> pd.Series:
    """
    Synthetic VIX proxy: rolling standard deviation of bar-to-bar
    log-returns over *period* rows, annualized and expressed in percent.

    Annualization assumes 252 trading days of 6.5 hours each, split into
    bars of ``cfg.TIMEFRAME_MINUTES`` minutes.
    """
    # Bars in one 6.5-hour session (130 when TIMEFRAME_MINUTES is 3).
    bars_per_session = (6.5 * 60) / cfg.TIMEFRAME_MINUTES
    # Volatility scales with the square root of the number of bars per year.
    scale = np.sqrt(252 * bars_per_session)

    price_ratio = close / close.shift(1)
    log_returns = np.log(price_ratio)
    volatility = log_returns.rolling(window=period, min_periods=period).std()
    return volatility * scale * 100.0
def momentum_strength_index(close: pd.Series,
                            period: int = cfg.MOMENTUM_SI_PERIOD) -> pd.Series:
    """
    Momentum Strength Index (MSI): the share of bars with a positive
    close-to-close change over the last *period* bars, scaled 0-100.
    Similar in spirit to RSI but purely count-based rather than
    magnitude-based.

    Parameters
    ----------
    close : pd.Series
        Closing prices.
    period : int
        Number of bars in the counting window.

    Returns
    -------
    pd.Series
        MSI indexed like *close*. A value is NaN until the window contains
        *period* defined price changes.
    """
    delta = close.diff()
    up = (delta > 0).astype(float)
    # A bar whose change is undefined (the first row, or a NaN close) is not
    # a "down" bar. Mark it missing so the rolling window stays NaN instead
    # of silently counting it as non-positive momentum.
    up[delta.isna()] = np.nan
    msi = up.rolling(window=period, min_periods=period).sum() / period * 100.0
    return msi
def _wilder_smooth(values: pd.Series, period: int) -> pd.Series:
    """
    Wilder's smoothing (used by ADX).

    The running average is seeded with the simple mean of the first
    *period* non-NaN observations and then updated as
    ``avg = avg * (1 - 1/period) + x * (1/period)``.

    Parameters
    ----------
    values : pd.Series
        Input series; NaN entries are treated as missing observations.
    period : int
        Smoothing length (must be >= 1).

    Returns
    -------
    pd.Series
        Same index as *values*. Positions before the seed are NaN (all
        positions, if fewer than *period* observations exist). A NaN
        observation after the seed leaves the average unchanged, so a single
        undefined input cannot blank out the rest of the series.

    Raises
    ------
    ValueError
        If *period* is smaller than 1.
    """
    if period < 1:
        raise ValueError("period must be >= 1")

    # Work on a plain float array: per-element .iloc access on a Series is
    # far slower than NumPy indexing inside a Python loop.
    raw = values.to_numpy(dtype=float)
    size = raw.shape[0]
    smoothed = np.full(size, np.nan)

    observed = np.flatnonzero(~np.isnan(raw))
    if observed.size >= period:
        seed_at = int(observed[period - 1])
        # Seed with the MEAN (not the sum) so the seed is on the same scale
        # as the averaging recursion that follows.
        running = float(raw[observed[:period]].mean())
        smoothed[seed_at] = running

        alpha = 1.0 / period
        for pos in range(seed_at + 1, size):
            sample = raw[pos]
            if not np.isnan(sample):
                running = running * (1 - alpha) + sample * alpha
            smoothed[pos] = running

    return pd.Series(smoothed, index=values.index, name=values.name)
def adx(high: pd.Series, low: pd.Series, close: pd.Series,
        period: int = cfg.ADX_PERIOD) -> pd.Series:
    """
    Average Directional Index via Wilder's method.

    Steps: true range and +/- directional movement per bar, each smoothed
    with ``_wilder_smooth``; the directional indicators (+DI, -DI) are the
    smoothed movements as a percentage of the smoothed true range; DX is the
    percentage spread between the two; ADX is DX smoothed once more.

    Returns the ADX line (0-100 scale), indexed like the inputs.
    """
    close_before = close.shift(1)
    up_move = high - high.shift(1)
    down_move = low.shift(1) - low

    # True range: the widest of the bar's own range and the two gaps from
    # the previous close (row-wise max ignores the NaNs of the first bar).
    range_candidates = pd.concat(
        [
            high - low,
            (high - close_before).abs(),
            (low - close_before).abs(),
        ],
        axis=1,
    )
    true_range = range_candidates.max(axis=1)

    # Only the dominant direction counts on each bar, and never negatively.
    raw_plus = np.where(up_move > down_move, np.maximum(up_move, 0), 0)
    raw_minus = np.where(down_move > up_move, np.maximum(down_move, 0), 0)
    plus_dm = pd.Series(raw_plus, index=high.index, dtype=float)
    minus_dm = pd.Series(raw_minus, index=high.index, dtype=float)

    atr = _wilder_smooth(true_range, period)
    smoothed_plus = _wilder_smooth(plus_dm, period)
    smoothed_minus = _wilder_smooth(minus_dm, period)

    # A zero average true range would divide by zero; treat it as undefined.
    safe_atr = atr.replace(0, np.nan)
    plus_di = 100.0 * smoothed_plus / safe_atr
    minus_di = 100.0 * smoothed_minus / safe_atr

    di_spread = (plus_di - minus_di).abs()
    di_total = (plus_di + minus_di).replace(0, np.nan)
    dx = 100.0 * di_spread / di_total

    return _wilder_smooth(dx, period)
def time_features(dt_series: pd.Series) -> pd.DataFrame:
    """
    Cyclical (sin/cos) encodings of the time of day, minute of the hour and
    day of the week for a datetime Series.

    The hour is fractional (hour + minute/60) on a 24-step cycle, the minute
    uses a 60-step cycle, and the day of week (0 = Monday) uses a 5-step
    cycle, so day values 5 and 6 land on the same angles as 0 and 1 (up to
    floating-point rounding).

    Returns a DataFrame indexed like *dt_series* with columns
    hour_sin, hour_cos, minute_sin, minute_cos, dow_sin, dow_cos.
    """
    clock = dt_series.dt
    minute_of_hour = clock.minute
    fractional_hour = clock.hour + minute_of_hour / 60.0
    weekday = clock.dayofweek  # 0=Monday

    full_turn = 2 * np.pi
    hour_angle = full_turn * fractional_hour / 24.0
    minute_angle = full_turn * minute_of_hour / 60.0
    weekday_angle = full_turn * weekday / 5.0

    encoded = {
        "hour_sin": np.sin(hour_angle),
        "hour_cos": np.cos(hour_angle),
        "minute_sin": np.sin(minute_angle),
        "minute_cos": np.cos(minute_angle),
        "dow_sin": np.sin(weekday_angle),
        "dow_cos": np.cos(weekday_angle),
    }
    return pd.DataFrame(encoded, index=dt_series.index)
# ---------------------------------------------------------------------------
# Master feature builder
# ---------------------------------------------------------------------------
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Compute all technical features from raw OHLCV data.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain columns: time, open, high, low, close, volume.
        The input frame itself is not modified.

    Returns
    -------
    pd.DataFrame
        Original columns plus all computed features, with a fresh 0..n-1
        index. Every row that contains a NaN in any column (such as the
        indicator warm-up rows) is dropped. A one-line summary is printed.
    """
    out = df.copy()
    close = out["close"]

    # Indicator columns, listed in the order they are appended to the frame.
    indicator_columns = {
        # Price-based
        "sma_fast": sma(close, cfg.SMA_FAST_PERIOD),
        "sma_slow": sma(close, cfg.SMA_SLOW_PERIOD),
        "dma_signal": double_moving_average_signal(close),
        # Volume
        "vroc": vroc(out["volume"], cfg.VROC_PERIOD),
        # Volatility
        "vix_proxy": synthetic_vix(close, cfg.VIX_PROXY_PERIOD),
        # Momentum
        "msi": momentum_strength_index(close, cfg.MOMENTUM_SI_PERIOD),
        # Trend
        "adx": adx(out["high"], out["low"], close, cfg.ADX_PERIOD),
    }
    for column, column_values in indicator_columns.items():
        out[column] = column_values

    # Time: parse the column first if it is not already datetime-typed.
    time_is_datetime = pd.api.types.is_datetime64_any_dtype(out["time"])
    if not time_is_datetime:
        out["time"] = pd.to_datetime(out["time"])
    out = pd.concat([out, time_features(out["time"])], axis=1)

    # Drop rows with NaN from indicator warm-up and renumber the rest.
    out = out.dropna().reset_index(drop=True)

    print(f"[INFO] Features built: {out.shape[1]} columns, {len(out):,} rows "
          f"(dropped {len(df) - len(out):,} warm-up rows)")
    return out
def get_feature_columns() -> list:
    """
    Return the ordered list of column names fed to the model: raw OHLCV
    first, then the indicator columns, then the cyclical time encodings.
    """
    raw_columns = ["open", "high", "low", "close", "volume"]
    moving_average_columns = ["sma_fast", "sma_slow", "dma_signal"]
    indicator_columns = ["vroc", "vix_proxy", "msi", "adx"]
    time_columns = [
        "hour_sin", "hour_cos",
        "minute_sin", "minute_cos",
        "dow_sin", "dow_cos",
    ]
    return [
        *raw_columns,
        *moving_average_columns,
        *indicator_columns,
        *time_columns,
    ]