Spaces:
Sleeping
Sleeping
| """ | |
| OHLCV feature extraction for ARB-MAX. 329 features, hardcoded order. | |
| extract(window_frame, at_tick=120) -> np.ndarray shape (329,) | |
| Uses rolling statistics over ticks [0, at_tick]. Robust to NaN: anything | |
| non-finite is replaced with 0.0 before return. | |
| """ | |
| from __future__ import annotations | |
| from typing import List | |
| import numpy as np | |
| import pandas as pd | |
# ---------------------------------------------------------------------------
# Horizon lists
# ---------------------------------------------------------------------------
# Each entry is a look-back length counted in rows of the input frame; the
# feature names render it with an "s" suffix (e.g. ``ret_60s``).
# Lists that share the same values are built as independent copies so that
# editing one at runtime never affects another.
_RETURN_HORIZONS = [1, 5, 15, 30, 60, 180]
_STD_HORIZONS = list(_RETURN_HORIZONS)
_SHARPE_HORIZONS = list(_RETURN_HORIZONS)
_MOMENT_HORIZONS = [60, 180]
_RSI_HORIZONS = [30, 60, 180]
_BOLL_HORIZONS = list(_RSI_HORIZONS)
_VOL_HORIZONS = list(_RSI_HORIZONS)
def _build_feature_names() -> List[str]:
    """Assemble the ordered list of feature names (329 entries).

    The position of each name is the index of the matching value in the
    vector produced by ``extract``, so names must only ever be appended in
    the same order in which ``extract`` emits its values.  The running totals
    in the comments track how many names have been added so far.
    """
    names: List[str] = []

    # ---- Core block -------------------------------------------------------
    # returns (6), rolling std (6), rolling sharpe (6)
    names += [f"ret_{h}s" for h in _RETURN_HORIZONS]
    names += [f"retstd_{h}s" for h in _STD_HORIZONS]
    names += [f"sharpe_{h}s" for h in _SHARPE_HORIZONS]
    # higher moments (4): skew & kurt at 60s/180s
    for h in _MOMENT_HORIZONS:
        names += [f"retskew_{h}s", f"retkurt_{h}s"]
    # VWMP & VW return (2)
    names += ["vwmp", "vw_ret"]
    # MACD (3)
    names += ["macd", "macd_signal", "macd_hist"]
    # RSI at 3 horizons (3)
    names += [f"rsi_{h}s" for h in _RSI_HORIZONS]
    # Bollinger position / width / z-score at 3 horizons (3 * 3 = 9)
    for h in _BOLL_HORIZONS:
        names += [f"boll_pos_{h}s", f"boll_width_{h}s", f"boll_z_{h}s"]
    # Cumulative return since t=0 (1) and price vs window open (1)
    names += ["cum_ret_since_0", "px_over_open"]
    # Trade intensity, volume accel, taker-buy ratio (3 * 3 = 9)
    for h in _VOL_HORIZONS:
        names += [
            f"trade_rate_{h}s",
            f"vol_accel_{h}s",
            f"taker_buy_ratio_{h}s",
        ]
    # running total: 6+6+6+4+2+3+3+9+1+1+9 = 50

    # ---- Derived / finer features, padding the vector up to 329 -----------
    # Rolling min / max / mean of returns (3 * 6 = 18) -> 68
    for h in _RETURN_HORIZONS:
        names += [f"retmin_{h}s", f"retmax_{h}s", f"retmean_{h}s"]
    # Rolling min / max / mean of close prices (3 * 6 = 18) -> 86
    for h in _RETURN_HORIZONS:
        names += [f"pxmin_{h}s", f"pxmax_{h}s", f"pxmean_{h}s"]
    # Rolling range / ATR-like (2 * 6 = 12) -> 98
    for h in _RETURN_HORIZONS:
        names += [f"range_{h}s", f"atr_{h}s"]
    # Log-return statistics (4 * 6 = 24) -> 122
    for h in _RETURN_HORIZONS:
        names += [
            f"logret_mean_{h}s",
            f"logret_std_{h}s",
            f"logret_abs_sum_{h}s",
            f"logret_sign_sum_{h}s",
        ]
    # Up / down move counts (2 * 6 = 12) -> 134
    for h in _RETURN_HORIZONS:
        names += [f"up_count_{h}s", f"dn_count_{h}s"]
    # Consecutive-direction run lengths (2) -> 136
    names += ["run_length_up", "run_length_dn"]
    # Volume stats (4 * 6 = 24) -> 160
    for h in _RETURN_HORIZONS:
        names += [
            f"vol_mean_{h}s",
            f"vol_std_{h}s",
            f"vol_max_{h}s",
            f"vol_sum_{h}s",
        ]
    # Quote volume stats (3 * 6 = 18) -> 178
    for h in _RETURN_HORIZONS:
        names += [f"qvol_mean_{h}s", f"qvol_sum_{h}s", f"qvol_std_{h}s"]
    # Trade count stats (3 * 6 = 18) -> 196
    for h in _RETURN_HORIZONS:
        names += [f"trades_mean_{h}s", f"trades_sum_{h}s", f"trades_std_{h}s"]
    # Taker buy stats (3 * 6 = 18) -> 214
    for h in _RETURN_HORIZONS:
        names += [
            f"tbbase_sum_{h}s",
            f"tbquote_sum_{h}s",
            f"tb_imbalance_{h}s",
        ]
    # Price-level quantiles at longer horizons (5 * 3 = 15) -> 229
    for h in [30, 60, 180]:
        names += [
            f"px_q05_{h}s",
            f"px_q25_{h}s",
            f"px_q50_{h}s",
            f"px_q75_{h}s",
            f"px_q95_{h}s",
        ]
    # Return autocorrelation, lags 1/5/15, horizons 60/180 (2 * 3 = 6) -> 235
    for h in [60, 180]:
        names += [f"autocorr1_{h}s", f"autocorr5_{h}s", f"autocorr15_{h}s"]
    # Rolling z-scores of volume & trades at 60/180 (4) -> 239
    for h in [60, 180]:
        names += [f"vol_z_{h}s", f"trades_z_{h}s"]
    # Momentum crossovers: EMA fast/slow ratio for 4 pairs (4) -> 243
    names += [
        "ema5_over_ema30",
        "ema15_over_ema60",
        "ema30_over_ema180",
        "ema60_over_ema180",
    ]
    # Acceleration (2nd difference of return) at 4 horizons (4) -> 247
    names += [f"accel_{h}s" for h in [5, 15, 60, 180]]
    # Hour-of-day and minute-of-hour are part of state features — skipped here.
    # Tick position (2) -> 249
    names += ["at_tick", "at_tick_over_900"]
    # High-low-close patterns (3 * 6 = 18) -> 267
    for h in _RETURN_HORIZONS:
        names += [
            f"high_over_close_{h}s",
            f"low_over_close_{h}s",
            f"hl_range_over_close_{h}s",
        ]
    # Skew / kurt of ABS returns at 60/180 (4) -> 271
    for h in [60, 180]:
        names += [f"absret_skew_{h}s", f"absret_kurt_{h}s"]
    # Correlation of volume & |ret| at 60/180 (2) -> 273
    names += [f"corr_vol_absret_{h}s" for h in [60, 180]]
    # Ticks since last up / down move (2) -> 275
    names += ["ticks_since_up", "ticks_since_dn"]
    # Maximum drawdown of close at 60/180 (2) -> 277
    names += [f"max_dd_{h}s" for h in [60, 180]]
    # Maximum run-up of close at 60/180 (2) -> 279
    names += [f"max_ru_{h}s" for h in [60, 180]]
    # Fraction of closes above VWMP at 60/180 (2) -> 281
    names += [f"frac_above_vwmp_{h}s" for h in [60, 180]]
    # ADX-ish trend strength proxies at 60/180 (2) -> 283
    names += [f"trend_strength_{h}s" for h in [60, 180]]
    # Log-return cumulants over the full window so far (4) -> 287
    names += [
        "cum_logret_mean",
        "cum_logret_var",
        "cum_logret_skew",
        "cum_logret_kurt",
    ]
    # Per-horizon Sharpe of abs-returns (6) -> 293
    names += [f"absret_sharpe_{h}s" for h in _RETURN_HORIZONS]
    # Per-horizon Sortino proxy, downside std only (6) -> 299
    names += [f"sortino_{h}s" for h in _RETURN_HORIZONS]
    # Rank of latest return within horizon (6) -> 305
    names += [f"ret_rank_{h}s" for h in _RETURN_HORIZONS]
    # Rank of latest volume within horizon (6) -> 311
    names += [f"vol_rank_{h}s" for h in _RETURN_HORIZONS]
    # Rank of latest price within horizon (6) -> 317
    names += [f"px_rank_{h}s" for h in _RETURN_HORIZONS]
    # Fraction of positive returns at all horizons (6) -> 323
    names += [f"pos_frac_{h}s" for h in _RETURN_HORIZONS]
    # Final six: coarse momentum / vol summary -> 329
    names += [
        "mom_60_180_ratio",
        "vol_60_180_ratio",
        "range_60_180_ratio",
        "trades_60_180_ratio",
        "vwmp_vs_close_pct",
        "realized_vol_full",
    ]
    return names
# Canonical feature order, built once when the module is imported.
FEATURE_NAMES: List[str] = _build_feature_names()
# Import-time guard: fail immediately if the name list drifts away from 329
# entries (note that ``assert`` statements are skipped under ``python -O``).
assert len(FEATURE_NAMES) == 329, f"expected 329, got {len(FEATURE_NAMES)}"
| # --------------------------------------------------------------------------- | |
| # Extraction helpers | |
| # --------------------------------------------------------------------------- | |
def _tail(arr: np.ndarray, n: int) -> np.ndarray:
    """Return the last ``n`` elements of ``arr``.

    When ``arr`` holds fewer than ``n`` elements the whole array is returned.
    A non-positive ``n`` yields an empty slice.  That case is sliced with
    ``arr[:0]`` on purpose: ``arr[-0:]`` is the same as ``arr[0:]`` and would
    hand back the entire array instead of nothing.
    """
    if n <= 0:
        return arr[:0]
    return arr[-n:] if len(arr) >= n else arr
def _safe_ret(series: np.ndarray, h: int) -> float:
    """Simple return of the last element versus the one ``h`` steps earlier.

    Falls back to 0.0 when ``series`` has no more than ``h`` elements, when
    either endpoint is non-finite, or when the earlier value is zero.
    """
    if h >= len(series):
        return 0.0
    start, end = series[-h - 1], series[-1]
    usable = np.isfinite(start) and start != 0 and np.isfinite(end)
    if not usable:
        return 0.0
    return float(end / start - 1.0)
def _safe_std(arr: np.ndarray) -> float:
    """NaN-ignoring population standard deviation of ``arr``.

    Returns 0.0 for fewer than two elements or when the result is not finite.
    """
    if len(arr) >= 2:
        spread = float(np.nanstd(arr))
        if np.isfinite(spread):
            return spread
    return 0.0
def _safe_mean(arr: np.ndarray) -> float:
    """NaN-ignoring mean of ``arr``.

    Returns 0.0 for an empty input or when the result is not finite.
    """
    if len(arr) != 0:
        avg = float(np.nanmean(arr))
        if np.isfinite(avg):
            return avg
    return 0.0
def _safe_skew(arr: np.ndarray) -> float:
    """Population skewness of the finite entries of ``arr``.

    Returns 0.0 when fewer than three finite values remain or when their
    standard deviation is zero.
    """
    finite = arr[np.isfinite(arr)]
    if len(finite) < 3:
        return 0.0
    center = finite.mean()
    sigma = finite.std()
    if sigma == 0:
        return 0.0
    third_moment = ((finite - center) ** 3).mean()
    return float(third_moment / (sigma ** 3))
def _safe_kurt(arr: np.ndarray) -> float:
    """Population excess kurtosis of the finite entries of ``arr``.

    Returns 0.0 when fewer than four finite values remain or when their
    standard deviation is zero.
    """
    finite = arr[np.isfinite(arr)]
    if len(finite) < 4:
        return 0.0
    center = finite.mean()
    sigma = finite.std()
    if sigma == 0:
        return 0.0
    fourth_moment = ((finite - center) ** 4).mean()
    return float(fourth_moment / (sigma ** 4) - 3.0)
def _ema(arr: np.ndarray, span: int) -> float:
    """Final value of an exponential moving average over ``arr``.

    Uses the smoothing factor ``alpha = 2 / (span + 1)``.  The average is
    seeded with the first *finite* element and every non-finite element is
    skipped.  Seeding with ``arr[0]`` unconditionally would let a leading
    NaN/inf propagate through the whole recursion, defeating the skip logic.

    Returns 0.0 for an empty input or when no element is finite.
    """
    if len(arr) == 0:
        return 0.0
    alpha = 2.0 / (span + 1.0)
    out = None
    for v in arr:
        if not np.isfinite(v):
            continue
        # First finite value seeds the average; later ones are blended in.
        out = v if out is None else alpha * v + (1.0 - alpha) * out
    return float(out) if out is not None else 0.0
def _rsi(prices: np.ndarray, n: int) -> float:
    """RSI-style oscillator over the last ``n`` price changes.

    Sums the positive and negative differences of the final ``n + 1`` prices
    and maps their ratio to the 0..100 scale.  Returns the neutral value 50.0
    when there are not more than ``n`` prices or when there is neither a gain
    nor a loss, and 100.0 when there are gains but no losses.
    """
    if len(prices) <= n:
        return 50.0
    deltas = np.diff(prices[-(n + 1):])
    up_total = deltas[deltas > 0].sum()
    down_total = -deltas[deltas < 0].sum()
    if down_total == 0:
        if up_total > 0:
            return 100.0
        return 50.0
    strength = up_total / down_total
    return float(100.0 - 100.0 / (1.0 + strength))
def _rank_of_last(arr: np.ndarray) -> float:
    """Fraction of finite entries that are <= the last finite entry.

    Non-finite values are dropped first, so "last" means the last finite
    value.  Returns 0.5 when no finite value is present.
    """
    finite = arr[np.isfinite(arr)]
    if len(finite) == 0:
        return 0.5
    reference = finite[-1]
    at_or_below = finite <= reference
    return float(at_or_below.mean())
def _autocorr(arr: np.ndarray, lag: int) -> float:
    """Pearson correlation between ``arr`` and itself shifted by ``lag``.

    Only positions where both the value and its lagged partner are finite
    are used.  Returns 0.0 when ``arr`` has at most ``lag + 1`` elements,
    when fewer than three usable pairs remain, or when either side of the
    pairing is constant.
    """
    if len(arr) <= lag + 1:
        return 0.0
    lead, trail = arr[:-lag], arr[lag:]
    usable = np.isfinite(lead) & np.isfinite(trail)
    if usable.sum() < 3:
        return 0.0
    lead, trail = lead[usable], trail[usable]
    if lead.std() == 0 or trail.std() == 0:
        return 0.0
    return float(np.corrcoef(lead, trail)[0, 1])
| # --------------------------------------------------------------------------- | |
| # Main extract | |
| # --------------------------------------------------------------------------- | |
def extract(window_frame: pd.DataFrame, at_tick: int = 120) -> np.ndarray:
    """Produce 329 features aligned with FEATURE_NAMES.

    Args:
        window_frame: Frame with one row per tick.  The columns ``close``,
            ``open``, ``high`` and ``low`` are read unconditionally.  The
            columns ``volume``, ``trades``, ``quote_volume``,
            ``taker_buy_base`` and ``taker_buy_quote`` are optional and are
            treated as all-zero when absent.  The frame is not modified.
        at_tick: Positional index of the last row to use; rows
            ``[0, at_tick]`` (inclusive) are taken via ``iloc``.

    Returns:
        ``float32`` array of shape ``(329,)``; every non-finite value is
        replaced with 0.0 before returning.

    Raises:
        KeyError: If one of the four required columns is missing.
        IndexError: If the sliced frame has no rows.
    """
    df = window_frame.iloc[: at_tick + 1]
    n = len(df)
    # copy=True: the gap-filling below writes into ``close`` in place.  Without
    # a private copy that write can land in the caller's frame, or fail on a
    # read-only array when pandas Copy-on-Write is active.
    close = df["close"].to_numpy(dtype=np.float64, copy=True)
    open_ = df["open"].to_numpy(dtype=np.float64)
    high = df["high"].to_numpy(dtype=np.float64)
    low = df["low"].to_numpy(dtype=np.float64)
    volume = df["volume"].to_numpy(dtype=np.float64) if "volume" in df.columns else np.zeros(n)
    trades = df["trades"].to_numpy(dtype=np.float64) if "trades" in df.columns else np.zeros(n)

    # Forward-fill close gaps for return math (in case first ticks are missing)
    if not np.all(np.isfinite(close)):
        last = np.nan
        for i, v in enumerate(close):
            if np.isfinite(v):
                last = v
            else:
                close[i] = last
        # if still nan at start, back-fill with the first finite value
        if np.isnan(close[0]):
            for v in close:
                if np.isfinite(v):
                    close[:] = np.where(np.isfinite(close), close, v)
                    break
    # Anything still non-finite (e.g. an all-NaN column) becomes 0.0
    close = np.nan_to_num(close, nan=0.0, posinf=0.0, neginf=0.0)

    # log returns
    with np.errstate(divide="ignore", invalid="ignore"):
        log_close = np.where(close > 0, np.log(close), 0.0)
    logrets = np.diff(log_close, prepend=log_close[0])  # first entry is 0

    out: List[float] = []

    # ret_{h}s (6)
    for h in _RETURN_HORIZONS:
        out.append(_safe_ret(close, h))

    # retstd_{h}s (6) — on log-returns window
    for h in _STD_HORIZONS:
        out.append(_safe_std(_tail(logrets, h)))

    # sharpe_{h}s (6)
    for h in _SHARPE_HORIZONS:
        w = _tail(logrets, h)
        m = _safe_mean(w)
        s = _safe_std(w)
        out.append(m / s if s > 0 else 0.0)

    # higher moments (4): skew, kurt at 60/180
    for h in _MOMENT_HORIZONS:
        w = _tail(logrets, h)
        out.append(_safe_skew(w))
        out.append(_safe_kurt(w))

    # VWMP & VW return (2)
    # The guard uses the same NaN-ignoring sum as the computation, so a single
    # NaN volume no longer disables the volume weighting altogether.
    vol_total = float(np.nansum(volume))
    if vol_total > 0:
        vwmp = float(np.nansum(close * volume) / max(vol_total, 1e-9))
    else:
        vwmp = _safe_mean(close)
    out.append(vwmp)
    out.append((close[-1] / vwmp - 1.0) if vwmp > 0 else 0.0)

    # MACD (3)
    ema12 = _ema(close, 12)
    ema26 = _ema(close, 26)
    macd = ema12 - ema26
    # signal = ema9 of macd history — approximated over the last 9 ticks.
    # Only those (at most 9) prefixes are evaluated; earlier ones would be
    # discarded anyway.  The range always contains k == n, so the series is
    # never empty.
    first_k = max(min(26, n), n - 8)
    macd_hist_series = []
    for k in range(first_k, n + 1):
        sub = close[:k]
        macd_hist_series.append(_ema(sub, 12) - _ema(sub, 26))
    macd_signal = _ema(np.array(macd_hist_series), 9)
    out.append(macd)
    out.append(macd_signal)
    out.append(macd - macd_signal)

    # RSI (3)
    for h in _RSI_HORIZONS:
        out.append(_rsi(close, h))

    # Bollinger position (9): pos, width, z-score at 30/60/180
    for h in _BOLL_HORIZONS:
        w = _tail(close, h)
        m = _safe_mean(w)
        s = _safe_std(w)
        mn = float(np.nanmin(w)) if len(w) else 0.0
        mx = float(np.nanmax(w)) if len(w) else 0.0
        denom = (mx - mn) if (mx - mn) != 0 else 1.0
        pos = (close[-1] - mn) / denom
        width = (mx - mn) / m if m != 0 else 0.0
        z = (close[-1] - m) / s if s > 0 else 0.0
        out.append(pos)
        out.append(width)
        out.append(z)

    # Cumulative return since t=0 (1)
    out.append((close[-1] / close[0] - 1.0) if close[0] > 0 else 0.0)

    # Price vs window_open ratio (1)
    wopen = open_[0] if np.isfinite(open_[0]) and open_[0] > 0 else close[0]
    out.append((close[-1] / wopen - 1.0) if wopen > 0 else 0.0)

    # Trade intensity + volume accel + taker buy ratio (9)
    for h in _VOL_HORIZONS:
        tw = _tail(trades, h)
        vw = _tail(volume, h)
        out.append(_safe_mean(tw))
        # accel = latest vol vs mean vol
        mv = _safe_mean(vw)
        out.append((volume[-1] - mv) if np.isfinite(volume[-1]) else 0.0)
        # taker_buy_ratio — stub 0.5 (not always populated on df at this tick)
        out.append(0.5)

    # -------- Padded features ---------
    # retmin/max/mean at 6 horizons (18)
    for h in _RETURN_HORIZONS:
        w = _tail(logrets, h)
        out.append(float(np.nanmin(w)) if len(w) else 0.0)
        out.append(float(np.nanmax(w)) if len(w) else 0.0)
        out.append(_safe_mean(w))

    # pxmin/max/mean at 6 horizons (18)
    for h in _RETURN_HORIZONS:
        w = _tail(close, h)
        out.append(float(np.nanmin(w)) if len(w) else 0.0)
        out.append(float(np.nanmax(w)) if len(w) else 0.0)
        out.append(_safe_mean(w))

    # range/atr at 6 horizons (12)
    for h in _RETURN_HORIZONS:
        hi = _tail(high, h)
        lo = _tail(low, h)
        rng = (float(np.nanmax(hi)) - float(np.nanmin(lo))) if len(hi) else 0.0
        atr = _safe_mean(hi - lo) if len(hi) == len(lo) else 0.0
        out.append(rng)
        out.append(atr)

    # log-ret stats (24)
    for h in _RETURN_HORIZONS:
        w = _tail(logrets, h)
        out.append(_safe_mean(w))
        out.append(_safe_std(w))
        out.append(float(np.nansum(np.abs(w))))
        out.append(float(np.nansum(np.sign(w))))

    # up/down counts (12)
    for h in _RETURN_HORIZONS:
        w = _tail(logrets, h)
        out.append(float((w > 0).sum()))
        out.append(float((w < 0).sum()))

    # run lengths (2): walk backwards from the latest tick and count how many
    # consecutive moves share its direction; a flat tick ends the run.
    sign = np.sign(logrets)
    up_run = 0
    dn_run = 0
    for v in sign[::-1]:
        if v > 0:
            if dn_run == 0:
                up_run += 1
            else:
                break
        elif v < 0:
            if up_run == 0:
                dn_run += 1
            else:
                break
        else:
            break
    out.append(float(up_run))
    out.append(float(dn_run))

    # volume stats (24)
    for h in _RETURN_HORIZONS:
        w = _tail(volume, h)
        out.append(_safe_mean(w))
        out.append(_safe_std(w))
        out.append(float(np.nanmax(w)) if len(w) else 0.0)
        out.append(float(np.nansum(w)))

    # quote volume stats (18) — if column present
    qv = (
        df["quote_volume"].to_numpy(dtype=np.float64)
        if "quote_volume" in df.columns
        else np.zeros(n)
    )
    for h in _RETURN_HORIZONS:
        w = _tail(qv, h)
        out.append(_safe_mean(w))
        out.append(float(np.nansum(w)))
        out.append(_safe_std(w))

    # trades stats (18)
    for h in _RETURN_HORIZONS:
        w = _tail(trades, h)
        out.append(_safe_mean(w))
        out.append(float(np.nansum(w)))
        out.append(_safe_std(w))

    # taker buy stats (18)
    tbb = (
        df["taker_buy_base"].to_numpy(dtype=np.float64)
        if "taker_buy_base" in df.columns
        else np.zeros(n)
    )
    tbq = (
        df["taker_buy_quote"].to_numpy(dtype=np.float64)
        if "taker_buy_quote" in df.columns
        else np.zeros(n)
    )
    for h in _RETURN_HORIZONS:
        wb = _tail(tbb, h)
        wq = _tail(tbq, h)
        wv = _tail(volume, h)
        out.append(float(np.nansum(wb)))
        out.append(float(np.nansum(wq)))
        denom = max(float(np.nansum(wv)), 1e-9)
        out.append(float(np.nansum(wb)) / denom)

    # price quantiles at 30/60/180 (15)
    for h in [30, 60, 180]:
        w = _tail(close, h)
        if len(w):
            qs = np.nanquantile(w, [0.05, 0.25, 0.5, 0.75, 0.95])
        else:
            qs = np.zeros(5)
        out.extend(float(x) for x in qs)

    # autocorr at 60/180 for lags 1/5/15 (6)
    for h in [60, 180]:
        w = _tail(logrets, h)
        out.append(_autocorr(w, 1))
        out.append(_autocorr(w, 5))
        out.append(_autocorr(w, 15))

    # vol & trades z at 60/180 (4)
    for h in [60, 180]:
        vw = _tail(volume, h)
        m, s = _safe_mean(vw), _safe_std(vw)
        out.append(((volume[-1] - m) / s) if s > 0 else 0.0)
        tw = _tail(trades, h)
        m, s = _safe_mean(tw), _safe_std(tw)
        out.append(((trades[-1] - m) / s) if s > 0 else 0.0)

    # EMA ratios (4)
    def _ema_ratio(a, b):
        e_a = _ema(close, a)
        e_b = _ema(close, b)
        return (e_a / e_b - 1.0) if e_b > 0 else 0.0

    out.append(_ema_ratio(5, 30))
    out.append(_ema_ratio(15, 60))
    out.append(_ema_ratio(30, 180))
    out.append(_ema_ratio(60, 180))

    # accel at 5/15/60/180 (4): current h-return minus the preceding h-return
    for h in [5, 15, 60, 180]:
        if n > 2 * h:
            r_now = _safe_ret(close, h)
            r_prev = (
                float(close[-h - 1] / close[-2 * h - 1] - 1.0)
                if close[-2 * h - 1] > 0
                else 0.0
            )
            out.append(r_now - r_prev)
        else:
            out.append(0.0)

    # tick positions (2)
    out.append(float(at_tick))
    out.append(float(at_tick) / 900.0)

    # HLC patterns at 6 horizons (18)
    for h in _RETURN_HORIZONS:
        hi = _tail(high, h)
        lo = _tail(low, h)
        cl = _tail(close, h)
        c = cl[-1] if len(cl) else 1.0
        if c == 0 or not np.isfinite(c):
            c = 1.0
        hmax = float(np.nanmax(hi)) if len(hi) else c
        lmin = float(np.nanmin(lo)) if len(lo) else c
        out.append(hmax / c if c else 0.0)
        out.append(lmin / c if c else 0.0)
        out.append((hmax - lmin) / c if c else 0.0)

    # absret skew/kurt at 60/180 (4)
    for h in [60, 180]:
        w = _tail(np.abs(logrets), h)
        out.append(_safe_skew(w))
        out.append(_safe_kurt(w))

    # corr(vol, |ret|) at 60/180 (2)
    for h in [60, 180]:
        w1 = _tail(volume, h)
        w2 = _tail(np.abs(logrets), h)
        if len(w1) > 3 and len(w2) > 3 and w1.std() > 0 and w2.std() > 0:
            out.append(float(np.corrcoef(w1, w2)[0, 1]))
        else:
            out.append(0.0)

    # ticks since last up / dn (2); n when no such move exists in the window
    up_idx = np.where(logrets > 0)[0]
    dn_idx = np.where(logrets < 0)[0]
    out.append(float(n - 1 - up_idx[-1]) if len(up_idx) else float(n))
    out.append(float(n - 1 - dn_idx[-1]) if len(dn_idx) else float(n))

    # max drawdown / runup at 60/180 (4)
    for h in [60, 180]:
        w = _tail(close, h)
        if len(w) < 2:
            out.append(0.0)
            continue
        roll_max = np.maximum.accumulate(w)
        out.append(float(np.nanmin(w / roll_max - 1.0)))
    for h in [60, 180]:
        w = _tail(close, h)
        if len(w) < 2:
            out.append(0.0)
            continue
        roll_min = np.minimum.accumulate(w)
        out.append(float(np.nanmax(w / roll_min - 1.0)))

    # fraction above vwmp at 60/180 (2)
    for h in [60, 180]:
        w = _tail(close, h)
        out.append(float((w > vwmp).mean()) if len(w) else 0.0)

    # trend strength proxy at 60/180 (2)
    for h in [60, 180]:
        w = _tail(close, h)
        if len(w) < 3:
            out.append(0.0)
            continue
        x = np.arange(len(w), dtype=np.float64)
        # slope via np.polyfit deg=1, rescaled by mean price; a failed fit is
        # deliberately mapped to a neutral 0.0 slope
        try:
            slope = float(np.polyfit(x, w, 1)[0])
        except Exception:
            slope = 0.0
        m = _safe_mean(w)
        out.append(slope / m if m != 0 else 0.0)

    # cumulative logret mean/var/skew/kurt over window so far (4)
    out.append(_safe_mean(logrets))
    out.append(float(np.nanvar(logrets)))
    out.append(_safe_skew(logrets))
    out.append(_safe_kurt(logrets))

    # absret sharpe at 6 horizons (6)
    for h in _RETURN_HORIZONS:
        w = _tail(np.abs(logrets), h)
        m = _safe_mean(w)
        s = _safe_std(w)
        out.append(m / s if s > 0 else 0.0)

    # sortino proxy at 6 horizons (6) — mean / downside_std
    for h in _RETURN_HORIZONS:
        w = _tail(logrets, h)
        m = _safe_mean(w)
        dn = w[w < 0]
        s = _safe_std(dn)
        out.append(m / s if s > 0 else 0.0)

    # ret rank at 6 horizons (6)
    for h in _RETURN_HORIZONS:
        out.append(_rank_of_last(_tail(logrets, h)))

    # vol rank at 6 horizons (6)
    for h in _RETURN_HORIZONS:
        out.append(_rank_of_last(_tail(volume, h)))

    # px rank at 6 horizons (6)
    for h in _RETURN_HORIZONS:
        out.append(_rank_of_last(_tail(close, h)))

    # pos frac at 6 horizons (6)
    for h in _RETURN_HORIZONS:
        w = _tail(logrets, h)
        out.append(float((w > 0).mean()) if len(w) else 0.0)

    # final 6 summary ratios
    r60 = _safe_ret(close, 60)
    r180 = _safe_ret(close, 180)
    v60 = _safe_mean(_tail(volume, 60))
    v180 = _safe_mean(_tail(volume, 180))
    rng60 = (
        (float(np.nanmax(_tail(high, 60))) - float(np.nanmin(_tail(low, 60))))
        if n >= 60
        else 0.0
    )
    rng180 = (
        (float(np.nanmax(_tail(high, 180))) - float(np.nanmin(_tail(low, 180))))
        if n >= 180
        else 0.0
    )
    t60 = _safe_mean(_tail(trades, 60))
    t180 = _safe_mean(_tail(trades, 180))
    out.append(r60 / r180 if r180 != 0 else 0.0)
    out.append(v60 / v180 if v180 != 0 else 0.0)
    out.append(rng60 / rng180 if rng180 != 0 else 0.0)
    out.append(t60 / t180 if t180 != 0 else 0.0)
    out.append((vwmp / close[-1] - 1.0) if close[-1] > 0 else 0.0)
    out.append(float(np.sqrt(np.nansum(logrets ** 2))))

    arr = np.asarray(out, dtype=np.float64)
    # Internal invariant: the value count must match the feature-name count.
    assert arr.shape[0] == 329, f"produced {arr.shape[0]} features, expected 329"
    # finalize — replace non-finite with 0
    arr = np.where(np.isfinite(arr), arr, 0.0).astype(np.float32)
    return arr
# Public API of this module; the underscore-prefixed helpers stay private.
__all__ = ["FEATURE_NAMES", "extract"]