"""
OHLCV feature extraction for ARB-MAX. 329 features, hardcoded order.

    extract(window_frame, at_tick=120) -> np.ndarray shape (329,)

Uses rolling statistics over ticks [0, at_tick].
Robust to NaN: anything non-finite is replaced with 0.0 before return.
"""
from __future__ import annotations

from typing import List

import numpy as np
import pandas as pd

# ---------------------------------------------------------------------------
# Horizon lists
# ---------------------------------------------------------------------------
_RETURN_HORIZONS = [1, 5, 15, 30, 60, 180]
_STD_HORIZONS = [1, 5, 15, 30, 60, 180]
_SHARPE_HORIZONS = [1, 5, 15, 30, 60, 180]
_MOMENT_HORIZONS = [60, 180]
_RSI_HORIZONS = [30, 60, 180]
_BOLL_HORIZONS = [30, 60, 180]
_VOL_HORIZONS = [30, 60, 180]


def _build_feature_names() -> List[str]:
    """Return the 329 feature names in the exact order ``extract`` emits them.

    The order here is the contract between this module and its consumers:
    ``extract`` appends values in the same sequence, so any change must be
    mirrored in both places.
    """
    names: List[str] = []

    # returns (6)
    for h in _RETURN_HORIZONS:
        names.append(f"ret_{h}s")

    # rolling std (6)
    for h in _STD_HORIZONS:
        names.append(f"retstd_{h}s")

    # rolling sharpe (6)
    for h in _SHARPE_HORIZONS:
        names.append(f"sharpe_{h}s")

    # higher moments (4) — skew & kurt at 60s/180s
    for h in _MOMENT_HORIZONS:
        names.append(f"retskew_{h}s")
        names.append(f"retkurt_{h}s")

    # VWMP & VW return (2)
    names.append("vwmp")
    names.append("vw_ret")

    # MACD (3)
    names.append("macd")
    names.append("macd_signal")
    names.append("macd_hist")

    # RSI at 3 horizons (3)
    for h in _RSI_HORIZONS:
        names.append(f"rsi_{h}s")

    # Bollinger position at 3 horizons (3*3 = 9)
    for h in _BOLL_HORIZONS:
        names.append(f"boll_pos_{h}s")
        names.append(f"boll_width_{h}s")
        names.append(f"boll_z_{h}s")

    # Cumulative return since t=0 (1)
    names.append("cum_ret_since_0")

    # Price vs window_open ratio (1)
    names.append("px_over_open")

    # Trade intensity & volume accel (3 * len(_VOL_HORIZONS) = 9)
    for h in _VOL_HORIZONS:
        names.append(f"trade_rate_{h}s")
        names.append(f"vol_accel_{h}s")
        names.append(f"taker_buy_ratio_{h}s")

    # --- So far: 6+6+6+4+2+3+3+9+1+1+9 = 50 ---
    # Now pad with derived / finer features up to 329.

    # Rolling min / max / mean of returns at all horizons (3 * 6 = 18)
    for h in _RETURN_HORIZONS:
        names.append(f"retmin_{h}s")
        names.append(f"retmax_{h}s")
        names.append(f"retmean_{h}s")
    # running total: 50 + 18 = 68

    # Rolling min / max / mean of close prices at all horizons (3 * 6 = 18)
    for h in _RETURN_HORIZONS:
        names.append(f"pxmin_{h}s")
        names.append(f"pxmax_{h}s")
        names.append(f"pxmean_{h}s")
    # total: 86

    # Rolling range / ATR-like at horizons (2 * 6 = 12)
    for h in _RETURN_HORIZONS:
        names.append(f"range_{h}s")
        names.append(f"atr_{h}s")
    # total: 98

    # Log return statistics (4 * 6 = 24)
    for h in _RETURN_HORIZONS:
        names.append(f"logret_mean_{h}s")
        names.append(f"logret_std_{h}s")
        names.append(f"logret_abs_sum_{h}s")
        names.append(f"logret_sign_sum_{h}s")
    # total: 122

    # Up / down move counts (2 * 6 = 12)
    for h in _RETURN_HORIZONS:
        names.append(f"up_count_{h}s")
        names.append(f"dn_count_{h}s")
    # total: 134

    # Consecutive-direction run lengths (2)
    names.append("run_length_up")
    names.append("run_length_dn")
    # total: 136

    # Volume stats (4 * 6 = 24)
    for h in _RETURN_HORIZONS:
        names.append(f"vol_mean_{h}s")
        names.append(f"vol_std_{h}s")
        names.append(f"vol_max_{h}s")
        names.append(f"vol_sum_{h}s")
    # total: 160

    # Quote volume stats (3 * 6 = 18)
    for h in _RETURN_HORIZONS:
        names.append(f"qvol_mean_{h}s")
        names.append(f"qvol_sum_{h}s")
        names.append(f"qvol_std_{h}s")
    # total: 178

    # Trade count stats (3 * 6 = 18)
    for h in _RETURN_HORIZONS:
        names.append(f"trades_mean_{h}s")
        names.append(f"trades_sum_{h}s")
        names.append(f"trades_std_{h}s")
    # total: 196

    # Taker buy stats (3 * 6 = 18)
    for h in _RETURN_HORIZONS:
        names.append(f"tbbase_sum_{h}s")
        names.append(f"tbquote_sum_{h}s")
        names.append(f"tb_imbalance_{h}s")
    # total: 214

    # Price-level quantile features at longer horizons (5 * 3 = 15)
    for h in [30, 60, 180]:
        names.append(f"px_q05_{h}s")
        names.append(f"px_q25_{h}s")
        names.append(f"px_q50_{h}s")
        names.append(f"px_q75_{h}s")
        names.append(f"px_q95_{h}s")
    # total: 229

    # Return autocorrelation at lags 1/5/15 for horizons 60/180 (2 * 3 = 6)
    for h in [60, 180]:
        names.append(f"autocorr1_{h}s")
        names.append(f"autocorr5_{h}s")
        names.append(f"autocorr15_{h}s")
    # total: 235

    # Rolling z-scores of volume & trades at 60/180 (4)
    for h in [60, 180]:
        names.append(f"vol_z_{h}s")
        names.append(f"trades_z_{h}s")
    # total: 239

    # Momentum crossovers (EMA fast/slow ratio) at 4 pairs (4)
    names.append("ema5_over_ema30")
    names.append("ema15_over_ema60")
    names.append("ema30_over_ema180")
    names.append("ema60_over_ema180")
    # total: 243

    # Acceleration features (2nd-diff of return) at 4 horizons (4)
    for h in [5, 15, 60, 180]:
        names.append(f"accel_{h}s")
    # total: 247

    # Hour-of-day and minute-of-hour are part of state features — skip here.
    # Tick position
    names.append("at_tick")
    names.append("at_tick_over_900")
    # total: 249

    # High-low-close patterns (HLC) at horizons (3 * 6 = 18)
    for h in _RETURN_HORIZONS:
        names.append(f"high_over_close_{h}s")
        names.append(f"low_over_close_{h}s")
        names.append(f"hl_range_over_close_{h}s")
    # total: 267

    # Rolling skew / kurt of ABS returns at 60/180 (4)
    for h in [60, 180]:
        names.append(f"absret_skew_{h}s")
        names.append(f"absret_kurt_{h}s")
    # total: 271

    # Rolling correlation of volume & |ret| at 60/180 (2)
    for h in [60, 180]:
        names.append(f"corr_vol_absret_{h}s")
    # total: 273

    # Ticks since last up / down move (2)
    names.append("ticks_since_up")
    names.append("ticks_since_dn")
    # total: 275

    # Rolling maximum drawdown of close at 60/180 (2)
    for h in [60, 180]:
        names.append(f"max_dd_{h}s")
    # total: 277

    # Rolling maximum run-up at 60/180 (2)
    for h in [60, 180]:
        names.append(f"max_ru_{h}s")
    # total: 279

    # Binary "above VWMP" count at 60/180 (2)
    for h in [60, 180]:
        names.append(f"frac_above_vwmp_{h}s")
    # total: 281

    # ADX-ish trend strength proxies at 60/180 (2)
    for h in [60, 180]:
        names.append(f"trend_strength_{h}s")
    # total: 283

    # Log-return cumulants (mean, var, skew, kurt) on full window so far (4)
    names.append("cum_logret_mean")
    names.append("cum_logret_var")
    names.append("cum_logret_skew")
    names.append("cum_logret_kurt")
    # total: 287

    # Per-horizon Sharpe of abs-returns (6)
    for h in _RETURN_HORIZONS:
        names.append(f"absret_sharpe_{h}s")
    # total: 293

    # Per-horizon Sortino proxy (downside std only) (6)
    for h in _RETURN_HORIZONS:
        names.append(f"sortino_{h}s")
    # total: 299

    # Rolling ranks of latest ret within horizon (6)
    for h in _RETURN_HORIZONS:
        names.append(f"ret_rank_{h}s")
    # total: 305

    # Rolling ranks of latest volume within horizon (6)
    for h in _RETURN_HORIZONS:
        names.append(f"vol_rank_{h}s")
    # total: 311

    # Rolling ranks of latest price within horizon (6)
    for h in _RETURN_HORIZONS:
        names.append(f"px_rank_{h}s")
    # total: 317

    # Rolling fraction of positive returns at all horizons (6)
    for h in _RETURN_HORIZONS:
        names.append(f"pos_frac_{h}s")
    # total: 323

    # Final six: coarse momentum / vol summary
    names.append("mom_60_180_ratio")
    names.append("vol_60_180_ratio")
    names.append("range_60_180_ratio")
    names.append("trades_60_180_ratio")
    names.append("vwmp_vs_close_pct")
    names.append("realized_vol_full")
    # total: 329

    return names


FEATURE_NAMES: List[str] = _build_feature_names()
assert len(FEATURE_NAMES) == 329, f"expected 329, got {len(FEATURE_NAMES)}"


# ---------------------------------------------------------------------------
# Extraction helpers
# ---------------------------------------------------------------------------
def _tail(arr: np.ndarray, n: int) -> np.ndarray:
    """Return the last ``n`` elements of ``arr`` (all of it if shorter).

    A non-positive ``n`` yields an empty slice. ``arr[-0:]`` would be the
    whole array, which is never what "last zero elements" means.
    """
    if n <= 0:
        return arr[:0]
    return arr[-n:] if len(arr) >= n else arr


def _safe_ret(series: np.ndarray, h: int) -> float:
    """Simple return of the last value versus the value ``h`` steps earlier.

    Returns 0.0 when the series is too short, or when the base value is
    zero / non-finite, or the last value is non-finite.
    """
    if len(series) <= h:
        return 0.0
    base = series[-h - 1]
    last = series[-1]
    if not np.isfinite(base) or base == 0 or not np.isfinite(last):
        return 0.0
    return float(last / base - 1.0)


def _safe_std(arr: np.ndarray) -> float:
    """NaN-ignoring population std; 0.0 for fewer than 2 samples or a non-finite result."""
    if len(arr) < 2:
        return 0.0
    v = float(np.nanstd(arr))
    return v if np.isfinite(v) else 0.0


def _safe_mean(arr: np.ndarray) -> float:
    """NaN-ignoring mean; 0.0 for empty input or a non-finite result."""
    if len(arr) == 0:
        return 0.0
    v = float(np.nanmean(arr))
    return v if np.isfinite(v) else 0.0


def _safe_skew(arr: np.ndarray) -> float:
    """Population skewness of the finite samples; 0.0 if fewer than 3 or zero variance."""
    arr = arr[np.isfinite(arr)]
    if len(arr) < 3:
        return 0.0
    m = arr.mean()
    s = arr.std()
    if s == 0:
        return 0.0
    return float(((arr - m) ** 3).mean() / (s ** 3))


def _safe_kurt(arr: np.ndarray) -> float:
    """Excess kurtosis of the finite samples; 0.0 if fewer than 4 or zero variance."""
    arr = arr[np.isfinite(arr)]
    if len(arr) < 4:
        return 0.0
    m = arr.mean()
    s = arr.std()
    if s == 0:
        return 0.0
    return float(((arr - m) ** 4).mean() / (s ** 4) - 3.0)


def _ema(arr: np.ndarray, span: int) -> float:
    """Final value of an exponential moving average with ``alpha = 2 / (span + 1)``.

    Non-finite samples are skipped. The average is seeded with the first
    finite sample: seeding with a non-finite ``arr[0]`` would poison every
    later update and defeat the skipping. Returns 0.0 when there is no
    finite sample at all (including empty input).
    """
    finite_idx = np.flatnonzero(np.isfinite(arr))
    if len(finite_idx) == 0:
        return 0.0
    alpha = 2.0 / (span + 1.0)
    start = int(finite_idx[0])
    out = arr[start]
    for v in arr[start + 1:]:
        if not np.isfinite(v):
            continue
        out = alpha * v + (1.0 - alpha) * out
    return float(out)


def _rsi(prices: np.ndarray, n: int) -> float:
    """RSI over the last ``n`` price differences using plain sums of gains and losses.

    Returns the neutral 50.0 when there are not more than ``n`` prices, or
    when there were neither gains nor losses; 100.0 when there were gains
    but no losses.
    """
    if len(prices) <= n:
        return 50.0
    diffs = np.diff(prices[-(n + 1):])
    gains = diffs[diffs > 0].sum()
    losses = -diffs[diffs < 0].sum()
    if losses == 0:
        return 100.0 if gains > 0 else 50.0
    rs = gains / losses
    return float(100.0 - 100.0 / (1.0 + rs))


def _rank_of_last(arr: np.ndarray) -> float:
    """Fraction of finite samples that are <= the last finite sample (0.5 if none)."""
    arr = arr[np.isfinite(arr)]
    if len(arr) == 0:
        return 0.5
    last = arr[-1]
    return float((arr <= last).mean())


def _autocorr(arr: np.ndarray, lag: int) -> float:
    """Pearson correlation between ``arr`` and itself shifted by ``lag``.

    Only pairs where both values are finite are used. Returns 0.0 when the
    input is too short, fewer than 3 valid pairs remain, or either side has
    zero variance.
    """
    if len(arr) <= lag + 1:
        return 0.0
    a = arr[:-lag]
    b = arr[lag:]
    mask = np.isfinite(a) & np.isfinite(b)
    if mask.sum() < 3:
        return 0.0
    a = a[mask]
    b = b[mask]
    sa, sb = a.std(), b.std()
    if sa == 0 or sb == 0:
        return 0.0
    return float(np.corrcoef(a, b)[0, 1])


# ---------------------------------------------------------------------------
# Main extract
# ---------------------------------------------------------------------------
def extract(window_frame: pd.DataFrame, at_tick: int = 120) -> np.ndarray:
    """Produce 329 features aligned with FEATURE_NAMES.

    Args:
        window_frame: Per-tick frame. Must contain ``open``, ``high``,
            ``low`` and ``close`` columns. ``volume``, ``trades``,
            ``quote_volume``, ``taker_buy_base`` and ``taker_buy_quote``
            are optional and treated as all-zero when absent. The frame
            is never modified.
        at_tick: Positional index of the last row to use; rows
            ``[0, at_tick]`` are included.

    Returns:
        A float32 array of shape ``(329,)`` with every non-finite value
        replaced by 0.0. If the selected window has no rows, an all-zero
        vector is returned.

    Raises:
        KeyError: If one of the required OHLC columns is missing.
    """
    df = window_frame.iloc[: at_tick + 1]
    n = len(df)
    # copy=True: close is forward-filled in place below, and to_numpy() may
    # otherwise hand back a view of the caller's data (or a read-only view
    # under pandas copy-on-write, which would make the fill raise).
    close = df["close"].to_numpy(dtype=np.float64, copy=True)
    open_ = df["open"].to_numpy(dtype=np.float64)
    high = df["high"].to_numpy(dtype=np.float64)
    low = df["low"].to_numpy(dtype=np.float64)
    volume = (
        df["volume"].to_numpy(dtype=np.float64)
        if "volume" in df.columns
        else np.zeros(n)
    )
    trades = (
        df["trades"].to_numpy(dtype=np.float64)
        if "trades" in df.columns
        else np.zeros(n)
    )

    # Nothing to compute on an empty window; every statistic below indexes
    # the last (or first) tick.
    if n == 0:
        return np.zeros(len(FEATURE_NAMES), dtype=np.float32)

    # Forward-fill close gaps for return math (in-case first ticks missing)
    if not np.all(np.isfinite(close)):
        last = np.nan
        for i, v in enumerate(close):
            if np.isfinite(v):
                last = v
            else:
                close[i] = last
        # if still nan at start, use the first finite
        if np.isnan(close[0]):
            for v in close:
                if np.isfinite(v):
                    close[:] = np.where(np.isfinite(close), close, v)
                    break
    close = np.nan_to_num(close, nan=0.0, posinf=0.0, neginf=0.0)

    # log returns
    with np.errstate(divide="ignore", invalid="ignore"):
        log_close = np.where(close > 0, np.log(close), 0.0)
    logrets = np.diff(log_close, prepend=log_close[0])  # first entry is 0

    out: List[float] = []

    # ret_{h}s (6)
    for h in _RETURN_HORIZONS:
        out.append(_safe_ret(close, h))

    # retstd_{h}s (6) — on log-returns window
    for h in _STD_HORIZONS:
        out.append(_safe_std(_tail(logrets, h)))

    # sharpe_{h}s (6)
    for h in _SHARPE_HORIZONS:
        w = _tail(logrets, h)
        m = _safe_mean(w)
        s = _safe_std(w)
        out.append(m / s if s > 0 else 0.0)

    # higher moments (4): skew, kurt at 60/180
    for h in _MOMENT_HORIZONS:
        w = _tail(logrets, h)
        out.append(_safe_skew(w))
        out.append(_safe_kurt(w))

    # VWMP & VW return (2)
    if volume.sum() > 0:
        vwmp = float(np.nansum(close * volume) / max(np.nansum(volume), 1e-9))
    else:
        vwmp = _safe_mean(close)
    out.append(vwmp)
    out.append((close[-1] / vwmp - 1.0) if vwmp > 0 else 0.0)

    # MACD (3)
    ema12 = _ema(close, 12)
    ema26 = _ema(close, 26)
    macd = ema12 - ema26
    # signal = ema9 of macd history — approximate over last 9 ticks.
    # The history starts at prefix length min(26, n); only its last nine
    # entries feed the signal, so only those prefixes are evaluated.
    first_k = max(min(26, n), n - 8)
    macd_tail = []
    for k in range(first_k, n + 1):
        sub = close[:k]
        macd_tail.append(_ema(sub, 12) - _ema(sub, 26))
    macd_signal = _ema(np.array(macd_tail), 9)
    out.append(macd)
    out.append(macd_signal)
    out.append(macd - macd_signal)

    # RSI (3)
    for h in _RSI_HORIZONS:
        out.append(_rsi(close, h))

    # Bollinger position (9): pos, width, z-score at 30/60/180
    for h in _BOLL_HORIZONS:
        w = _tail(close, h)
        m = _safe_mean(w)
        s = _safe_std(w)
        mn = float(np.nanmin(w)) if len(w) else 0.0
        mx = float(np.nanmax(w)) if len(w) else 0.0
        denom = (mx - mn) if (mx - mn) != 0 else 1.0
        pos = (close[-1] - mn) / denom
        width = (mx - mn) / m if m != 0 else 0.0
        z = (close[-1] - m) / s if s > 0 else 0.0
        out.append(pos)
        out.append(width)
        out.append(z)

    # Cumulative return since t=0 (1)
    out.append((close[-1] / close[0] - 1.0) if close[0] > 0 else 0.0)

    # Price vs window_open ratio (1)
    wopen = open_[0] if np.isfinite(open_[0]) and open_[0] > 0 else close[0]
    out.append((close[-1] / wopen - 1.0) if wopen > 0 else 0.0)

    # Trade intensity + volume accel + taker buy ratio (9)
    for h in _VOL_HORIZONS:
        tw = _tail(trades, h)
        vw = _tail(volume, h)
        out.append(_safe_mean(tw))
        # accel = latest vol vs mean vol
        mv = _safe_mean(vw)
        out.append((volume[-1] - mv) if np.isfinite(volume[-1]) else 0.0)
        # taker_buy_ratio — stub 0.5 (not always populated on df at this tick)
        out.append(0.5)

    # -------- Padded features ---------
    # retmin/max/mean at 6 horizons (18)
    for h in _RETURN_HORIZONS:
        w = _tail(logrets, h)
        out.append(float(np.nanmin(w)) if len(w) else 0.0)
        out.append(float(np.nanmax(w)) if len(w) else 0.0)
        out.append(_safe_mean(w))

    # pxmin/max/mean at 6 horizons (18)
    for h in _RETURN_HORIZONS:
        w = _tail(close, h)
        out.append(float(np.nanmin(w)) if len(w) else 0.0)
        out.append(float(np.nanmax(w)) if len(w) else 0.0)
        out.append(_safe_mean(w))

    # range/atr at 6 horizons (12)
    for h in _RETURN_HORIZONS:
        hi = _tail(high, h)
        lo = _tail(low, h)
        rng = (float(np.nanmax(hi)) - float(np.nanmin(lo))) if len(hi) else 0.0
        atr = _safe_mean(hi - lo) if len(hi) == len(lo) else 0.0
        out.append(rng)
        out.append(atr)

    # log-ret stats (24)
    for h in _RETURN_HORIZONS:
        w = _tail(logrets, h)
        out.append(_safe_mean(w))
        out.append(_safe_std(w))
        out.append(float(np.nansum(np.abs(w))))
        out.append(float(np.nansum(np.sign(w))))

    # up/down counts (12)
    for h in _RETURN_HORIZONS:
        w = _tail(logrets, h)
        out.append(float((w > 0).sum()))
        out.append(float((w < 0).sum()))

    # run lengths (2): length of the trailing streak of same-sign log returns;
    # a zero return or a sign flip ends the streak.
    sign = np.sign(logrets)
    up_run = 0
    dn_run = 0
    for v in sign[::-1]:
        if v > 0:
            if dn_run == 0:
                up_run += 1
            else:
                break
        elif v < 0:
            if up_run == 0:
                dn_run += 1
            else:
                break
        else:
            break
    out.append(float(up_run))
    out.append(float(dn_run))

    # volume stats (24)
    for h in _RETURN_HORIZONS:
        w = _tail(volume, h)
        out.append(_safe_mean(w))
        out.append(_safe_std(w))
        out.append(float(np.nanmax(w)) if len(w) else 0.0)
        out.append(float(np.nansum(w)))

    # quote volume stats (18) — if column present
    qv = (
        df["quote_volume"].to_numpy(dtype=np.float64)
        if "quote_volume" in df.columns
        else np.zeros(n)
    )
    for h in _RETURN_HORIZONS:
        w = _tail(qv, h)
        out.append(_safe_mean(w))
        out.append(float(np.nansum(w)))
        out.append(_safe_std(w))

    # trades stats (18)
    for h in _RETURN_HORIZONS:
        w = _tail(trades, h)
        out.append(_safe_mean(w))
        out.append(float(np.nansum(w)))
        out.append(_safe_std(w))

    # taker buy stats (18)
    tbb = (
        df["taker_buy_base"].to_numpy(dtype=np.float64)
        if "taker_buy_base" in df.columns
        else np.zeros(n)
    )
    tbq = (
        df["taker_buy_quote"].to_numpy(dtype=np.float64)
        if "taker_buy_quote" in df.columns
        else np.zeros(n)
    )
    for h in _RETURN_HORIZONS:
        wb = _tail(tbb, h)
        wq = _tail(tbq, h)
        wv = _tail(volume, h)
        out.append(float(np.nansum(wb)))
        out.append(float(np.nansum(wq)))
        denom = max(float(np.nansum(wv)), 1e-9)
        out.append(float(np.nansum(wb)) / denom)

    # price quantiles at 30/60/180 (15)
    for h in [30, 60, 180]:
        w = _tail(close, h)
        if len(w):
            qs = np.nanquantile(w, [0.05, 0.25, 0.5, 0.75, 0.95])
        else:
            qs = np.zeros(5)
        out.extend(float(x) for x in qs)

    # autocorr at 60/180 for lags 1/5/15 (6)
    for h in [60, 180]:
        w = _tail(logrets, h)
        out.append(_autocorr(w, 1))
        out.append(_autocorr(w, 5))
        out.append(_autocorr(w, 15))

    # vol & trades z at 60/180 (4)
    for h in [60, 180]:
        vw = _tail(volume, h)
        m, s = _safe_mean(vw), _safe_std(vw)
        out.append(((volume[-1] - m) / s) if s > 0 else 0.0)
        tw = _tail(trades, h)
        m, s = _safe_mean(tw), _safe_std(tw)
        out.append(((trades[-1] - m) / s) if s > 0 else 0.0)

    # EMA ratios (4)
    def _ema_ratio(a, b):
        e_a = _ema(close, a)
        e_b = _ema(close, b)
        return (e_a / e_b - 1.0) if e_b > 0 else 0.0

    out.append(_ema_ratio(5, 30))
    out.append(_ema_ratio(15, 60))
    out.append(_ema_ratio(30, 180))
    out.append(_ema_ratio(60, 180))

    # accel at 5/15/60/180 (4): h-step return now minus the h-step return
    # that ended h ticks ago.
    for h in [5, 15, 60, 180]:
        if n > 2 * h:
            r_now = _safe_ret(close, h)
            r_prev = (
                float(close[-h - 1] / close[-2 * h - 1] - 1.0)
                if close[-2 * h - 1] > 0
                else 0.0
            )
            out.append(r_now - r_prev)
        else:
            out.append(0.0)

    # tick positions (2)
    out.append(float(at_tick))
    out.append(float(at_tick) / 900.0)

    # HLC patterns at 6 horizons (18)
    for h in _RETURN_HORIZONS:
        hi = _tail(high, h)
        lo = _tail(low, h)
        cl = _tail(close, h)
        c = cl[-1] if len(cl) else 1.0
        if c == 0 or not np.isfinite(c):
            c = 1.0
        hmax = float(np.nanmax(hi)) if len(hi) else c
        lmin = float(np.nanmin(lo)) if len(lo) else c
        out.append(hmax / c if c else 0.0)
        out.append(lmin / c if c else 0.0)
        out.append((hmax - lmin) / c if c else 0.0)

    # absret skew/kurt at 60/180 (4)
    for h in [60, 180]:
        w = _tail(np.abs(logrets), h)
        out.append(_safe_skew(w))
        out.append(_safe_kurt(w))

    # corr(vol, |ret|) at 60/180 (2)
    for h in [60, 180]:
        w1 = _tail(volume, h)
        w2 = _tail(np.abs(logrets), h)
        if len(w1) > 3 and len(w2) > 3 and w1.std() > 0 and w2.std() > 0:
            out.append(float(np.corrcoef(w1, w2)[0, 1]))
        else:
            out.append(0.0)

    # ticks since last up / dn (2)
    up_idx = np.where(logrets > 0)[0]
    dn_idx = np.where(logrets < 0)[0]
    out.append(float(n - 1 - up_idx[-1]) if len(up_idx) else float(n))
    out.append(float(n - 1 - dn_idx[-1]) if len(dn_idx) else float(n))

    # max drawdown / runup at 60/180 (4)
    for h in [60, 180]:
        w = _tail(close, h)
        if len(w) < 2:
            out.append(0.0)
            continue
        roll_max = np.maximum.accumulate(w)
        out.append(float(np.nanmin(w / roll_max - 1.0)))
    for h in [60, 180]:
        w = _tail(close, h)
        if len(w) < 2:
            out.append(0.0)
            continue
        roll_min = np.minimum.accumulate(w)
        out.append(float(np.nanmax(w / roll_min - 1.0)))

    # fraction above vwmp at 60/180 (2)
    for h in [60, 180]:
        w = _tail(close, h)
        out.append(float((w > vwmp).mean()) if len(w) else 0.0)

    # trend strength proxy at 60/180 (2)
    for h in [60, 180]:
        w = _tail(close, h)
        if len(w) < 3:
            out.append(0.0)
            continue
        x = np.arange(len(w), dtype=np.float64)
        # slope via np.polyfit deg=1, rescaled by mean price
        try:
            slope = float(np.polyfit(x, w, 1)[0])
        except Exception:
            # Deliberate best-effort: a failed fit degrades to "no trend".
            slope = 0.0
        m = _safe_mean(w)
        out.append(slope / m if m != 0 else 0.0)

    # cumulative logret mean/var/skew/kurt over window so far (4)
    out.append(_safe_mean(logrets))
    out.append(float(np.nanvar(logrets)))
    out.append(_safe_skew(logrets))
    out.append(_safe_kurt(logrets))

    # absret sharpe at 6 horizons (6)
    for h in _RETURN_HORIZONS:
        w = _tail(np.abs(logrets), h)
        m = _safe_mean(w)
        s = _safe_std(w)
        out.append(m / s if s > 0 else 0.0)

    # sortino proxy at 6 horizons (6) — mean / downside_std
    for h in _RETURN_HORIZONS:
        w = _tail(logrets, h)
        m = _safe_mean(w)
        dn = w[w < 0]
        s = _safe_std(dn)
        out.append(m / s if s > 0 else 0.0)

    # ret rank at 6 horizons (6)
    for h in _RETURN_HORIZONS:
        out.append(_rank_of_last(_tail(logrets, h)))

    # vol rank at 6 horizons (6)
    for h in _RETURN_HORIZONS:
        out.append(_rank_of_last(_tail(volume, h)))

    # px rank at 6 horizons (6)
    for h in _RETURN_HORIZONS:
        out.append(_rank_of_last(_tail(close, h)))

    # pos frac at 6 horizons (6)
    for h in _RETURN_HORIZONS:
        w = _tail(logrets, h)
        out.append(float((w > 0).mean()) if len(w) else 0.0)

    # final 6 summary ratios
    r60 = _safe_ret(close, 60)
    r180 = _safe_ret(close, 180)
    v60 = _safe_mean(_tail(volume, 60))
    v180 = _safe_mean(_tail(volume, 180))
    rng60 = (
        (float(np.nanmax(_tail(high, 60))) - float(np.nanmin(_tail(low, 60))))
        if n >= 60
        else 0.0
    )
    rng180 = (
        (float(np.nanmax(_tail(high, 180))) - float(np.nanmin(_tail(low, 180))))
        if n >= 180
        else 0.0
    )
    t60 = _safe_mean(_tail(trades, 60))
    t180 = _safe_mean(_tail(trades, 180))
    out.append(r60 / r180 if r180 != 0 else 0.0)
    out.append(v60 / v180 if v180 != 0 else 0.0)
    out.append(rng60 / rng180 if rng180 != 0 else 0.0)
    out.append(t60 / t180 if t180 != 0 else 0.0)
    out.append((vwmp / close[-1] - 1.0) if close[-1] > 0 else 0.0)
    out.append(float(np.sqrt(np.nansum(logrets ** 2))))

    arr = np.asarray(out, dtype=np.float64)
    assert arr.shape[0] == 329, f"produced {arr.shape[0]} features, expected 329"
    # finalize — replace non-finite with 0
    arr = np.where(np.isfinite(arr), arr, 0.0).astype(np.float32)
    return arr


__all__ = ["FEATURE_NAMES", "extract"]