""" State / time / arb-awareness features for ARB-MAX. 15 features, fixed order. """ from __future__ import annotations import math from typing import List import numpy as np import pandas as pd FEATURE_NAMES: List[str] = [ "tick_norm", # at_tick / 900 "seconds_remaining", # 900 - at_tick "log_seconds_remaining", # log(900 - at_tick + 1) "min_combined_cost_so_far", # min over ticks 0..at_tick of up_ask1+dn_ask1 "current_best_arb_opportunity", # min_combined_cost - 1.0 "leg_side_stub", # 0 at feature-extraction time "leg_cost_stub", # 0 "leg_age_stub", # 0 "required_hedge_px_for_be_up", # 1.0 - up_ask_now - fee(up_ask_now) "required_hedge_px_for_be_dn", # 1.0 - dn_ask_now - fee(dn_ask_now) "time_since_best_combined_cost_seen", # ticks since min combined cost "has_potential_arb_been_available_yet", # 1 if any min_combined_so_far < 1 "current_combined_cost", # up_ask_now + dn_ask_now "combined_cost_minus_min", # current - min so far "arb_potential_rank_in_window", # rank of current combined cost among history ] assert len(FEATURE_NAMES) == 15 def _fee_shares(p: float, shares: int = 100) -> float: if not np.isfinite(p) or p <= 0 or p >= 1: return 0.0 return 0.072 * p * (1.0 - p) * shares def _col(df: pd.DataFrame, name: str) -> np.ndarray: if name in df.columns: a = df[name].to_numpy(dtype=np.float64) else: a = np.full(len(df), np.nan, dtype=np.float64) return a def _ff(a: np.ndarray) -> np.ndarray: out = a.copy() last = np.nan for i, v in enumerate(out): if np.isfinite(v): last = v else: out[i] = last if not np.isfinite(out[0]): first = np.nan for v in out: if np.isfinite(v): first = v break if np.isfinite(first): for i in range(len(out)): if np.isfinite(out[i]): break out[i] = first return np.nan_to_num(out, nan=0.0, posinf=0.0, neginf=0.0) def extract(window_frame: pd.DataFrame, at_tick: int = 120) -> np.ndarray: df = window_frame.iloc[: at_tick + 1] n = len(df) up_ask = _ff(_col(df, "pm_up_ask_px_1")) dn_ask = _ff(_col(df, "pm_dn_ask_px_1")) combined = up_ask + dn_ask tick_norm = float(at_tick) / 900.0 seconds_remaining = float(900 - at_tick) log_seconds_remaining = float(math.log(max(seconds_remaining + 1.0, 1.0))) if len(combined) > 0: min_combined = float(np.nanmin(combined)) argmin_idx = int(np.nanargmin(combined)) else: min_combined = 0.0 argmin_idx = 0 current_best_arb = min_combined - 1.0 # per spec, these are stubs since we don't carry an open leg during feature extract leg_side_stub = 0.0 leg_cost_stub = 0.0 leg_age_stub = 0.0 up_ask_now = float(up_ask[-1]) if len(up_ask) else 0.0 dn_ask_now = float(dn_ask[-1]) if len(dn_ask) else 0.0 # hedge-to-breakeven: buy UP at up_ask_now, need DN to cost <= 1 - up_ask_now - total_fee_share # total fee per share ~ fee(up) + fee(hedge) / shares. Approximate fee as entry-only for simplicity. fee_up = _fee_shares(up_ask_now) / 100.0 fee_dn = _fee_shares(dn_ask_now) / 100.0 req_hedge_up = 1.0 - up_ask_now - fee_up req_hedge_dn = 1.0 - dn_ask_now - fee_dn time_since_best = float(at_tick - argmin_idx) has_arb_yet = 1.0 if min_combined < 1.0 else 0.0 current_combined = float(combined[-1]) if len(combined) else 0.0 combined_minus_min = current_combined - min_combined if len(combined) > 1: rank = float((combined <= current_combined).mean()) else: rank = 0.5 out = np.array( [ tick_norm, seconds_remaining, log_seconds_remaining, min_combined, current_best_arb, leg_side_stub, leg_cost_stub, leg_age_stub, req_hedge_up, req_hedge_dn, time_since_best, has_arb_yet, current_combined, combined_minus_min, rank, ], dtype=np.float64, ) out = np.where(np.isfinite(out), out, 0.0).astype(np.float32) assert out.shape[0] == 15 return out __all__ = ["FEATURE_NAMES", "extract"]