# src/features.py
# Shared feature engineering — imported by both train.py and predict.py.
# Any change to features must happen here and here only.

import pandas as pd
import numpy as np

# Columns that are identifiers, future leaks, or time proxies.
# train.py and predict.py both reference this list.
DROP_COLS = [
    # identifiers
    "ts", "hhmm", "market_id", "open_mid",
    # future leakage
    "swing_high", "max_mid_reached", "price_range", "dist_to_high",
    "is_last_tick", "outcome", "swing_low",
    # absolute price level — proxies time period, not signal
    "btc_price", "btc_open",
    "prev_open_mid",  # replaced by prev_open_mid_delta (see build_features)
    # calendar time — regime proxies that don't generalise
    "hour", "minute_of_hour",
    # raw OB levels — replaced by aggregates (see build_features)
    "bid_s1", "bid_s2", "bid_s3", "bid_s4", "bid_s5",
    "ask_s1", "ask_s2", "ask_s3", "ask_s4", "ask_s5",
    "bid_p1", "bid_p2", "bid_p3", "bid_p4", "bid_p5",
    "ask_p1", "ask_p2", "ask_p3", "ask_p4", "ask_p5",
]

# Target and group columns — never used as features
TARGET_COL = "swing_occurred"
GROUP_COL = "market_seq"

# Upper bound (inclusive) on `mid` for a tick to count as inside the entry zone.
ENTRY_MID_MAX = 0.35

# Raw order-book size columns that build_features() sums per side.
_BID_SIZE_COLS = ["bid_s1", "bid_s2", "bid_s3", "bid_s4", "bid_s5"]
_ASK_SIZE_COLS = ["ask_s1", "ask_s2", "ask_s3", "ask_s4", "ask_s5"]


def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of df with the engineered columns added.

    The input frame is never modified. Call this on raw CSV data before
    selecting feature columns.

    Columns added:
      total_bid_size      — sum of bid_s1..bid_s5
      total_ask_size      — sum of ask_s1..ask_s5
      size_imbalance      — (bid - ask) / (bid + ask) on the two totals
      bid_ask_slope       — ask-side price spread minus bid-side price spread
      prev_open_mid_delta — open_mid - prev_open_mid

    Raises KeyError (from pandas) if a required raw column is missing.
    """
    df = df.copy()

    # Aggregate order book sizes → cleaner signal than raw levels
    df["total_bid_size"] = df[_BID_SIZE_COLS].sum(axis=1)
    df["total_ask_size"] = df[_ASK_SIZE_COLS].sum(axis=1)

    # The 1e-9 term keeps the ratio finite when both sides of the book are empty.
    df["size_imbalance"] = (
        (df["total_bid_size"] - df["total_ask_size"])
        / (df["total_bid_size"] + df["total_ask_size"] + 1e-9)
    )

    # OB price slope: how steeply does the book widen on each side?
    df["bid_ask_slope"] = (
        (df["ask_p5"] - df["ask_p1"]) - (df["bid_p1"] - df["bid_p5"])
    )

    # Relative shift vs previous market open — avoids absolute price drift
    df["prev_open_mid_delta"] = df["open_mid"] - df["prev_open_mid"]

    return df


def get_entry_snapshot(df: pd.DataFrame) -> pd.DataFrame:
    """
    From a full tick-level DataFrame, return one row per market: the first
    tick where mid enters the entry zone (mid <= ENTRY_MID_MAX).

    This is the prediction point — what the model sees at trade time.

    The result has GROUP_COL as its first column, is ordered by GROUP_COL
    ascending, and carries a fresh 0..n-1 index. Markets that never enter
    the entry zone, and rows whose GROUP_COL is missing, are absent.
    """
    in_zone = df[df["mid"] <= ENTRY_MID_MAX]

    # Rows without a group key cannot be attributed to a market; the previous
    # groupby-based implementation dropped them as well.
    in_zone = in_zone.dropna(subset=[GROUP_COL])

    # NOTE(review): sorting secs_norm descending treats the largest value as
    # the earliest tick, i.e. it assumes secs_norm counts down within a
    # market — confirm against the code that produces the column.
    ordered = in_zone.sort_values(
        [GROUP_COL, "secs_norm"], ascending=[True, False]
    )

    # Keep the whole first row of each market. GroupBy.first() must not be
    # used here: it takes the first non-null value of every column
    # independently, so a NaN at the entry tick would be back-filled with a
    # value from a later tick (look-ahead leakage into the snapshot).
    snapshot = ordered.drop_duplicates(subset=GROUP_COL, keep="first")
    snapshot = snapshot.reset_index(drop=True)

    # Group column first, remaining columns in their original order.
    other_cols = [c for c in snapshot.columns if c != GROUP_COL]
    return snapshot[[GROUP_COL] + other_cols]


def get_feature_cols(df: pd.DataFrame) -> list:
    """
    Return the list of feature column names for a given DataFrame, after
    DROP_COLS and the target/group columns have been excluded.

    Column order follows df.columns.
    """
    exclude = {*DROP_COLS, TARGET_COL, GROUP_COL}
    return [c for c in df.columns if c not in exclude]