# src/features.py
# Shared feature engineering — imported by both train.py and predict.py.
# Any change to features must happen here and here only.
import pandas as pd
import numpy as np
# Columns excluded from the feature set: identifiers, future leaks,
# and time proxies. train.py and predict.py both reference this list.
DROP_COLS = [
    # identifiers
    "ts",
    "hhmm",
    "market_id",
    "open_mid",
    # future leakage
    "swing_high",
    "max_mid_reached",
    "price_range",
    "dist_to_high",
    "is_last_tick",
    "outcome",
    "swing_low",
    # absolute price level — proxies time period, not signal
    "btc_price",
    "btc_open",
    # replaced by prev_open_mid_delta below
    "prev_open_mid",
    # calendar time — regime proxies that don't generalise
    "hour",
    "minute_of_hour",
    # raw OB levels — replaced by aggregates below.
    # Generated in the order: bid sizes, ask sizes, bid prices, ask prices
    # (levels 1-5 within each group).
    *(
        f"{side}_{kind}{level}"
        for kind in ("s", "p")
        for side in ("bid", "ask")
        for level in range(1, 6)
    ),
]

# Target column — never used as a feature.
TARGET_COL = "swing_occurred"
# Group column (one value per market) — never used as a feature.
GROUP_COL = "market_seq"
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of ``df`` with the engineered feature columns appended.

    The input frame is never modified. Call this on raw CSV data before
    selecting the feature columns.

    Columns added, in this order:
        total_bid_size      sum of bid_s1..bid_s5
        total_ask_size      sum of ask_s1..ask_s5
        size_imbalance      (bid - ask) / (bid + ask) on the two totals
        bid_ask_slope       (ask_p5 - ask_p1) - (bid_p1 - bid_p5)
        prev_open_mid_delta open_mid - prev_open_mid

    Raises:
        KeyError: if any raw column needed above is absent; the message
            lists every missing column.
    """
    bid_size_cols = [f"bid_s{level}" for level in range(1, 6)]
    ask_size_cols = [f"ask_s{level}" for level in range(1, 6)]
    required = bid_size_cols + ask_size_cols + [
        "ask_p1", "ask_p5", "bid_p1", "bid_p5",
        "open_mid", "prev_open_mid",
    ]

    # Fail up front with the full list of missing inputs instead of an
    # opaque pandas KeyError from whichever lookup happens to run first.
    missing = [col for col in required if col not in df.columns]
    if missing:
        raise KeyError(f"build_features: missing required columns: {missing}")

    out = df.copy()

    # Aggregate order book sizes → cleaner signal than raw levels.
    # DataFrame.sum skips NaN by default, so a missing level counts as 0.
    out["total_bid_size"] = out[bid_size_cols].sum(axis=1)
    out["total_ask_size"] = out[ask_size_cols].sum(axis=1)

    # The 1e-9 term keeps the ratio finite when both sides are empty.
    out["size_imbalance"] = (
        (out["total_bid_size"] - out["total_ask_size"])
        / (out["total_bid_size"] + out["total_ask_size"] + 1e-9)
    )

    # OB price slope: how steeply does the book widen on each side?
    out["bid_ask_slope"] = (
        (out["ask_p5"] - out["ask_p1"])
        - (out["bid_p1"] - out["bid_p5"])
    )

    # Relative shift vs previous market open — avoids absolute price drift.
    out["prev_open_mid_delta"] = out["open_mid"] - out["prev_open_mid"]

    return out
def get_entry_snapshot(df: pd.DataFrame) -> pd.DataFrame:
    """
    From a full tick-level DataFrame, return one row per market: the
    entry-zone tick (mid <= 0.35) with the largest ``secs_norm``.

    This is the prediction point — what the model sees at trade time.
    NOTE(review): picking the largest ``secs_norm`` as the *first* tick
    presumes ``secs_norm`` counts down within a market — confirm.

    Each returned row is one actual tick, taken whole. The previous
    ``groupby(...).first()`` took the first non-null value of every column
    independently, so a NaN in the entry tick was silently filled from a
    later tick of the same market.

    Returns:
        A new DataFrame with a fresh RangeIndex, sorted by the group
        column, with the group column placed first. Rows whose group key
        is NaN are excluded (as groupby did).
    """
    in_zone = df[df["mid"] <= 0.35]

    # groupby() dropped rows with a NaN group key; keep that behaviour.
    in_zone = in_zone[in_zone[GROUP_COL].notna()]

    ordered = in_zone.sort_values(
        [GROUP_COL, "secs_norm"], ascending=[True, False]
    )

    # Keep the complete first row of each market rather than a per-column
    # "first non-null" composite.
    snapshot = ordered.drop_duplicates(subset=GROUP_COL, keep="first")

    # Preserve the historical column layout: group key first, then the
    # remaining columns in their original order.
    column_order = [GROUP_COL] + [c for c in snapshot.columns if c != GROUP_COL]
    return snapshot[column_order].reset_index(drop=True)
def get_feature_cols(df: pd.DataFrame) -> list:
    """
    List the columns of ``df`` that are usable as model features.

    A column qualifies when it is neither in DROP_COLS nor the target
    column nor the group column. The order of ``df.columns`` is kept.
    """
    non_features = {*DROP_COLS, TARGET_COL, GROUP_COL}
    return [name for name in df.columns if name not in non_features]
|