| |
| |
| |
|
|
| import pandas as pd |
| import numpy as np |
|
|
| |
| |
# Columns that get_feature_cols() removes from the feature set.
# The grouping comments below are a reviewer's reading of the column names;
# confirm against the dataset schema.
DROP_COLS = (
    # Timestamps / identifiers, plus the raw opening mid.
    ["ts", "hhmm", "market_id", "open_mid"]
    # NOTE(review): these look like outcome-derived (post-hoc) columns that
    # would leak the label -- confirm.
    + ["swing_high", "max_mid_reached", "price_range"]
    + ["dist_to_high", "is_last_tick", "outcome", "swing_low"]
    # Raw reference levels; build_features() derives prev_open_mid_delta
    # from open_mid and prev_open_mid.
    + ["btc_price", "btc_open"]
    + ["prev_open_mid"]
    # Time-of-day fields.
    + ["hour", "minute_of_hour"]
    # Raw per-level order-book columns; build_features() aggregates these
    # into total_bid_size, total_ask_size, size_imbalance and bid_ask_slope.
    + ["bid_s1", "bid_s2", "bid_s3", "bid_s4", "bid_s5"]
    + ["ask_s1", "ask_s2", "ask_s3", "ask_s4", "ask_s5"]
    + ["bid_p1", "bid_p2", "bid_p3", "bid_p4", "bid_p5"]
    + ["ask_p1", "ask_p2", "ask_p3", "ask_p4", "ask_p5"]
)


# Also excluded from the features by get_feature_cols(); presumably the
# label column -- confirm.
TARGET_COL = "swing_occurred"
# Key that get_entry_snapshot() groups ticks by (one output row per value).
GROUP_COL = "market_seq"
|
|
|
|
def build_features(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a copy of df with engineered columns appended.

    The input frame is left untouched. Call this on the raw data before
    selecting feature columns, because it reads raw columns that are listed
    in DROP_COLS.

    Columns added (in this order):
      total_bid_size      -- row-wise sum of bid_s1..bid_s5
      total_ask_size      -- row-wise sum of ask_s1..ask_s5
      size_imbalance      -- (bid - ask) / (bid + ask + 1e-9)
      bid_ask_slope       -- (ask_p5 - ask_p1) - (bid_p1 - bid_p5)
      prev_open_mid_delta -- open_mid - prev_open_mid
    """
    out = df.copy()

    # Aggregate size over the five bid levels and the five ask levels.
    bid_total = out[["bid_s1", "bid_s2", "bid_s3", "bid_s4", "bid_s5"]].sum(axis=1)
    ask_total = out[["ask_s1", "ask_s2", "ask_s3", "ask_s4", "ask_s5"]].sum(axis=1)
    out["total_bid_size"] = bid_total
    out["total_ask_size"] = ask_total

    # The 1e-9 term keeps the ratio finite when both sides sum to zero.
    size_gap = bid_total - ask_total
    size_sum = bid_total + ask_total + 1e-9
    out["size_imbalance"] = size_gap / size_sum

    # Difference between the ask-side span (level 1 -> 5) and the bid-side
    # span (level 5 -> 1).
    ask_span = out["ask_p5"] - out["ask_p1"]
    bid_span = out["bid_p1"] - out["bid_p5"]
    out["bid_ask_slope"] = ask_span - bid_span

    # Change of the opening mid relative to prev_open_mid.
    out["prev_open_mid_delta"] = out["open_mid"] - out["prev_open_mid"]

    return out
|
|
|
|
def get_entry_snapshot(
    df: pd.DataFrame, *, entry_threshold: float = 0.35
) -> pd.DataFrame:
    """
    From a full tick-level DataFrame, return one row per market: the
    entry-zone tick (mid <= entry_threshold) with the largest secs_norm.
    This is the prediction point -- what the model sees at trade time.

    Parameters
    ----------
    df : tick-level data containing "mid", "secs_norm" and the GROUP_COL
        column.
    entry_threshold : upper bound (inclusive) on "mid" for a tick to count
        as inside the entry zone. Defaults to 0.35.

    Returns
    -------
    A new DataFrame with GROUP_COL as the first column, the remaining
    columns in their original order, a fresh 0..n-1 index, and rows ordered
    by GROUP_COL ascending. Markets with no tick in the entry zone are
    absent; rows whose GROUP_COL is missing are dropped.

    NOTE(review): picking the largest secs_norm yields the *first* tick only
    if secs_norm counts down over the life of a market -- confirm.
    """
    in_zone = df[df["mid"] <= entry_threshold]
    # Rows without a group key cannot be attributed to a market.
    in_zone = in_zone.dropna(subset=[GROUP_COL])

    ordered = in_zone.sort_values(
        [GROUP_COL, "secs_norm"], ascending=[True, False]
    )

    # Keep the whole leading row of each market. GroupBy.first() is not
    # equivalent: it takes the first non-null value *per column*, so a NaN
    # in the entry tick would be silently filled from a different tick.
    snapshot = ordered.drop_duplicates(subset=GROUP_COL, keep="first")

    # Same layout callers have always received: group key first, new index.
    column_order = [GROUP_COL] + [c for c in snapshot.columns if c != GROUP_COL]
    return snapshot[column_order].reset_index(drop=True)
|
|
|
|
def get_feature_cols(df: pd.DataFrame) -> list:
    """
    List the columns of df that are usable as model features.

    A column qualifies when it is neither in DROP_COLS nor equal to
    TARGET_COL or GROUP_COL. The order of df.columns is preserved.
    """
    blocked = frozenset(DROP_COLS) | {TARGET_COL, GROUP_COL}
    return [name for name in df.columns if name not in blocked]
|
|