"""Slice a time window out of the raw swing dataset and save it as its own CSV.

Usage:
    python <this_script> [hours_from] [hours_to]

Both arguments are whole hours counted back from the newest ``ts`` value found
in ``data/raw/swing_last.csv`` (``ts`` is treated as epoch seconds):

* ``hours_from`` (default 24) -- older edge of the window.
* ``hours_to``   (default 0)  -- newer edge of the window.

The slice is written next to the source file as ``swing_last<N>h.csv`` or
``swing_<from>h_to_<to>h.csv``.  Before saving, the script reports how many of
the selected markets fall at or below the training cutoff (80% of the highest
``market_seq``) so contaminated evaluation windows are easy to spot.
"""

import sys
from pathlib import Path

import pandas as pd

ROOT = Path(__file__).parent.parent
DATA_DIR = ROOT / "data" / "raw"
SRC_FILE = DATA_DIR / "swing_last.csv"

SECONDS_PER_HOUR = 3600
# Markets with market_seq <= TRAIN_FRACTION * max(market_seq) are reported as
# belonging to the training set.
# NOTE(review): presumably mirrors the split used by the training code -- confirm.
TRAIN_FRACTION = 0.80


def window_label(hours_from: int, hours_to: int) -> str:
    """Return the file-name label describing the requested window."""
    if hours_to == 0:
        return f"last{hours_from}h"
    return f"{hours_from}h_to_{hours_to}h"


def select_window(df: pd.DataFrame, hours_from: int, hours_to: int) -> pd.DataFrame:
    """Return a copy of the rows whose ``ts`` lies inside the requested window.

    The window is anchored at the newest ``ts`` in *df*.  It is half-open,
    ``[ts_max - hours_from h, ts_max - hours_to h)``, so that adjacent windows
    never share rows -- except when ``hours_to == 0``, where the upper bound is
    inclusive; otherwise a "last N hours" slice would drop the newest rows.
    """
    ts_max = df["ts"].max()
    ts_older = ts_max - hours_from * SECONDS_PER_HOUR
    ts_newer = ts_max - hours_to * SECONDS_PER_HOUR

    after_start = df["ts"] >= ts_older
    if hours_to == 0:
        before_end = df["ts"] <= ts_newer
    else:
        before_end = df["ts"] < ts_newer
    return df[after_start & before_end].copy()


def format_ts(ts) -> str:
    """Format an epoch-seconds timestamp as ``YYYY-MM-DD HH:MM UTC``."""
    return pd.to_datetime(ts, unit="s").strftime("%Y-%m-%d %H:%M UTC")


def split_by_cutoff(all_seqs, window_seqs):
    """Partition the window's market sequences around the training cutoff.

    Args:
        all_seqs: every ``market_seq`` present in the source data (non-empty).
        window_seqs: set of ``market_seq`` values present in the window.

    Returns:
        ``(cutoff_seq, overlap, clean)`` where ``overlap`` holds the window
        sequences at or below the cutoff and ``clean`` holds the rest.
    """
    cutoff_seq = int(max(all_seqs) * TRAIN_FRACTION)
    overlap = {seq for seq in window_seqs if seq <= cutoff_seq}
    clean = set(window_seqs) - overlap
    return cutoff_seq, overlap, clean


def main(argv=None) -> int:
    """Run the slicer.

    Args:
        argv: command-line arguments without the program name; defaults to
            ``sys.argv[1:]``.

    Returns:
        Process exit code: 0 on success, 1 when there is nothing to save.
    """
    args = sys.argv[1:] if argv is None else list(argv)
    hours_from = int(args[0]) if len(args) > 0 else 24
    hours_to = int(args[1]) if len(args) > 1 else 0

    out_file = DATA_DIR / f"swing_{window_label(hours_from, hours_to)}.csv"

    print(f"reading {SRC_FILE} ...")
    df = pd.read_csv(SRC_FILE)
    if df.empty:
        print(f"\n[error] {SRC_FILE} contains no rows")
        return 1

    df_window = select_window(df, hours_from, hours_to)

    # Bail out before computing any statistics: min()/max() and strftime on an
    # empty selection raise instead of producing a readable message.
    if df_window.empty:
        print("\n[error] no data in this window - check your ts range")
        return 1

    n_markets = df_window["market_seq"].nunique()
    t_start = format_ts(df_window["ts"].min())
    t_end = format_ts(df_window["ts"].max())

    print(f"window : {t_start} -> {t_end} ({hours_from}h ago -> {hours_to}h ago)")
    print(f"markets : {n_markets}")
    print(f"rows : {len(df_window)}")
    print(f"swing rate: {df_window['swing_occurred'].mean():.1%}")

    # Contamination check: which of the selected markets sit at or below the
    # training cutoff?
    all_seqs = set(df["market_seq"].unique())
    win_seqs = set(df_window["market_seq"].unique())
    cutoff_seq, overlap, clean = split_by_cutoff(all_seqs, win_seqs)

    print(f"\ntraining cutoff : market_seq <= {cutoff_seq}")
    print(f"window seq range: {min(win_seqs)} -> {max(win_seqs)}")
    if overlap:
        print(f"[warn] overlap with train set : {len(overlap)} markets (partially contaminated)")
        print(f"[ok] clean (post-cutoff) : {len(clean)} markets")
    else:
        print("[ok] fully clean - no overlap with training set")

    df_window.to_csv(out_file, index=False)
    print(f"\nsaved -> {out_file}")
    print("\nnow run:")
    print(f" python src/predict.py data/raw/{out_file.name}")
    return 0


if __name__ == "__main__":
    sys.exit(main())