# src/split_last24h.py
# Extracts arbitrary time windows from data/raw/swing_last.csv (see SRC_FILE).
#
# Usage:
#   python src/split_last24h.py          → last 0–24h (default)
#   python src/split_last24h.py 48 24    → last 48h–24h window
#   python src/split_last24h.py 72 48    → last 72h–48h window
#   python src/split_last24h.py 24 0     → last 24h–now (same as default)
#
# Both arguments are whole hours counted back from the newest `ts` in the
# source file (not from the wall clock).
import sys
from pathlib import Path

import pandas as pd

ROOT = Path(__file__).parent.parent
DATA_DIR = ROOT / "data" / "raw"
SRC_FILE = DATA_DIR / "swing_last.csv"

SECONDS_PER_HOUR = 3600
# Fraction of the highest market_seq used as the training cutoff in the
# sanity check below.
TRAIN_FRACTION = 0.80


def parse_window_args(argv):
    """Return ``(hours_from, hours_to)`` parsed from a ``sys.argv``-style list.

    ``hours_from`` is the older edge (default 24) and ``hours_to`` the newer
    edge (default 0), both in hours before the newest timestamp.

    Raises:
        ValueError: if an argument is not an integer, ``hours_to`` is
            negative, or ``hours_from`` is not strictly greater than
            ``hours_to`` (such a window could never contain data).
    """
    hours_from = int(argv[1]) if len(argv) > 1 else 24  # older edge
    hours_to = int(argv[2]) if len(argv) > 2 else 0  # newer edge
    if hours_to < 0:
        raise ValueError(f"hours_to must be >= 0, got {hours_to}")
    if hours_from <= hours_to:
        raise ValueError(
            f"hours_from ({hours_from}) must be greater than hours_to ({hours_to})"
        )
    return hours_from, hours_to


def window_label(hours_from, hours_to):
    """Return the file-name label for a window, e.g. ``last24h`` or ``48h_to_24h``."""
    if hours_to == 0:
        return f"last{hours_from}h"
    return f"{hours_from}h_to_{hours_to}h"


def select_window(df, hours_from, hours_to):
    """Return a copy of the rows of *df* whose ``ts`` falls in the window.

    ``ts`` is treated as seconds; the window is measured back from
    ``df["ts"].max()``. The older edge is always inclusive. The newer edge is
    exclusive so that adjacent windows (e.g. 48→24 and 24→0) never share a
    row, except when ``hours_to == 0``: there the newest edge is inclusive so
    the most recent rows are not silently dropped.
    """
    ts = df["ts"]
    ts_max = ts.max()
    ts_older = ts_max - hours_from * SECONDS_PER_HOUR
    ts_newer = ts_max - hours_to * SECONDS_PER_HOUR

    in_window = ts >= ts_older
    if hours_to == 0:
        in_window &= ts <= ts_newer
    else:
        in_window &= ts < ts_newer
    return df[in_window].copy()


def training_cutoff(max_seq):
    """Return the highest ``market_seq`` counted as training data."""
    return int(max_seq * TRAIN_FRACTION)


def split_by_cutoff(window_seqs, cutoff_seq):
    """Split *window_seqs* into ``(overlap, clean)`` sets around *cutoff_seq*.

    ``overlap`` holds sequences ``<= cutoff_seq`` (also in the training set);
    ``clean`` holds the rest.
    """
    overlap = {seq for seq in window_seqs if seq <= cutoff_seq}
    clean = set(window_seqs) - overlap
    return overlap, clean


def format_ts(ts):
    """Format an epoch-seconds timestamp as ``YYYY-MM-DD HH:MM UTC``."""
    return pd.to_datetime(ts, unit="s").strftime("%Y-%m-%d %H:%M UTC")


def main(argv=None):
    """Extract the requested window, print a summary, and write it to CSV.

    Returns the process exit code: 0 on success, 1 on bad arguments, a
    missing source file, or an empty window.
    """
    argv = sys.argv if argv is None else argv
    try:
        hours_from, hours_to = parse_window_args(argv)
    except ValueError as err:
        print(f"[error] {err}")
        return 1

    out_file = DATA_DIR / f"swing_{window_label(hours_from, hours_to)}.csv"

    if not SRC_FILE.exists():
        print(f"[error] source file not found: {SRC_FILE}")
        return 1

    print(f"reading {SRC_FILE} ...")
    df = pd.read_csv(SRC_FILE)
    df_window = select_window(df, hours_from, hours_to)

    # Bail out before computing any statistics: min()/max()/strftime on an
    # empty window would raise instead of reaching this message.
    if df_window.empty:
        print("\n[error] no data in this window — check your ts range")
        return 1

    n_markets = df_window["market_seq"].nunique()
    t_start = format_ts(df_window["ts"].min())
    t_end = format_ts(df_window["ts"].max())

    print(f"window : {t_start} → {t_end} ({hours_from}h ago → {hours_to}h ago)")
    print(f"markets : {n_markets}")
    print(f"rows : {len(df_window)}")
    print(f"swing rate: {df_window['swing_occurred'].mean():.1%}")

    # Sanity check vs training cutoff
    cutoff_seq = training_cutoff(df["market_seq"].max())
    win_seqs = set(df_window["market_seq"].unique())
    overlap, clean = split_by_cutoff(win_seqs, cutoff_seq)

    print(f"\ntraining cutoff : market_seq ≤ {cutoff_seq}")
    print(f"window seq range: {min(win_seqs)} – {max(win_seqs)}")
    if overlap:
        print(f"⚠ overlap with train set : {len(overlap)} markets (partially contaminated)")
        print(f"✓ clean (post-cutoff) : {len(clean)} markets")
    else:
        print("✓ fully clean — no overlap with training set")

    df_window.to_csv(out_file, index=False)
    print(f"\nsaved → {out_file}")
    print("\nnow run:")
    print(f" python src/predict.py data/raw/{out_file.name}")
    return 0


if __name__ == "__main__":
    sys.exit(main())