# src/split_last24h.py
# Extracts a time window, measured back from the newest `ts` in data/raw/swing_last.csv.
#
# Usage:
#   python src/split_last24h.py                  → last 0–24h  (default)
#   python src/split_last24h.py 48 24            → last 48h–24h window
#   python src/split_last24h.py 72 48            → last 72h–48h window
#   python src/split_last24h.py 24 0             → last 24h–now (same as default)

import sys
from pathlib import Path
import pandas as pd

ROOT     = Path(__file__).parent.parent
DATA_DIR = ROOT / "data" / "raw"
SRC_FILE = DATA_DIR / "swing_last.csv"

# Fraction of the highest market_seq used as the training cutoff in the
# sanity check below.
# NOTE(review): presumably mirrors the split used when training — confirm.
TRAIN_FRACTION = 0.80

SECONDS_PER_HOUR = 3600


def parse_window(argv):
    """Return ``(hours_from, hours_to)`` parsed from a ``sys.argv``-style list.

    ``argv[1]`` is the older edge and ``argv[2]`` the newer edge, both in
    hours counted back from the newest ``ts`` in the source file. Missing
    values default to 24 and 0 respectively.

    Raises:
        ValueError: if a value is not an integer, or the window is inverted,
            empty, or extends into the future (``hours_to < 0``).
    """
    hours_from = int(argv[1]) if len(argv) > 1 else 24   # older edge
    hours_to   = int(argv[2]) if len(argv) > 2 else 0    # newer edge
    if hours_to < 0 or hours_from <= hours_to:
        raise ValueError(
            f"invalid window: need hours_from > hours_to >= 0, "
            f"got hours_from={hours_from}, hours_to={hours_to}"
        )
    return hours_from, hours_to


def window_label(hours_from, hours_to):
    """Return the label embedded in the output file name for this window."""
    if hours_to == 0:
        return f"last{hours_from}h"
    return f"{hours_from}h_to_{hours_to}h"


def select_window(df, hours_from, hours_to):
    """Return a copy of the rows of ``df`` whose ``ts`` falls in the window.

    Both edges are measured back from ``df["ts"].max()``; ``ts`` is treated
    as seconds. The older edge is inclusive. The newer edge is exclusive so
    that adjacent windows (e.g. 48-24 and 24-0) never share a row, except
    when ``hours_to == 0``: there it is inclusive, otherwise the newest rows
    in the file would belong to no window at all.
    """
    ts_max   = df["ts"].max()
    ts_older = ts_max - hours_from * SECONDS_PER_HOUR
    ts_newer = ts_max - hours_to * SECONDS_PER_HOUR

    after_older = df["ts"] >= ts_older
    if hours_to == 0:
        before_newer = df["ts"] <= ts_newer
    else:
        before_newer = df["ts"] < ts_newer
    return df[after_older & before_newer].copy()


def training_overlap(max_seq, win_seqs, train_fraction=TRAIN_FRACTION):
    """Split the window's market sequences around the training cutoff.

    The cutoff is ``int(max_seq * train_fraction)``; sequences at or below
    it are counted as overlapping the training set.

    Returns:
        ``(cutoff_seq, overlap, clean)`` where ``overlap`` and ``clean`` are
        disjoint sets that together make up ``win_seqs``.
    """
    cutoff_seq = int(max_seq * train_fraction)
    win_seqs = set(win_seqs)
    overlap = {seq for seq in win_seqs if seq <= cutoff_seq}
    return cutoff_seq, overlap, win_seqs - overlap


def _format_ts(ts):
    """Format an epoch-seconds timestamp as ``YYYY-MM-DD HH:MM UTC``."""
    return pd.to_datetime(ts, unit="s").strftime("%Y-%m-%d %H:%M UTC")


def main(argv=None):
    """Extract the requested window from SRC_FILE and write it next to it."""
    argv = sys.argv if argv is None else argv
    try:
        hours_from, hours_to = parse_window(argv)
    except ValueError as exc:
        print(f"[error] {exc}")
        sys.exit(1)

    out_file = DATA_DIR / f"swing_{window_label(hours_from, hours_to)}.csv"

    print(f"reading {SRC_FILE} ...")
    df = pd.read_csv(SRC_FILE)

    df_window = select_window(df, hours_from, hours_to)

    # Bail out before computing any statistics: formatting the NaT produced
    # by an empty window, or taking min()/max() of an empty set, would raise
    # and hide this message.
    if df_window.empty:
        print("\n[error] no data in this window — check your ts range")
        sys.exit(1)

    n_markets = df_window["market_seq"].nunique()
    t_start   = _format_ts(df_window["ts"].min())
    t_end     = _format_ts(df_window["ts"].max())

    print(f"window    : {t_start} → {t_end}  ({hours_from}h ago → {hours_to}h ago)")
    print(f"markets   : {n_markets}")
    print(f"rows      : {len(df_window)}")
    print(f"swing rate: {df_window['swing_occurred'].mean():.1%}")

    # Sanity check vs training cutoff
    win_seqs = set(df_window["market_seq"].unique())
    cutoff_seq, overlap, clean = training_overlap(df["market_seq"].max(), win_seqs)

    print(f"\ntraining cutoff : market_seq ≤ {cutoff_seq}")
    print(f"window seq range: {min(win_seqs)} – {max(win_seqs)}")
    if overlap:
        print(f"⚠ overlap with train set : {len(overlap)} markets  (partially contaminated)")
        print(f"✓ clean (post-cutoff)    : {len(clean)} markets")
    else:
        print("✓ fully clean — no overlap with training set")

    df_window.to_csv(out_file, index=False)
    print(f"\nsaved → {out_file}")
    print("\nnow run:")
    print(f"  python src/predict.py data/raw/{out_file.name}")


if __name__ == "__main__":
    main()