# btc-5min-polymarket-predictor / src /split_last24h.py
# philippotiger's picture
# Upload folder using huggingface_hub
# 1f59303 verified
# src/split_last24h.py
# Extracts arbitrary time windows from data/raw/swing_last.csv (the SRC_FILE path below).
#
# Usage:
#   python src/split_last24h.py        → last 0–24h (default)
#   python src/split_last24h.py 48 24  → last 48h–24h window
#   python src/split_last24h.py 72 48  → last 72h–48h window
#   python src/split_last24h.py 24 0   → last 24h–now (same as default)
import sys
from pathlib import Path
import pandas as pd
ROOT = Path(__file__).parent.parent
DATA_DIR = ROOT / "data" / "raw"
# Input CSV; the script reads its "ts", "market_seq" and "swing_occurred" columns.
SRC_FILE = DATA_DIR / "swing_last.csv"

SECONDS_PER_HOUR = 3600
# Fraction of the highest market_seq used as the train/test cutoff.
# NOTE(review): presumably mirrors the split used by the training code — confirm.
TRAIN_FRACTION = 0.80


def parse_window_args(argv) -> tuple:
    """Return ``(hours_from, hours_to)`` parsed from a ``sys.argv``-style list.

    Both values are hours counted back from the newest timestamp in the data:
    ``hours_from`` is the older edge (default 24) and ``hours_to`` the newer
    edge (default 0).

    Raises:
        ValueError: if an argument is not an integer, or if the older edge is
            not strictly further back than the newer edge (such a window can
            never contain data).
    """
    try:
        hours_from = int(argv[1]) if len(argv) > 1 else 24  # older edge
        hours_to = int(argv[2]) if len(argv) > 2 else 0  # newer edge
    except ValueError:
        raise ValueError(
            f"window arguments must be whole hours, got {argv[1:3]}"
        ) from None
    if hours_from <= hours_to:
        raise ValueError(
            f"hours_from ({hours_from}) must be greater than hours_to ({hours_to})"
        )
    return hours_from, hours_to


def window_label(hours_from: int, hours_to: int) -> str:
    """Return the file-name label for a window, e.g. ``last24h`` or ``48h_to_24h``."""
    if hours_to == 0:
        return f"last{hours_from}h"
    return f"{hours_from}h_to_{hours_to}h"


def extract_window(df: pd.DataFrame, hours_from: int, hours_to: int) -> pd.DataFrame:
    """Return a copy of the rows whose ``ts`` lies in the requested window.

    The window is measured back from ``df["ts"].max()`` with ``ts`` treated as
    seconds. It is half-open, ``[ts_max - hours_from h, ts_max - hours_to h)``,
    so adjacent windows (e.g. 48-24 and 24-0) never share a row; as a
    consequence, rows stamped exactly ``ts_max`` are excluded when
    ``hours_to`` is 0.
    """
    ts_max = df["ts"].max()
    ts_older = ts_max - hours_from * SECONDS_PER_HOUR
    ts_newer = ts_max - hours_to * SECONDS_PER_HOUR
    in_window = (df["ts"] >= ts_older) & (df["ts"] < ts_newer)
    return df[in_window].copy()


def split_by_training_cutoff(df: pd.DataFrame, df_window: pd.DataFrame) -> tuple:
    """Classify the window's markets against the training cutoff.

    The cutoff is ``int(max(market_seq) * TRAIN_FRACTION)`` over the full
    data set. Returns ``(cutoff_seq, overlap, clean)`` where ``overlap`` is the
    set of window market_seq values at or below the cutoff and ``clean`` is the
    set above it.
    """
    cutoff_seq = int(df["market_seq"].max() * TRAIN_FRACTION)
    win_seqs = set(df_window["market_seq"].unique())
    overlap = {seq for seq in win_seqs if seq <= cutoff_seq}
    clean = win_seqs - overlap
    return cutoff_seq, overlap, clean


def main(argv=None, src_file=SRC_FILE, out_dir=DATA_DIR) -> int:
    """Extract a time window from ``src_file`` and save it as a CSV in ``out_dir``.

    Args:
        argv: ``sys.argv``-style argument list; defaults to ``sys.argv``.
        src_file: CSV to read.
        out_dir: directory that receives ``swing_<label>.csv``.

    Returns:
        Process exit code: 0 on success, 1 if the arguments are invalid or the
        window contains no rows (nothing is written in that case).
    """
    argv = sys.argv if argv is None else argv
    try:
        hours_from, hours_to = parse_window_args(argv)
    except ValueError as exc:
        print(f"[error] {exc}")
        return 1

    out_file = Path(out_dir) / f"swing_{window_label(hours_from, hours_to)}.csv"

    print(f"reading {src_file} ...")
    df = pd.read_csv(src_file)
    df_window = extract_window(df, hours_from, hours_to)

    # Bail out before any summary statistics: min/max/strftime on an empty
    # window raise, which would hide this message behind a traceback.
    if df_window.empty:
        print("\n[error] no data in this window — check your ts range")
        return 1

    time_format = "%Y-%m-%d %H:%M UTC"
    t_start = pd.to_datetime(df_window["ts"].min(), unit="s").strftime(time_format)
    t_end = pd.to_datetime(df_window["ts"].max(), unit="s").strftime(time_format)
    print(f"window : {t_start} → {t_end} ({hours_from}h ago → {hours_to}h ago)")
    print(f"markets : {df_window['market_seq'].nunique()}")
    print(f"rows : {len(df_window)}")
    print(f"swing rate: {df_window['swing_occurred'].mean():.1%}")

    # Sanity check vs training cutoff
    cutoff_seq, overlap, clean = split_by_training_cutoff(df, df_window)
    seq_lo = df_window["market_seq"].min()
    seq_hi = df_window["market_seq"].max()
    print(f"\ntraining cutoff : market_seq ≤ {cutoff_seq}")
    print(f"window seq range: {seq_lo} – {seq_hi}")
    if overlap:
        print(f"⚠ overlap with train set : {len(overlap)} markets (partially contaminated)")
        print(f"✓ clean (post-cutoff) : {len(clean)} markets")
    else:
        print("✓ fully clean — no overlap with training set")

    df_window.to_csv(out_file, index=False)
    print(f"\nsaved → {out_file}")
    print("\nnow run:")
    print(f" python src/predict.py data/raw/{out_file.name}")
    return 0


if __name__ == "__main__":
    sys.exit(main())