philippotiger
/

btc-5min-polymarket-predictor

Tabular Classification

prediction-markets

binary-classification

Model card Files Files and versions

btc-5min-polymarket-predictor / src /split_last24h.py

philippotiger's picture

Upload folder using huggingface_hub

1f59303 verified 25 days ago

history blame contribute delete

2.56 kB

	# src/split_last24h.py
	# Extracts arbitrary time windows from data/raw/swing_data.csv.
	#
	# Usage:
	# python src/split_last24h.py → last 0–24h (default)
	# python src/split_last24h.py 48 24 → last 48h–24h window
	# python src/split_last24h.py 72 48 → last 72h–48h window
	# python src/split_last24h.py 24 0 → last 24h–now (same as default)

	import sys
	from pathlib import Path
	import pandas as pd

	ROOT = Path(__file__).parent.parent
	DATA_DIR = ROOT / "data" / "raw"
	# NOTE(review): the header comment of this file names swing_data.csv, but the
	# path actually read here is swing_last.csv — confirm which one is intended.
	SRC_FILE = DATA_DIR / "swing_last.csv"

	# Fraction of the market_seq range treated as "training" by the sanity check.
	# NOTE(review): presumably mirrors the split used by the training script — confirm.
	TRAIN_FRACTION = 0.80

	SECONDS_PER_HOUR = 3600


	def window_label(hours_from: int, hours_to: int) -> str:
	    """Return the file-name label for a window.

	    A window that ends "now" (hours_to == 0) is labelled ``last{N}h``; any
	    other window is labelled ``{from}h_to_{to}h``.
	    """
	    if hours_to == 0:
	        return f"last{hours_from}h"
	    return f"{hours_from}h_to_{hours_to}h"


	def select_window(df: pd.DataFrame, hours_from: int, hours_to: int) -> pd.DataFrame:
	    """Return a copy of the rows of *df* that fall inside the requested window.

	    Both edges are measured backwards from the largest ``ts`` value in *df*
	    (``ts`` is treated as epoch seconds). Historical windows are half-open,
	    ``[older, newer)``, so adjacent windows such as 48-24 and 24-0 never share
	    a row. A window that ends "now" (hours_to == 0) also includes the newest
	    timestamp itself; with a strict upper bound the latest rows could never be
	    extracted by any window.

	    The result is empty when nothing matches (including when *df* is empty).
	    """
	    ts = df["ts"]
	    ts_max = ts.max()
	    ts_older = ts_max - hours_from * SECONDS_PER_HOUR
	    ts_newer = ts_max - hours_to * SECONDS_PER_HOUR

	    lower_ok = ts >= ts_older
	    upper_ok = (ts <= ts_newer) if hours_to == 0 else (ts < ts_newer)
	    return df[lower_ok & upper_ok].copy()


	def split_by_training_cutoff(df: pd.DataFrame, df_window: pd.DataFrame,
	                             train_fraction: float = TRAIN_FRACTION) -> tuple:
	    """Compare the window's markets against the training cutoff.

	    The cutoff is ``int(max(market_seq) * train_fraction)`` over the full
	    frame; every market_seq at or below it counts as a training market.

	    Returns ``(cutoff_seq, win_seqs, overlap, clean)`` where *win_seqs* is the
	    set of market_seq values in the window, *overlap* the ones that are also
	    training markets, and *clean* the ones that are not.
	    """
	    all_seqs = df["market_seq"].unique()
	    cutoff_seq = int(all_seqs.max() * train_fraction)
	    train_seqs = {seq for seq in all_seqs if seq <= cutoff_seq}
	    win_seqs = set(df_window["market_seq"].unique())
	    return cutoff_seq, win_seqs, train_seqs & win_seqs, win_seqs - train_seqs


	def main(argv=None) -> None:
	    """Extract one time window from SRC_FILE and save it next to the source.

	    *argv* follows ``sys.argv`` layout: ``argv[1]`` is the older edge and
	    ``argv[2]`` the newer edge, both in hours before the newest timestamp.
	    They default to 24 and 0. Exits with status 1 when the window is empty.
	    """
	    argv = sys.argv if argv is None else argv
	    hours_from = int(argv[1]) if len(argv) > 1 else 24  # older edge
	    hours_to = int(argv[2]) if len(argv) > 2 else 0  # newer edge

	    out_file = DATA_DIR / f"swing_{window_label(hours_from, hours_to)}.csv"

	    print(f"reading {SRC_FILE} ...")
	    df = pd.read_csv(SRC_FILE)

	    df_window = select_window(df, hours_from, hours_to)

	    # Bail out before any summary is computed: formatting the min/max
	    # timestamp of an empty frame (NaT) and min()/max() of an empty set both
	    # raise, which would hide this message behind a traceback.
	    if df_window.empty:
	        print("\n[error] no data in this window — check your ts range")
	        sys.exit(1)

	    n_markets = df_window["market_seq"].nunique()
	    stamp = "%Y-%m-%d %H:%M UTC"
	    t_start = pd.to_datetime(df_window["ts"].min(), unit="s").strftime(stamp)
	    t_end = pd.to_datetime(df_window["ts"].max(), unit="s").strftime(stamp)

	    print(f"window : {t_start} → {t_end} ({hours_from}h ago → {hours_to}h ago)")
	    print(f"markets : {n_markets}")
	    print(f"rows : {len(df_window)}")
	    print(f"swing rate: {df_window['swing_occurred'].mean():.1%}")

	    # Sanity check vs training cutoff
	    cutoff_seq, win_seqs, overlap, clean = split_by_training_cutoff(df, df_window)

	    print(f"\ntraining cutoff : market_seq ≤ {cutoff_seq}")
	    print(f"window seq range: {min(win_seqs)} – {max(win_seqs)}")
	    if overlap:
	        print(f"⚠ overlap with train set : {len(overlap)} markets (partially contaminated)")
	        print(f"✓ clean (post-cutoff) : {len(clean)} markets")
	    else:
	        print("✓ fully clean — no overlap with training set")

	    df_window.to_csv(out_file, index=False)
	    print(f"\nsaved → {out_file}")
	    print("\nnow run:")
	    print(f" python src/predict.py data/raw/{out_file.name}")


	if __name__ == "__main__":
	    main()