# src/train.py
#
# Usage:
#   python src/train.py                               → all CSVs in data/raw/, auto version
#   python src/train.py data/raw/file.csv             → one file, auto version
#   python src/train.py data/raw/file.csv --model v5  → one file, explicit version
#   python src/train.py --model v5                    → all CSVs, explicit version (overwrites)
#
# Pipeline: load CSVs → build features / entry snapshot → split by GROUP_COL →
# logistic-regression baseline → LightGBM with a live dashboard → grouped
# cross-validation → final report → pickle the model and write result CSVs.

import pickle
import sys
import time
import warnings
from pathlib import Path

import lightgbm as lgb
import numpy as np
import pandas as pd
from rich import box
from rich.console import Console
from rich.layout import Layout
from rich.live import Live
from rich.panel import Panel
from rich.progress import Progress, BarColumn, TimeElapsedColumn, TimeRemainingColumn
from rich.table import Table
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    average_precision_score,
    log_loss,
    precision_recall_curve,
    roc_auc_score,
)
from sklearn.model_selection import GroupKFold, cross_val_score, cross_validate
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# ── Paths ──────────────────────────────────────────────────────────────────
ROOT = Path(__file__).parent.parent
DATA_DIR = ROOT / "data" / "raw"
MODEL_DIR = ROOT / "models"
RESULT_DIR = ROOT / "results"
for d in [MODEL_DIR, RESULT_DIR]:
    d.mkdir(parents=True, exist_ok=True)

# Make the sibling `features` module importable when run as a script.
sys.path.insert(0, str(ROOT / "src"))
from features import build_features, get_entry_snapshot, get_feature_cols, TARGET_COL, GROUP_COL

warnings.filterwarnings("ignore")
console = Console()

# ── Argument parsing ───────────────────────────────────────────────────────
args = sys.argv[1:]

# --model flag: explicit version name
if "--model" in args:
    idx = args.index("--model")
    if idx + 1 >= len(args):
        # Previously this raised a bare IndexError on args[idx + 1].
        console.print("[red]--model requires a version name (e.g. --model v5)[/]")
        sys.exit(2)
    MODEL_VERSION = args[idx + 1]
    args = args[:idx] + args[idx + 2:]
else:
    # auto-increment: scan models/ for highest vN, add 1
    nums = []
    for p in MODEL_DIR.glob("lgbm_swing_v*.pkl"):
        try:
            nums.append(int(p.stem.split("_v")[-1]))
        except ValueError:
            # Deliberately ignore files whose suffix is not an integer
            # (e.g. a hand-named "lgbm_swing_vtest.pkl").
            pass
    MODEL_VERSION = f"v{max(nums) + 1}" if nums else "v1"

# remaining args are CSV file paths
if args:
    csv_files = [Path(a) for a in args]
else:
    csv_files = sorted(DATA_DIR.glob("*.csv"))
if not csv_files:
    console.print(f"[red]No CSV files found in {DATA_DIR}[/]")
    sys.exit(1)

# Fail early with a readable message instead of a pandas traceback.
missing = [f for f in csv_files if not f.is_file()]
if missing:
    console.print(f"[red]CSV file(s) not found: {', '.join(str(m) for m in missing)}[/]")
    sys.exit(1)

MODEL_OUT = MODEL_DIR / f"lgbm_swing_{MODEL_VERSION}.pkl"
console.print(f"\n model version : [bold]{MODEL_VERSION}[/] → {MODEL_OUT.name}")
if MODEL_OUT.exists():
    console.print(f" [yellow]warning: {MODEL_OUT.name} already exists — will overwrite[/]")

# ── Model params ───────────────────────────────────────────────────────────
LGBM_PARAMS = dict(
    n_estimators=500,
    learning_rate=0.02,
    max_depth=3,
    num_leaves=6,
    min_child_samples=25,
    subsample=0.65,
    colsample_bytree=0.55,
    reg_lambda=2.0,
    reg_alpha=0.1,
    random_state=42,
    verbose=-1,
)

# ── 1. Load ────────────────────────────────────────────────────────────────
console.rule("[bold]Step 1 — Load data")
frames = []
for f in csv_files:
    console.print(f" reading {f.name} …")
    frames.append(pd.read_csv(f))
df = pd.concat(frames, ignore_index=True)
console.print(f" shape : {df.shape}")
console.print(f" markets : {df[GROUP_COL].nunique()}")
console.print(f" swing rate : {df[TARGET_COL].mean():.1%}")

# ── 2. Feature engineering + entry snapshot ───────────────────────────────
console.rule("[bold]Step 2 — Features & entry snapshot")
df = build_features(df)
df_model = get_entry_snapshot(df)
feature_cols = get_feature_cols(df_model)
console.print(f" markets with entry tick : {len(df_model)}")
console.print(f" swing rate at entry : {df_model[TARGET_COL].mean():.1%}")
console.print(f" features kept : {len(feature_cols)}")

X = df_model[feature_cols].fillna(0)
y = df_model[TARGET_COL]
groups = df_model[GROUP_COL]

# ── 3. Train / test split (time-based) ────────────────────────────────────
# NOTE(review): this assumes GROUP_COL is numeric and increases with time, so
# the top 20% of its value range is the most recent data — confirm in features.py.
cutoff = int(groups.max() * 0.80)
train_mask = groups <= cutoff
test_mask = groups > cutoff
X_train, X_test = X[train_mask], X[test_mask]
y_train, y_test = y[train_mask], y[test_mask]
console.print(f"\n train : {train_mask.sum()} markets ({y_train.sum()} pos / {(y_train==0).sum()} neg)")
console.print(f" test : {test_mask.sum()} markets ({y_test.sum()} pos / {(y_test==0).sum()} neg)")

# Every later step (baseline fit, ROC-AUC, scale_pos_weight = neg / pos) needs
# both classes on both sides of the split; stop here with a clear message.
if y_train.nunique() < 2 or y_test.nunique() < 2:
    console.print("[red]Train and test splits must each contain both classes — cannot train.[/]")
    sys.exit(1)

# ── 4. Baseline ────────────────────────────────────────────────────────────
console.rule("[bold]Step 3 — Baseline (logistic regression)")
baseline = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(class_weight="balanced", max_iter=1000)),
])
baseline.fit(X_train, y_train)
b_proba = baseline.predict_proba(X_test)[:, 1]
b_auc = roc_auc_score(y_test, b_proba)
b_prauc = average_precision_score(y_test, b_proba)
console.print(f" baseline ROC-AUC : {b_auc:.3f}")
console.print(f" baseline PR-AUC : {b_prauc:.3f}")

# ── 5. LightGBM ────────────────────────────────────────────────────────────
console.rule("[bold]Step 4 — LightGBM training")
neg, pos = (y_train == 0).sum(), (y_train == 1).sum()
params = {**LGBM_PARAMS, "scale_pos_weight": neg / pos}

metrics_history = []
start_time = time.time()

# NOTE: this Progress is updated by the callback and stopped after training,
# but it is never started or placed in the Live layout, so it is not drawn.
progress = Progress(
    "[progress.description]{task.description}",
    BarColumn(),
    "[progress.percentage]{task.percentage:>5.1f}%",
    TimeElapsedColumn(),
    TimeRemainingColumn(),
)
task = progress.add_task("training", total=params["n_estimators"])


def make_table(h):
    """Render the last eight entries of the metrics history as a rich Table.

    Args:
        h: list of dicts with keys "iter", "train_loss", "val_loss", "auc"
            and "prauc", in the order they were logged.

    Returns:
        A rich ``Table`` with one row per shown entry. The "Δ loss" column is
        the change in val loss versus the previous logged entry, and any AUC
        equal to the best AUC in ``h`` is highlighted.
    """
    t = Table(box=box.SIMPLE, show_header=True, header_style="bold")
    t.add_column("iter", style="dim", width=6)
    t.add_column("train loss", width=12)
    t.add_column("val loss", width=12)
    t.add_column("val AUC", width=10)
    t.add_column("val PR-AUC", width=10)
    t.add_column("Δ loss", width=10)
    best_auc = max(m["auc"] for m in h) if h else 0
    tail = h[-8:]
    offset = len(h) - len(tail)  # index in h of the first row shown
    for i, r in enumerate(tail):
        abs_i = offset + i
        delta = ""
        if abs_i > 0:
            d = r["val_loss"] - h[abs_i - 1]["val_loss"]
            delta = f"[green]↓{abs(d):.4f}[/]" if d < 0 else f"[yellow]↑{abs(d):.4f}[/]"
        auc_str = f"[bold green]{r['auc']:.3f}[/]" if r["auc"] == best_auc else f"{r['auc']:.3f}"
        t.add_row(str(r["iter"]), f"{r['train_loss']:.4f}", f"{r['val_loss']:.4f}",
                  auc_str, f"{r['prauc']:.3f}", delta)
    return t


class DashCallback:
    """LightGBM training callback that refreshes the live dashboard.

    On iteration 1 and every 10th iteration it reads the train and validation
    log-loss from the callback environment, scores the current model on the
    test set, appends a row to ``metrics_history`` and redraws the layout.
    """

    # Callback ordering attribute read by LightGBM when it sorts callbacks.
    order = 1

    def __init__(self, live):
        self.live = live

    def __call__(self, env):
        it = env.iteration + 1
        if it % 10 != 0 and it != 1:
            return
        progress.update(task, completed=it)

        ed = {f"{r[0]} {r[1]}": r[2] for r in env.evaluation_result_list}
        tl = ed.get("training binary_logloss")
        vl = ed.get("valid_1 binary_logloss")
        # Compare against None explicitly: a loss of 0.0 is a valid value, and a
        # missing value would otherwise crash the ":.4f" formatting below.
        if tl is None or vl is None:
            return

        pv = env.model.predict(X_test)
        auc = roc_auc_score(y_test, pv)
        pr = average_precision_score(y_test, pv)
        metrics_history.append({"iter": it, "train_loss": tl, "val_loss": vl, "auc": auc, "prauc": pr})

        elapsed = time.time() - start_time
        rate = it / elapsed if elapsed > 0 else 0
        best_auc = max(m["auc"] for m in metrics_history)
        layout = Layout()
        layout.split_column(
            Layout(Panel(make_table(metrics_history), title="metrics log", border_style="blue"), name="table"),
            Layout(Panel(
                f" iter {it}/{params['n_estimators']} elapsed {int(elapsed)}s rate {rate:.1f} iter/s\n"
                f" best AUC [bold green]{best_auc:.3f}[/] val loss {vl:.4f} train loss {tl:.4f}",
                title="status", border_style="dim"), name="status", size=5),
        )
        self.live.update(layout)


with Live(console=console, refresh_per_second=4) as live:
    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        # The first eval set is the training data itself; the callback looks
        # the two sets up under the names "training" and "valid_1".
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_metric="binary_logloss",
        callbacks=[lgb.early_stopping(50, verbose=False), lgb.log_evaluation(-1), DashCallback(live)],
    )
progress.stop()

# ── 6. Cross-validation ────────────────────────────────────────────────────
console.rule("[bold]Step 5 — Cross-validation (5-fold)")
gkf = GroupKFold(n_splits=5)
cv_params = {**params, "n_estimators": model.best_iteration_ or 100}
cv_model = lgb.LGBMClassifier(**cv_params)
# One cross_validate call scores both metrics from a single fit per fold;
# two cross_val_score calls trained every fold twice.
cv_scores = cross_validate(cv_model, X, y, groups=groups, cv=gkf,
                           scoring=["roc_auc", "average_precision"])
cv_auc = cv_scores["test_roc_auc"]
cv_pr = cv_scores["test_average_precision"]
console.print(f" CV ROC-AUC : {cv_auc.mean():.3f} ± {cv_auc.std():.3f} folds: {[f'{v:.3f}' for v in cv_auc]}")
console.print(f" CV PR-AUC : {cv_pr.mean():.3f} ± {cv_pr.std():.3f} folds: {[f'{v:.3f}' for v in cv_pr]}")

# ── 7. Final report ────────────────────────────────────────────────────────
console.rule("[bold]Final results")
y_pred = model.predict_proba(X_test)[:, 1]
auc_final = roc_auc_score(y_test, y_pred)
prauc_final = average_precision_score(y_test, y_pred)
ll_final = log_loss(y_test, y_pred)
train_auc = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])
gap = train_auc - auc_final
console.print(f" ROC-AUC : {auc_final:.3f} (baseline {b_auc:.3f})")
console.print(f" PR-AUC : {prauc_final:.3f} (baseline {b_prauc:.3f})")
console.print(f" log loss : {ll_final:.4f}")
console.print(f" best iter: {model.best_iteration_}")
console.print(f" train/test gap : {gap:.3f} {'[yellow]still overfitting[/]' if gap > 0.10 else '[green]healthy[/]'}")
console.print(f" CV AUC : {cv_auc.mean():.3f} ± {cv_auc.std():.3f} {'[green]beats baseline[/]' if cv_auc.mean() > b_auc else '[yellow]does not beat baseline on CV[/]'}")

# Threshold table
pvals, rvals, thresholds = precision_recall_curve(y_test, y_pred)
console.print(f"\n {'threshold':>10} {'precision':>10} {'recall':>8} {'trades':>7} {'EV/trade':>10}")
prev_p = 0
# zip() stops at len(thresholds), dropping the final precision/recall point
# that has no threshold of its own.
for p, r, t in zip(pvals, rvals, thresholds):
    n = int((y_pred >= t).sum())
    if p >= 0.55 and n >= 5 and (abs(p - prev_p) >= 0.02 or n in [100, 75, 50, 40, 30, 20, 10]):
        # NOTE(review): 0.70 / 0.30 look like assumed win / loss sizes per trade — confirm.
        ev = p * 0.70 - (1 - p) * 0.30
        console.print(f" {t:>10.2f} {p:>10.2f} {r:>8.2f} {n:>7} {ev:>+10.3f}")
        prev_p = p
    if p >= 0.85:
        break

# Feature importance
console.rule("[bold]Top 20 features")
feat_imp = pd.Series(model.feature_importances_, index=feature_cols).sort_values(ascending=False)
# Guard the bar scaling: with no features, or all-zero importances, the
# original division produced NaN and int(nan) raised ValueError.
top_score = feat_imp.iloc[0] if len(feat_imp) else 0
for fname, score in feat_imp.head(20).items():
    bar = "█" * int(score / top_score * 30) if top_score > 0 else ""
    console.print(f" {fname:<25} {bar} {score}")

# ── 8. Save ────────────────────────────────────────────────────────────────
payload = {
    "model": model,
    "feature_cols": feature_cols,
    "params": params,
    "baseline_auc": b_auc,
    "lgbm_auc": auc_final,
    "lgbm_prauc": prauc_final,
    "cv_auc_mean": float(cv_auc.mean()),
    "cv_auc_std": float(cv_auc.std()),
    "train_test_gap": float(gap),
    "version": MODEL_VERSION,
}
with open(MODEL_OUT, "wb") as f:
    pickle.dump(payload, f)

metrics_df = pd.DataFrame(metrics_history)
metrics_df.to_csv(RESULT_DIR / f"training_metrics_{MODEL_VERSION}.csv", index=False)
feat_imp.to_csv(RESULT_DIR / f"feature_importance_{MODEL_VERSION}.csv", header=["importance"])

console.print(f"\n [green]model → {MODEL_OUT}[/]")
console.print(f" [green]metrics → {RESULT_DIR}/training_metrics_{MODEL_VERSION}.csv[/]")
console.print(f" [green]features→ {RESULT_DIR}/feature_importance_{MODEL_VERSION}.csv[/]")