# Source: uploaded by philippotiger with huggingface_hub
# ("Upload src/train.py with huggingface_hub"), commit 3f30366 (verified).
# src/train.py
#
# Usage:
# python src/train.py → all CSVs in data/raw/, auto version
# python src/train.py data/raw/file.csv → one file, auto version
# python src/train.py data/raw/file.csv --model v5 → one file, explicit version
# python src/train.py --model v5 → all CSVs, explicit version (overwrites)
import sys, time, warnings, pickle
from pathlib import Path
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.metrics import roc_auc_score, average_precision_score, log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GroupKFold
from rich.live import Live
from rich.table import Table
from rich.progress import Progress, BarColumn, TimeElapsedColumn, TimeRemainingColumn
from rich.layout import Layout
from rich.panel import Panel
from rich.console import Console
from rich import box
# ── Paths ──────────────────────────────────────────────────────────────────
# Project root is two levels above this file (<root>/src/train.py).
# Resolve to an absolute path first: when __file__ is a bare relative name
# (e.g. the script is started from inside src/ on an interpreter that keeps
# __file__ relative), Path("train.py").parent.parent collapses to "." and
# every derived directory would end up under src/ instead of the project root.
ROOT = Path(__file__).resolve().parent.parent
DATA_DIR = ROOT / "data" / "raw"
MODEL_DIR = ROOT / "models"
RESULT_DIR = ROOT / "results"

# Create the output directories up front so the save step at the end of the
# script cannot fail on a missing folder.
for d in (MODEL_DIR, RESULT_DIR):
    d.mkdir(parents=True, exist_ok=True)

# Make the sibling `features` module importable regardless of the working
# directory the script is launched from.
sys.path.insert(0, str(ROOT / "src"))
from features import build_features, get_entry_snapshot, get_feature_cols, TARGET_COL, GROUP_COL

# Suppress all warnings for the rest of the run.
warnings.filterwarnings("ignore")
console = Console()
# ── Argument parsing ───────────────────────────────────────────────────────
args = sys.argv[1:]

# --model flag: explicit version name
if "--model" in args:
    idx = args.index("--model")
    # The flag must be followed by a value; previously a trailing `--model`
    # crashed with an IndexError instead of a usage message.
    if idx + 1 >= len(args):
        console.print("[red]--model requires a version name, e.g. --model v5[/]")
        sys.exit(1)
    MODEL_VERSION = args[idx + 1]
    # Drop the flag and its value; whatever is left are CSV paths.
    args = args[:idx] + args[idx + 2:]
else:
    # auto-increment: scan models/ for highest vN, add 1
    existing = sorted(MODEL_DIR.glob("lgbm_swing_v*.pkl"))
    nums = []
    for p in existing:
        try:
            nums.append(int(p.stem.split("_v")[-1]))
        except ValueError:
            # File names whose version suffix is not an integer are ignored
            # when picking the next number.
            continue
    MODEL_VERSION = f"v{max(nums) + 1}" if nums else "v1"

# remaining args are CSV file paths
if args:
    csv_files = [Path(a) for a in args]
    # Report missing inputs here rather than as a pandas traceback later.
    missing = [c for c in csv_files if not c.is_file()]
    if missing:
        console.print(f"[red]CSV file not found: {', '.join(str(m) for m in missing)}[/]")
        sys.exit(1)
else:
    csv_files = sorted(DATA_DIR.glob("*.csv"))

if not csv_files:
    console.print(f"[red]No CSV files found in {DATA_DIR}[/]")
    sys.exit(1)

MODEL_OUT = MODEL_DIR / f"lgbm_swing_{MODEL_VERSION}.pkl"
console.print(f"\n model version : [bold]{MODEL_VERSION}[/] → {MODEL_OUT.name}")
if MODEL_OUT.exists():
    console.print(f" [yellow]warning: {MODEL_OUT.name} already exists — will overwrite[/]")
# ── Model params ───────────────────────────────────────────────────────────
# Base keyword arguments for lgb.LGBMClassifier. The training step copies this
# mapping and adds "scale_pos_weight" before constructing the model.
LGBM_PARAMS = {
    "n_estimators": 500,
    "learning_rate": 0.02,
    "max_depth": 3,
    "num_leaves": 6,
    "min_child_samples": 25,
    "subsample": 0.65,
    "colsample_bytree": 0.55,
    "reg_lambda": 2.0,
    "reg_alpha": 0.1,
    "random_state": 42,
    "verbose": -1,
}
# ── 1. Load ────────────────────────────────────────────────────────────────
console.rule("[bold]Step 1 — Load data")

# Read every input CSV and stack them into one frame with a fresh 0..n-1 index.
frames = []
for f in csv_files:
    console.print(f" reading {f.name} …")
    frames.append(pd.read_csv(f))
df = pd.concat(frames, ignore_index=True)

# Both columns are read from the raw frame just below, so fail here with a
# readable message instead of a KeyError traceback.
missing_cols = [c for c in (GROUP_COL, TARGET_COL) if c not in df.columns]
if missing_cols:
    console.print(f"[red]Missing required column(s): {', '.join(map(str, missing_cols))}[/]")
    sys.exit(1)

console.print(f" shape : {df.shape}")
console.print(f" markets : {df[GROUP_COL].nunique()}")
console.print(f" swing rate : {df[TARGET_COL].mean():.1%}")
# ── 2. Feature engineering + entry snapshot ───────────────────────────────
console.rule("[bold]Step 2 — Features & entry snapshot")

# build_features / get_entry_snapshot / get_feature_cols come from the project's
# `features` module; their internals are not visible from this file.
df = build_features(df)
df_model = get_entry_snapshot(df)

# Without any snapshot rows every later step (split, fit, metrics) would fail
# with an unrelated-looking error, so stop here with a clear message.
# NOTE(review): assumes get_entry_snapshot returns a pandas DataFrame, which is
# how the result is used below — confirm.
if len(df_model) == 0:
    console.print("[red]Entry snapshot is empty — nothing to train on[/]")
    sys.exit(1)

feature_cols = get_feature_cols(df_model)
console.print(f" markets with entry tick : {len(df_model)}")
console.print(f" swing rate at entry : {df_model[TARGET_COL].mean():.1%}")
console.print(f" features kept : {len(feature_cols)}")

# Model inputs: missing feature values are replaced with 0.
X = df_model[feature_cols].fillna(0)
y = df_model[TARGET_COL]
groups = df_model[GROUP_COL]
# ── 3. Train / test split (time-based) ────────────────────────────────────
# Rows whose group id is at or below 80% of the largest group id form the
# training set; rows above that cutoff form the test set.
# NOTE(review): this is only a time-based split if group ids increase with
# time — confirm against the data.
cutoff = int(groups.max() * 0.80)
train_mask = groups <= cutoff
test_mask = groups > cutoff

X_train, y_train = X[train_mask], y[train_mask]
X_test, y_test = X[test_mask], y[test_mask]

# Class counts per split, reported for a quick sanity check.
_train_pos, _train_neg = y_train.sum(), (y_train == 0).sum()
_test_pos, _test_neg = y_test.sum(), (y_test == 0).sum()
console.print(f"\n train : {train_mask.sum()} markets ({_train_pos} pos / {_train_neg} neg)")
console.print(f" test : {test_mask.sum()} markets ({_test_pos} pos / {_test_neg} neg)")
# ── 4. Baseline ────────────────────────────────────────────────────────────
console.rule("[bold]Step 3 — Baseline (logistic regression)")

# Reference model: standardised features fed into a class-weighted logistic
# regression. Its test-set scores are printed next to the LightGBM scores in
# the final report.
baseline = Pipeline([
    ("scaler", StandardScaler()),
    ("lr", LogisticRegression(class_weight="balanced", max_iter=1000)),
])
baseline.fit(X_train, y_train)

# Second predict_proba column is used as the score for the positive class.
b_proba = baseline.predict_proba(X_test)[:, 1]
b_auc = roc_auc_score(y_test, b_proba)
b_prauc = average_precision_score(y_test, b_proba)
console.print(f" baseline ROC-AUC : {b_auc:.3f}")
console.print(f" baseline PR-AUC : {b_prauc:.3f}")
# ── 5. LightGBM ────────────────────────────────────────────────────────────
console.rule("[bold]Step 4 — LightGBM training")

# Weight the positive class by the negative/positive ratio of the training split.
neg, pos = (y_train == 0).sum(), (y_train == 1).sum()
params = {**LGBM_PARAMS, "scale_pos_weight": neg / pos}

# Shared state read and written by DashCallback during training.
metrics_history = []
start_time = time.time()

# Progress tracker updated by the callback (one unit per boosting iteration).
progress = Progress(
    "[progress.description]{task.description}",
    BarColumn(),
    "[progress.percentage]{task.percentage:>5.1f}%",
    TimeElapsedColumn(),
    TimeRemainingColumn(),
)
task = progress.add_task("training", total=params["n_estimators"])
def make_table(h):
    """Render the most recent metric snapshots as a rich table.

    Args:
        h: list of dicts with the keys "iter", "train_loss", "val_loss",
            "auc" and "prauc" (the records DashCallback appends to
            ``metrics_history``). May be empty.

    Returns:
        A rich ``Table`` with one row per snapshot for the last 8 entries of
        ``h``. The best AUC seen in the whole history is highlighted, and each
        row shows the change in validation loss versus the previous snapshot.
    """
    t = Table(box=box.SIMPLE, show_header=True, header_style="bold")
    t.add_column("iter", style="dim", width=6)
    t.add_column("train loss", width=12)
    t.add_column("val loss", width=12)
    t.add_column("val AUC", width=10)
    t.add_column("val PR-AUC", width=10)
    t.add_column("Δ loss", width=10)

    best_auc = max((m["auc"] for m in h), default=0)

    # Index in `h` of the first displayed row, so deltas can look one entry
    # back even when that entry has scrolled out of the visible window.
    first = max(len(h) - 8, 0)
    for abs_i, r in enumerate(h[first:], start=first):
        delta = ""
        if abs_i > 0:
            d = r["val_loss"] - h[abs_i - 1]["val_loss"]
            delta = f"[green]↓{abs(d):.4f}[/]" if d < 0 else f"[yellow]↑{abs(d):.4f}[/]"
        auc_str = f"[bold green]{r['auc']:.3f}[/]" if r["auc"] == best_auc else f"{r['auc']:.3f}"
        t.add_row(
            str(r["iter"]),
            f"{r['train_loss']:.4f}",
            f"{r['val_loss']:.4f}",
            auc_str,
            f"{r['prauc']:.3f}",
            delta,
        )
    return t
class DashCallback:
    """Training callback that refreshes the live console dashboard.

    An instance is passed in the ``callbacks`` list of ``LGBMClassifier.fit``
    and is called with the per-iteration environment object. It reads and
    updates the module-level ``metrics_history``, ``progress`` and ``task``,
    and scores the current model on ``X_test`` / ``y_test``.
    """

    # NOTE(review): presumably read by LightGBM to order callbacks — confirm
    # against the LightGBM callback documentation.
    order = 1

    def __init__(self, live):
        # `live` is the rich Live display whose content is replaced on refresh.
        self.live = live

    def __call__(self, env):
        it = env.iteration + 1
        # Refresh on the first iteration and then on every 10th.
        if it % 10 != 0 and it != 1:
            return

        # Keys are "<dataset name> <metric name>", values are the metric value.
        ed = {f"{r[0]} {r[1]}": r[2] for r in env.evaluation_result_list}
        tl = ed.get("training binary_logloss")
        vl = ed.get("valid_1 binary_logloss")

        pv = env.model.predict(X_test)
        auc = roc_auc_score(y_test, pv)
        pr = average_precision_score(y_test, pv)

        # Compare against None explicitly: a loss of exactly 0.0 is a valid
        # value and must not be treated as "missing".
        if tl is not None and vl is not None:
            metrics_history.append({"iter": it, "train_loss": tl, "val_loss": vl, "auc": auc, "prauc": pr})

        elapsed = time.time() - start_time
        rate = it / elapsed if elapsed > 0 else 0
        best_auc = max((m["auc"] for m in metrics_history), default=0)

        # Formatting None with ":.4f" raises TypeError, so fall back to a
        # placeholder when an expected metric key is absent.
        vl_str = f"{vl:.4f}" if vl is not None else "n/a"
        tl_str = f"{tl:.4f}" if tl is not None else "n/a"

        layout = Layout()
        layout.split_column(
            Layout(Panel(make_table(metrics_history), title="metrics log", border_style="blue"), name="table"),
            Layout(Panel(
                f" iter {it}/{params['n_estimators']} elapsed {int(elapsed)}s rate {rate:.1f} iter/s\n"
                f" best AUC [bold green]{best_auc:.3f}[/] val loss {vl_str} train loss {tl_str}",
                title="status", border_style="dim"), name="status", size=5),
        )
        self.live.update(layout)
        progress.update(task, completed=it)
# Fit the classifier while the live dashboard is on screen. Both the training
# and the test split are passed as evaluation sets; early stopping uses a
# patience of 50 rounds and LightGBM's own per-iteration logging is disabled.
with Live(console=console, refresh_per_second=4) as live:
    model = lgb.LGBMClassifier(**params)
    _fit_callbacks = [
        lgb.early_stopping(50, verbose=False),
        lgb.log_evaluation(-1),
        DashCallback(live),
    ]
    model.fit(
        X_train,
        y_train,
        eval_set=[(X_train, y_train), (X_test, y_test)],
        eval_metric="binary_logloss",
        callbacks=_fit_callbacks,
    )
progress.stop()
# ── 6. Cross-validation ────────────────────────────────────────────────────
console.rule("[bold]Step 5 — Cross-validation (5-fold)")

# Imported here because the file's import block only brings in cross_val_score.
from sklearn.model_selection import cross_validate

gkf = GroupKFold(n_splits=5)
# Reuse the number of trees picked by early stopping; fall back to 100 when
# best_iteration_ is unset or 0.
cv_params = {**params, "n_estimators": model.best_iteration_ or 100}
cv_model = lgb.LGBMClassifier(**cv_params)

# Score both metrics in one pass. Two separate cross_val_score calls refit
# the identical model on every fold twice for the same numbers.
cv_scores = cross_validate(
    cv_model, X, y,
    groups=groups,
    cv=gkf,
    scoring=["roc_auc", "average_precision"],
)
cv_auc = cv_scores["test_roc_auc"]
cv_pr = cv_scores["test_average_precision"]

console.print(f" CV ROC-AUC : {cv_auc.mean():.3f} ± {cv_auc.std():.3f} folds: {[f'{v:.3f}' for v in cv_auc]}")
console.print(f" CV PR-AUC : {cv_pr.mean():.3f} ± {cv_pr.std():.3f} folds: {[f'{v:.3f}' for v in cv_pr]}")
# ── 7. Final report ────────────────────────────────────────────────────────
console.rule("[bold]Final results")

# Test-set scores of the fitted LightGBM model.
# NOTE(review): the same test split is passed to fit() as an evaluation set
# with early stopping enabled, so these numbers may be optimistic.
y_pred = model.predict_proba(X_test)[:, 1]
auc_final = roc_auc_score(y_test, y_pred)
prauc_final = average_precision_score(y_test, y_pred)
ll_final = log_loss(y_test, y_pred)

# Difference between train and test ROC-AUC; above 0.10 is flagged below.
train_auc = roc_auc_score(y_train, model.predict_proba(X_train)[:, 1])
gap = train_auc - auc_final

gap_note = "[yellow]still overfitting[/]" if gap > 0.10 else "[green]healthy[/]"
cv_note = (
    "[green]beats baseline[/]"
    if cv_auc.mean() > b_auc
    else "[yellow]does not beat baseline on CV[/]"
)

console.print(f" ROC-AUC : {auc_final:.3f} (baseline {b_auc:.3f})")
console.print(f" PR-AUC : {prauc_final:.3f} (baseline {b_prauc:.3f})")
console.print(f" log loss : {ll_final:.4f}")
console.print(f" best iter: {model.best_iteration_}")
console.print(f" train/test gap : {gap:.3f} {gap_note}")
console.print(f" CV AUC : {cv_auc.mean():.3f} ± {cv_auc.std():.3f} {cv_note}")
# Threshold table
from sklearn.metrics import precision_recall_curve

# Precision / recall at each candidate probability threshold on the test split.
pvals, rvals, thresholds = precision_recall_curve(y_test, y_pred)
console.print(f"\n {'threshold':>10} {'precision':>10} {'recall':>8} {'trades':>7} {'EV/trade':>10}")

prev_p = 0
for precision, recall, thr in zip(pvals, rvals, thresholds):
    # Number of test rows that would be selected at this threshold.
    n_trades = int((y_pred >= thr).sum())

    # A row is printed only when precision is at least 0.55 with at least 5
    # selected rows, and either precision moved by >= 0.02 since the last
    # printed row or the selected count is one of the listed round numbers.
    is_candidate = precision >= 0.55 and n_trades >= 5
    is_new_step = abs(precision - prev_p) >= 0.02 or n_trades in [100, 75, 50, 40, 30, 20, 10]
    if is_candidate and is_new_step:
        ev = precision * 0.70 - (1 - precision) * 0.30
        console.print(f" {thr:>10.2f} {precision:>10.2f} {recall:>8.2f} {n_trades:>7} {ev:>+10.3f}")
        prev_p = precision

    # NOTE(review): the original indentation was lost; this stop condition is
    # assumed to sit at loop level (checked for every threshold) — confirm.
    if precision >= 0.85:
        break
# Feature importance
console.rule("[bold]Top 20 features")
feat_imp = pd.Series(model.feature_importances_, index=feature_cols).sort_values(ascending=False)

# Bars are scaled against the largest importance (30 characters wide). Guard
# the empty and all-zero cases: dividing by a zero top score produces NaN,
# and int(NaN) raises ValueError.
top_score = feat_imp.iloc[0] if len(feat_imp) else 0
for fname, score in feat_imp.head(20).items():
    bar_len = int(score / top_score * 30) if top_score > 0 else 0
    bar = "█" * bar_len
    console.print(f" {fname:<25} {bar} {score}")
# ── 8. Save ────────────────────────────────────────────────────────────────
# Everything needed to reuse the model later: the estimator, the feature list
# it was fitted on, the parameters, and the headline scores of this run.
payload = {
    "model": model,
    "feature_cols": feature_cols,
    "params": params,
    "baseline_auc": b_auc,
    "lgbm_auc": auc_final,
    "lgbm_prauc": prauc_final,
    "cv_auc_mean": float(cv_auc.mean()),
    "cv_auc_std": float(cv_auc.std()),
    "train_test_gap": float(gap),
    "version": MODEL_VERSION,
}
# NOTE: pickle files execute code when loaded; only unpickle model files that
# come from a trusted source.
with open(MODEL_OUT, "wb") as f:
    pickle.dump(payload, f)

# Build each output path once so the printed locations are exactly the files
# that were written.
metrics_path = RESULT_DIR / f"training_metrics_{MODEL_VERSION}.csv"
feat_imp_path = RESULT_DIR / f"feature_importance_{MODEL_VERSION}.csv"

metrics_df = pd.DataFrame(metrics_history)
metrics_df.to_csv(metrics_path, index=False)
feat_imp.to_csv(feat_imp_path, header=["importance"])

console.print(f"\n [green]model → {MODEL_OUT}[/]")
console.print(f" [green]metrics → {metrics_path}[/]")
console.print(f" [green]features→ {feat_imp_path}[/]")