| |
| |
| |
| |
| |
| |
| |
|
|
| import sys, time, warnings, pickle |
| from pathlib import Path |
|
|
| import pandas as pd |
| import numpy as np |
| import lightgbm as lgb |
| from sklearn.metrics import roc_auc_score, average_precision_score, log_loss |
| from sklearn.linear_model import LogisticRegression |
| from sklearn.preprocessing import StandardScaler |
| from sklearn.pipeline import Pipeline |
| from sklearn.model_selection import cross_val_score, GroupKFold |
| from rich.live import Live |
| from rich.table import Table |
| from rich.progress import Progress, BarColumn, TimeElapsedColumn, TimeRemainingColumn |
| from rich.layout import Layout |
| from rich.panel import Panel |
| from rich.console import Console |
| from rich import box |
|
|
| |
# Project layout: ROOT is two directory levels above this script.
ROOT = Path(__file__).parent.parent
DATA_DIR = ROOT.joinpath("data", "raw")
MODEL_DIR = ROOT.joinpath("models")
RESULT_DIR = ROOT.joinpath("results")

# Both output directories must exist before anything is written into them.
for out_dir in (MODEL_DIR, RESULT_DIR):
    out_dir.mkdir(parents=True, exist_ok=True)

# Put <ROOT>/src first on the import path so that `features` can be imported.
sys.path.insert(0, str(ROOT.joinpath("src")))
from features import (
    build_features,
    get_entry_snapshot,
    get_feature_cols,
    TARGET_COL,
    GROUP_COL,
)

# Silence every warning category for the remainder of the run.
warnings.filterwarnings("ignore")
console = Console()
|
|
| |
# Command line: [--model VERSION] [CSV ...]
# NOTE(review): the "β" glyphs in the messages below look like a mis-decoded
# UTF-8 symbol; they are kept verbatim — confirm the file's encoding.
args = sys.argv[1:]

if "--model" in args:
    idx = args.index("--model")
    # `--model` given as the last token has no value; exit with a message
    # instead of letting `args[idx + 1]` raise an IndexError traceback.
    if idx + 1 >= len(args):
        console.print("[red]--model requires a version argument (e.g. --model v3)[/]")
        sys.exit(1)
    MODEL_VERSION = args[idx + 1]
    # Drop the flag and its value; whatever remains is treated as CSV paths.
    args = args[:idx] + args[idx + 2:]
else:
    # No explicit version: use one past the highest integer suffix found among
    # the already-saved `lgbm_swing_v<N>.pkl` files, or "v1" when there are none.
    nums = []
    for p in MODEL_DIR.glob("lgbm_swing_v*.pkl"):
        try:
            nums.append(int(p.stem.split("_v")[-1]))
        except ValueError:
            # Suffix is not a plain integer; it does not take part in numbering.
            continue
    MODEL_VERSION = f"v{max(nums) + 1}" if nums else "v1"

# Remaining positional arguments are CSV paths; with none given, fall back to
# every CSV in DATA_DIR (sorted for a stable load order).
csv_files = [Path(a) for a in args] if args else sorted(DATA_DIR.glob("*.csv"))

if not csv_files:
    console.print(f"[red]No CSV files found in {DATA_DIR}[/]")
    sys.exit(1)

MODEL_OUT = MODEL_DIR / f"lgbm_swing_{MODEL_VERSION}.pkl"

console.print(f"\n model version : [bold]{MODEL_VERSION}[/] β {MODEL_OUT.name}")
if MODEL_OUT.exists():
    console.print(f" [yellow]warning: {MODEL_OUT.name} already exists β will overwrite[/]")
|
|
| |
# Base keyword arguments for lgb.LGBMClassifier; `scale_pos_weight` is added
# later once the class balance of the training split is known.
LGBM_PARAMS = {
    "n_estimators": 500,
    "learning_rate": 0.02,
    "max_depth": 3,
    "num_leaves": 6,
    "min_child_samples": 25,
    # NOTE(review): LightGBM documents row subsampling as active only when
    # `subsample_freq` > 0, which is not set here — confirm bagging is intended.
    "subsample": 0.65,
    "colsample_bytree": 0.55,
    "reg_lambda": 2.0,
    "reg_alpha": 0.1,
    "random_state": 42,
    "verbose": -1,
}
|
|
| |
console.rule("[bold]Step 1 β Load data")


def _read_logged(csv_path):
    """Announce *csv_path* on the console, then load it as a DataFrame."""
    console.print(f" reading {csv_path.name} β¦")
    return pd.read_csv(csv_path)


# Stack all input files row-wise under a fresh 0..n-1 index.
df = pd.concat([_read_logged(csv_path) for csv_path in csv_files], ignore_index=True)

# Quick sanity summary of the raw data.
console.print(f" shape : {df.shape}")
console.print(f" markets : {df[GROUP_COL].nunique()}")
console.print(f" swing rate : {df[TARGET_COL].mean():.1%}")
|
|
| |
console.rule("[bold]Step 2 β Features & entry snapshot")

# Feature engineering and row selection are delegated to the `features` module.
df = build_features(df)
df_model = get_entry_snapshot(df)
feature_cols = get_feature_cols(df_model)

console.print(f" markets with entry tick : {len(df_model)}")
console.print(f" swing rate at entry : {df_model[TARGET_COL].mean():.1%}")
console.print(f" features kept : {len(feature_cols)}")

# Model inputs: missing feature values are replaced by 0.
y = df_model[TARGET_COL]
groups = df_model[GROUP_COL]
X = df_model[feature_cols].fillna(0)

# Hold-out split by group id: ids up to 80% of the largest id train, the rest test.
# NOTE(review): this assumes GROUP_COL is numeric and presumably ordered in
# time — confirm against the data source.
cutoff = int(groups.max() * 0.80)
train_mask = groups <= cutoff
test_mask = groups > cutoff
X_train = X[train_mask]
X_test = X[test_mask]
y_train = y[train_mask]
y_test = y[test_mask]
console.print(f"\n train : {train_mask.sum()} markets ({y_train.sum()} pos / {(y_train==0).sum()} neg)")
console.print(f" test : {test_mask.sum()} markets ({y_test.sum()} pos / {(y_test==0).sum()} neg)")
|
|
| |
console.rule("[bold]Step 3 β Baseline (logistic regression)")

# Reference model: standardised features into a class-balanced logistic regression.
scaler_step = ("scaler", StandardScaler())
lr_step = ("lr", LogisticRegression(class_weight="balanced", max_iter=1000))
baseline = Pipeline([scaler_step, lr_step])
baseline.fit(X_train, y_train)

# Positive-class probabilities on the hold-out set; these scores are what the
# LightGBM model is compared against later.
b_proba = baseline.predict_proba(X_test)[:, 1]
b_auc = roc_auc_score(y_test, b_proba)
b_prauc = average_precision_score(y_test, b_proba)
for baseline_line in (
    f" baseline ROC-AUC : {b_auc:.3f}",
    f" baseline PR-AUC : {b_prauc:.3f}",
):
    console.print(baseline_line)
|
|
| |
console.rule("[bold]Step 4 β LightGBM training")

# Weight the positive class by the negative/positive ratio of the training split.
neg = (y_train == 0).sum()
pos = (y_train == 1).sum()
params = dict(LGBM_PARAMS)
params["scale_pos_weight"] = neg / pos

# Shared state that the dashboard callback reads and appends to.
metrics_history = []
start_time = time.time()

# Progress tracker whose single task counts boosting iterations.
progress_columns = [
    "[progress.description]{task.description}",
    BarColumn(),
    "[progress.percentage]{task.percentage:>5.1f}%",
    TimeElapsedColumn(),
    TimeRemainingColumn(),
]
progress = Progress(*progress_columns)
task = progress.add_task("training", total=params["n_estimators"])
|
|
def make_table(h):
    """Build a rich Table showing the last eight rows of the metrics history.

    Args:
        h: list of dicts with the keys "iter", "train_loss", "val_loss",
            "auc" and "prauc", oldest first.

    Returns:
        A Table with one row per shown entry. The last column holds the change
        in validation loss against the previous history entry (green when it
        dropped, yellow otherwise; blank for the very first entry). The best
        AUC over the whole history is rendered in bold green.
    """
    # NOTE(review): the "Ξ" and "β" glyphs below look like mis-decoded UTF-8
    # symbols; they are kept verbatim — confirm the file's encoding.
    column_specs = (
        ("iter", {"style": "dim", "width": 6}),
        ("train loss", {"width": 12}),
        ("val loss", {"width": 12}),
        ("val AUC", {"width": 10}),
        ("val PR-AUC", {"width": 10}),
        ("Ξ loss", {"width": 10}),
    )
    t = Table(box=box.SIMPLE, show_header=True, header_style="bold")
    for header, options in column_specs:
        t.add_column(header, **options)

    best_auc = max((m["auc"] for m in h), default=0)

    # Index into `h` of the first displayed row, so deltas can look one entry back.
    first_shown = max(len(h) - 8, 0)
    for position, r in enumerate(h[-8:], start=first_shown):
        if position == 0:
            delta = ""
        else:
            d = r["val_loss"] - h[position - 1]["val_loss"]
            if d < 0:
                delta = f"[green]β{abs(d):.4f}[/]"
            else:
                delta = f"[yellow]β{abs(d):.4f}[/]"

        if r["auc"] == best_auc:
            auc_str = f"[bold green]{r['auc']:.3f}[/]"
        else:
            auc_str = f"{r['auc']:.3f}"

        t.add_row(
            str(r["iter"]),
            f"{r['train_loss']:.4f}",
            f"{r['val_loss']:.4f}",
            auc_str,
            f"{r['prauc']:.3f}",
            delta,
        )
    return t
|
|
class DashCallback:
    """Training callback that refreshes the live dashboard.

    It acts on the first iteration and on every tenth iteration. Each time it
    reads the train/validation log-loss from the evaluation results, scores
    the current model on the hold-out set, appends a row to the module-level
    ``metrics_history`` (only when both losses are available) and redraws the
    layout shown by ``live``.

    Args:
        live: the rich ``Live`` display to update.
    """

    # NOTE(review): presumably read by LightGBM to sequence callbacks — confirm.
    order = 1

    def __init__(self, live):
        self.live = live

    def __call__(self, env):
        """Handle one callback invocation for the iteration described by *env*."""
        it = env.iteration + 1
        if it % 10 != 0 and it != 1:
            return

        # Keys are "<dataset name> <metric name>".
        ed = {f"{r[0]} {r[1]}": r[2] for r in env.evaluation_result_list}
        tl = ed.get("training binary_logloss")
        # NOTE(review): "valid_1" presumably names the second eval_set entry
        # (the hold-out set) — confirm against the fit() call.
        vl = ed.get("valid_1 binary_logloss")

        pv = env.model.predict(X_test)
        auc = roc_auc_score(y_test, pv)
        pr = average_precision_score(y_test, pv)

        # Compare against None rather than truthiness: a loss of exactly 0.0 is
        # still a valid reading and must be recorded.
        if tl is not None and vl is not None:
            metrics_history.append(
                {"iter": it, "train_loss": tl, "val_loss": vl, "auc": auc, "prauc": pr}
            )

        elapsed = time.time() - start_time
        rate = it / elapsed if elapsed > 0 else 0
        best_auc = max((m["auc"] for m in metrics_history), default=0)

        # A missing metric used to reach `{vl:.4f}` as None and raise TypeError
        # in the middle of training; show a placeholder instead.
        vl_text = "n/a" if vl is None else f"{vl:.4f}"
        tl_text = "n/a" if tl is None else f"{tl:.4f}"

        layout = Layout()
        layout.split_column(
            Layout(Panel(make_table(metrics_history), title="metrics log", border_style="blue"), name="table"),
            Layout(Panel(
                f" iter {it}/{params['n_estimators']} elapsed {int(elapsed)}s rate {rate:.1f} iter/s\n"
                f" best AUC [bold green]{best_auc:.3f}[/] val loss {vl_text} train loss {tl_text}",
                title="status", border_style="dim"), name="status", size=5),
        )
        self.live.update(layout)
        progress.update(task, completed=it)
|
|
# Fit the classifier while the dashboard is live.
# NOTE: the hold-out set (X_test, y_test) is also the early-stopping validation
# set here, so metrics later reported on it are not from fully unseen data.
model = lgb.LGBMClassifier(**params)
eval_pairs = [(X_train, y_train), (X_test, y_test)]

with Live(console=console, refresh_per_second=4) as live:
    fit_callbacks = [
        lgb.early_stopping(50, verbose=False),
        lgb.log_evaluation(-1),
        DashCallback(live),
    ]
    model.fit(
        X_train,
        y_train,
        eval_set=eval_pairs,
        eval_metric="binary_logloss",
        callbacks=fit_callbacks,
    )

# NOTE(review): `progress` is never started or rendered in the code shown, so
# this call presumably has nothing to tear down — confirm.
progress.stop()
|
|
| |
console.rule("[bold]Step 5 β Cross-validation (5-fold)")
from sklearn.model_selection import cross_validate

# Grouped folds keep every row of a group on the same side of each split.
gkf = GroupKFold(n_splits=5)

# Reuse the tree count chosen by early stopping; fall back to 100 if unset.
cv_params = {**params, "n_estimators": model.best_iteration_ or 100}
cv_model = lgb.LGBMClassifier(**cv_params)

# Score both metrics from a single pass over the folds. Two separate
# cross_val_score calls would refit the identical five models a second time.
cv_scores = cross_validate(
    cv_model,
    X,
    y,
    groups=groups,
    cv=gkf,
    scoring=("roc_auc", "average_precision"),
)
cv_auc = cv_scores["test_roc_auc"]
cv_pr = cv_scores["test_average_precision"]

console.print(f" CV ROC-AUC : {cv_auc.mean():.3f} Β± {cv_auc.std():.3f} folds: {[f'{v:.3f}' for v in cv_auc]}")
console.print(f" CV PR-AUC : {cv_pr.mean():.3f} Β± {cv_pr.std():.3f} folds: {[f'{v:.3f}' for v in cv_pr]}")
|
|
| |
console.rule("[bold]Final results")

# Positive-class probabilities on the hold-out and training splits.
y_pred = model.predict_proba(X_test)[:, 1]
train_pred = model.predict_proba(X_train)[:, 1]

# Hold-out metrics, all computed from the same predictions.
auc_final, prauc_final, ll_final = (
    metric(y_test, y_pred)
    for metric in (roc_auc_score, average_precision_score, log_loss)
)

# Train-minus-test AUC; the report flags a gap above 0.10 as overfitting.
train_auc = roc_auc_score(y_train, train_pred)
gap = train_auc - auc_final

report_lines = (
    f" ROC-AUC : {auc_final:.3f} (baseline {b_auc:.3f})",
    f" PR-AUC : {prauc_final:.3f} (baseline {b_prauc:.3f})",
    f" log loss : {ll_final:.4f}",
    f" best iter: {model.best_iteration_}",
    f" train/test gap : {gap:.3f} {'[yellow]still overfitting[/]' if gap > 0.10 else '[green]healthy[/]'}",
    f" CV AUC : {cv_auc.mean():.3f} Β± {cv_auc.std():.3f} {'[green]beats baseline[/]' if cv_auc.mean() > b_auc else '[yellow]does not beat baseline on CV[/]'}",
)
for report_line in report_lines:
    console.print(report_line)
|
|
| |
from sklearn.metrics import precision_recall_curve

# Threshold table: a row is printed when precision is at least 0.55 with at
# least 5 selected rows, and either precision moved by 0.02 or more since the
# last printed row or the selected count hits one of the milestone values.
_MILESTONE_TRADE_COUNTS = (100, 75, 50, 40, 30, 20, 10)

pvals, rvals, thresholds = precision_recall_curve(y_test, y_pred)
console.print(f"\n {'threshold':>10} {'precision':>10} {'recall':>8} {'trades':>7} {'EV/trade':>10}")

prev_p = 0
for p, r, t in zip(pvals, rvals, thresholds):
    # Number of hold-out rows that would be selected at this threshold.
    n = int(np.count_nonzero(y_pred >= t))
    if p >= 0.55 and n >= 5:
        if abs(p - prev_p) >= 0.02 or n in _MILESTONE_TRADE_COUNTS:
            # Expected value per selected row: +0.70 weighted by precision,
            # -0.30 weighted by its complement.
            ev = p * 0.70 - (1 - p) * 0.30
            console.print(f" {t:>10.2f} {p:>10.2f} {r:>8.2f} {n:>7} {ev:>+10.3f}")
            prev_p = p
    # Stop scanning once precision reaches 0.85.
    if p >= 0.85:
        break
|
|
| |
console.rule("[bold]Top 20 features")

# Importances sorted high to low, so the first entry is the largest.
feat_imp = pd.Series(model.feature_importances_, index=feature_cols).sort_values(ascending=False)

# Bars are scaled so the top feature gets 30 cells. If the top importance is 0,
# dividing by it yields NaN and int() raises ValueError, so such a model gets
# empty bars instead.
top_score = feat_imp.iloc[0] if len(feat_imp) else 0
for fname, score in feat_imp.head(20).items():
    bar = "β" * int(score / top_score * 30) if top_score > 0 else ""
    console.print(f" {fname:<25} {bar} {score}")
|
|
| |
# Everything needed to reload and audit this run goes into one pickle.
payload = dict(
    model=model,
    feature_cols=feature_cols,
    params=params,
    baseline_auc=b_auc,
    lgbm_auc=auc_final,
    lgbm_prauc=prauc_final,
    cv_auc_mean=float(cv_auc.mean()),
    cv_auc_std=float(cv_auc.std()),
    train_test_gap=float(gap),
    version=MODEL_VERSION,
)
with MODEL_OUT.open("wb") as sink:
    pickle.dump(payload, sink)

# Per-iteration dashboard metrics and the full feature-importance ranking.
metrics_csv = RESULT_DIR / f"training_metrics_{MODEL_VERSION}.csv"
importance_csv = RESULT_DIR / f"feature_importance_{MODEL_VERSION}.csv"
metrics_df = pd.DataFrame(metrics_history)
metrics_df.to_csv(metrics_csv, index=False)
feat_imp.to_csv(importance_csv, header=["importance"])

# Tell the user where each artifact was written.
for saved_line in (
    f"\n [green]model β {MODEL_OUT}[/]",
    f" [green]metrics β {RESULT_DIR}/training_metrics_{MODEL_VERSION}.csv[/]",
    f" [green]featuresβ {RESULT_DIR}/feature_importance_{MODEL_VERSION}.csv[/]",
):
    console.print(saved_line)