"""
ML-3m-trader Model
===================
LightGBM multi-class classifier for trade signal prediction.
Handles training, validation, prediction, and persistence.
"""
import os
import sys
import numpy as np
import pandas as pd
import joblib
import config as cfg
from features import get_feature_columns
try:
import lightgbm as lgb
except ImportError:
print("[ERROR] lightgbm not installed. Run: pip install lightgbm")
sys.exit(1)
def _ensure_dirs():
    """Create the directory named by ``cfg.MODEL_DIR`` if it does not exist."""
    model_dir = cfg.MODEL_DIR
    # exist_ok=True makes repeated calls harmless.
    os.makedirs(model_dir, exist_ok=True)
def train(df: pd.DataFrame, labels: np.ndarray) -> lgb.LGBMClassifier:
    """
    Train a LightGBM classifier on the provided feature DataFrame.

    The data is split by position (no shuffle): the first
    ``cfg.TRAIN_SPLIT_RATIO`` fraction of rows is used for fitting and the
    remaining rows form the validation set that drives early stopping.
    The split is chronological as long as the rows are in time order.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing every column returned by
        ``get_feature_columns()``.
    labels : np.ndarray
        Integer labels, one per row of ``df`` and in the same row order.

    Returns
    -------
    lgb.LGBMClassifier
        Trained model.

    Raises
    ------
    ValueError
        If ``labels`` and ``df`` differ in length, or if the split would
        leave the training or validation partition empty.
    """
    feature_cols = get_feature_columns()
    X = df[feature_cols].values.astype(np.float32)
    # np.asarray lets callers pass any array-like (list, Series, ndarray).
    y = np.asarray(labels).astype(np.int32)

    # Fail fast with a clear message instead of an opaque LightGBM error
    # raised part-way through dataset construction.
    if len(y) != len(X):
        raise ValueError(
            f"labels length ({len(y)}) does not match number of rows ({len(X)})"
        )

    split_idx = int(len(X) * cfg.TRAIN_SPLIT_RATIO)
    if split_idx <= 0 or split_idx >= len(X):
        raise ValueError(
            f"train/validation split at index {split_idx} of {len(X)} rows "
            "leaves an empty partition; check cfg.TRAIN_SPLIT_RATIO and the "
            "amount of data"
        )

    X_train, X_val = X[:split_idx], X[split_idx:]
    y_train, y_val = y[:split_idx], y[split_idx:]
    print(f"[INFO] Training set : {X_train.shape[0]:,} samples")
    print(f"[INFO] Validation set: {X_val.shape[0]:,} samples")

    # Copy so the shared config dict is not mutated by pop().
    params = dict(cfg.LGBM_PARAMS)
    # n_estimators is passed explicitly, so remove it from the kwargs to
    # avoid a duplicate-keyword error.
    n_est = params.pop("n_estimators", 500)
    model = lgb.LGBMClassifier(n_estimators=n_est, **params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="multi_logloss",
        callbacks=[
            lgb.early_stopping(cfg.EARLY_STOPPING_ROUNDS, verbose=True),
            lgb.log_evaluation(period=50),
        ],
    )

    # Feature importance
    imp = pd.DataFrame({
        "feature": feature_cols,
        "importance": model.feature_importances_,
    }).sort_values("importance", ascending=False)
    print("\n[INFO] Feature importance (top 10):")
    print(imp.head(10).to_string(index=False))
    return model
def predict(model: lgb.LGBMClassifier, df: pd.DataFrame) -> np.ndarray:
    """
    Predict a label for every row of the given DataFrame.

    Parameters
    ----------
    model : lgb.LGBMClassifier
        Classifier whose ``predict`` method is called.
    df : pd.DataFrame
        DataFrame containing the columns returned by
        ``get_feature_columns()``.

    Returns
    -------
    np.ndarray
        Result of ``model.predict`` on the float32 feature matrix.
    """
    columns = get_feature_columns()
    feature_frame = df[columns]
    # The feature matrix is cast to float32 before being passed to the model.
    matrix = feature_frame.values.astype(np.float32)
    predicted = model.predict(matrix)
    return predicted
def predict_proba(model: lgb.LGBMClassifier, df: pd.DataFrame) -> np.ndarray:
    """
    Return class probabilities for every row of the given DataFrame.

    The columns listed by ``get_feature_columns()`` are selected from
    ``df``, cast to float32 and passed to ``model.predict_proba``.
    """
    selected = df[get_feature_columns()]
    as_float32 = selected.values.astype(np.float32)
    probabilities = model.predict_proba(as_float32)
    return probabilities
def save_model(model: lgb.LGBMClassifier, filename: str = "lgbm_model.pkl"):
    """
    Write the model to ``filename`` inside ``cfg.MODEL_DIR`` using joblib.

    Parameters
    ----------
    model : lgb.LGBMClassifier
        Model object to serialize.
    filename : str
        File name joined onto ``cfg.MODEL_DIR``.
    """
    # Ensure the directory exists before writing into it.
    _ensure_dirs()
    model_dir = cfg.MODEL_DIR
    path = os.path.join(model_dir, filename)
    joblib.dump(model, path)
    print(f"[INFO] Model saved to {path}")
def load_model(filename: str = "lgbm_model.pkl") -> lgb.LGBMClassifier:
    """
    Load a model previously written into ``cfg.MODEL_DIR``.

    Parameters
    ----------
    filename : str
        File name joined onto ``cfg.MODEL_DIR``.

    Returns
    -------
    lgb.LGBMClassifier
        The object returned by ``joblib.load``.

    Notes
    -----
    If the file does not exist, an error message is printed and the
    process exits with status 1 via ``sys.exit``.
    """
    path = os.path.join(cfg.MODEL_DIR, filename)

    if os.path.exists(path):
        # NOTE: joblib.load is pickle-based and can execute arbitrary code;
        # only load model files from a trusted source.
        model = joblib.load(path)
        print(f"[INFO] Model loaded from {path}")
        return model

    # Missing file: tell the user how to produce it, then abort.
    print(f"[ERROR] Model file not found: {path}")
    print(" Run 'python main.py train' first.")
    sys.exit(1)
|