| """ |
| ML-3m-trader Model |
| =================== |
| LightGBM multi-class classifier for trade signal prediction. |
| Handles training, validation, prediction, and persistence. |
| """ |
|
|
| import os |
| import sys |
|
|
| import numpy as np |
| import pandas as pd |
| import joblib |
|
|
| import config as cfg |
| from features import get_feature_columns |
|
|
| try: |
| import lightgbm as lgb |
| except ImportError: |
| print("[ERROR] lightgbm not installed. Run: pip install lightgbm") |
| sys.exit(1) |
|
|
|
|
def _ensure_dirs():
    """Create the model directory named by ``cfg.MODEL_DIR`` if it is missing."""
    model_dir = cfg.MODEL_DIR
    # exist_ok=True makes repeated calls harmless.
    os.makedirs(model_dir, exist_ok=True)
|
|
|
|
def train(df: pd.DataFrame, labels: np.ndarray) -> lgb.LGBMClassifier:
    """
    Train a LightGBM classifier on the provided feature DataFrame.

    Uses a positional train/validation split (no shuffle): the first
    ``cfg.TRAIN_SPLIT_RATIO`` fraction of rows is used for fitting and the
    remaining rows for early-stopping validation. The split is only
    chronological if the rows of ``df`` are already in time order.

    Parameters
    ----------
    df : pd.DataFrame
        DataFrame containing every column returned by
        ``get_feature_columns()``.
    labels : np.ndarray
        Integer labels, one per row of ``df``, in the same row order.
        Any array-like accepted by ``np.asarray`` works.

    Returns
    -------
    lgb.LGBMClassifier
        Trained model.

    Raises
    ------
    ValueError
        If ``labels`` and ``df`` differ in length, or if the split would
        leave the training or the validation partition empty.
    """
    feature_cols = get_feature_columns()
    X = df[feature_cols].values.astype(np.float32)
    y = np.asarray(labels).astype(np.int32)

    # Fail fast: a length mismatch would otherwise pair rows with the wrong
    # labels or surface later as an opaque error inside fit().
    if len(y) != len(X):
        raise ValueError(
            f"labels length ({len(y)}) does not match number of rows in df ({len(X)})"
        )

    split_idx = int(len(X) * cfg.TRAIN_SPLIT_RATIO)
    # Early stopping needs a non-empty validation set, and fitting needs a
    # non-empty training set.
    if split_idx <= 0 or split_idx >= len(X):
        raise ValueError(
            f"train/validation split at index {split_idx} of {len(X)} rows "
            "leaves an empty partition; check cfg.TRAIN_SPLIT_RATIO and the input size"
        )

    X_train, X_val = X[:split_idx], X[split_idx:]
    y_train, y_val = y[:split_idx], y[split_idx:]

    print(f"[INFO] Training set : {X_train.shape[0]:,} samples")
    print(f"[INFO] Validation set: {X_val.shape[0]:,} samples")

    # Copy so that pop() does not mutate the shared config dict.
    params = dict(cfg.LGBM_PARAMS)
    # n_estimators is passed explicitly below; remove it from the copy to avoid
    # a duplicate keyword argument (falls back to 500 when not configured).
    n_est = params.pop("n_estimators", 500)

    model = lgb.LGBMClassifier(n_estimators=n_est, **params)

    model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
        eval_metric="multi_logloss",
        callbacks=[
            lgb.early_stopping(cfg.EARLY_STOPPING_ROUNDS, verbose=True),
            lgb.log_evaluation(period=50),
        ],
    )

    # Report the ten most important features for a quick sanity check.
    imp = pd.DataFrame({
        "feature": feature_cols,
        "importance": model.feature_importances_,
    }).sort_values("importance", ascending=False)
    print("\n[INFO] Feature importance (top 10):")
    print(imp.head(10).to_string(index=False))

    return model
|
|
|
|
def predict(model: lgb.LGBMClassifier, df: pd.DataFrame) -> np.ndarray:
    """
    Predict a label for every row of the given DataFrame.

    Parameters
    ----------
    model : lgb.LGBMClassifier
        Fitted classifier.
    df : pd.DataFrame
        DataFrame containing every column returned by
        ``get_feature_columns()``.

    Returns
    -------
    np.ndarray
        Predicted labels, as returned by ``model.predict``.
    """
    columns = get_feature_columns()
    raw_values = df[columns].values
    # The model is fed the selected feature columns as a float32 matrix.
    features = raw_values.astype(np.float32)
    return model.predict(features)
|
|
|
|
def predict_proba(model: lgb.LGBMClassifier, df: pd.DataFrame) -> np.ndarray:
    """
    Return class probabilities for every row of the given DataFrame.

    Parameters
    ----------
    model : lgb.LGBMClassifier
        Fitted classifier.
    df : pd.DataFrame
        DataFrame containing every column returned by
        ``get_feature_columns()``.

    Returns
    -------
    np.ndarray
        Output of ``model.predict_proba`` on the float32 feature matrix.
    """
    columns = get_feature_columns()
    raw_values = df[columns].values
    features = raw_values.astype(np.float32)
    return model.predict_proba(features)
|
|
|
|
def save_model(model: lgb.LGBMClassifier, filename: str = "lgbm_model.pkl"):
    """
    Persist the model to ``cfg.MODEL_DIR`` using joblib.

    Parameters
    ----------
    model : lgb.LGBMClassifier
        Model to serialize.
    filename : str
        File name inside ``cfg.MODEL_DIR``; defaults to ``lgbm_model.pkl``.
    """
    # Make sure the target directory exists before writing.
    _ensure_dirs()
    destination = os.path.join(cfg.MODEL_DIR, filename)
    joblib.dump(model, destination)
    print(f"[INFO] Model saved to {destination}")
|
|
|
|
def load_model(filename: str = "lgbm_model.pkl") -> lgb.LGBMClassifier:
    """
    Load a previously saved model from ``cfg.MODEL_DIR``.

    Parameters
    ----------
    filename : str
        File name inside ``cfg.MODEL_DIR``; defaults to ``lgbm_model.pkl``.

    Returns
    -------
    lgb.LGBMClassifier
        The object deserialized by joblib.

    Notes
    -----
    If the file does not exist, an error is printed and the process exits
    with status 1 via ``sys.exit``.
    """
    model_path = os.path.join(cfg.MODEL_DIR, filename)
    if os.path.exists(model_path):
        # NOTE: joblib.load unpickles the file, which can execute arbitrary
        # code; only load model files from a trusted location.
        loaded = joblib.load(model_path)
        print(f"[INFO] Model loaded from {model_path}")
        return loaded

    print(f"[ERROR] Model file not found: {model_path}")
    print(" Run 'python main.py train' first.")
    sys.exit(1)
|
|