Spaces:

AlgoX
/

mlStocks-pred

Sleeping

App Files Files Community

AlgoX commited on Oct 23, 2025

Commit

77a4f6b

1 Parent(s): b6f3206

feat : add training script for time-series model + classical models

Browse files

Files changed (1) hide show

train/classical_train.py +652 -0

train/classical_train.py ADDED Viewed

	@@ -0,0 +1,652 @@

+import numpy as np
+import pandas as pd
+import matplotlib.pyplot as plt
+import seaborn as sns
+from sklearn.ensemble import (
+    RandomForestRegressor,
+    GradientBoostingRegressor,
+    ExtraTreesRegressor,
+)
+from sklearn.linear_model import Ridge, Lasso, ElasticNet
+from sklearn.svm import SVR
+from sklearn.preprocessing import StandardScaler
+from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
+import xgboost as xgb
+import lightgbm as lgb
+from statsmodels.tsa.arima.model import ARIMA
+from statsmodels.tsa.statespace.sarimax import SARIMAX
+from statsmodels.tsa.holtwinters import ExponentialSmoothing
+import warnings
+import json
+import os
+from datetime import datetime
+import pickle
+warnings.filterwarnings("ignore")
+def create_sequences(features, targets, seq_length=20):
+    """Create sequences for time series prediction"""
+    X, y = [], []
+    for i in range(len(features) - seq_length):
+        X.append(features[i : i + seq_length].flatten())  # Flatten sequence
+        y.append(targets[i + seq_length])  # Predict next value
+    return np.array(X), np.array(y)
+def create_lagged_features(df, target_col, lags=[1, 2, 3, 5, 10, 20]):
+    """Create lagged features for time series"""
+    df_lagged = df.copy()
+    for lag in lags:
+        df_lagged[f"{target_col}_lag_{lag}"] = df_lagged[target_col].shift(lag)
+    # Add rolling statistics
+    for window in [5, 10, 20]:
+        df_lagged[f"{target_col}_rolling_mean_{window}"] = (
+            df_lagged[target_col].rolling(window).mean()
+        )
+        df_lagged[f"{target_col}_rolling_std_{window}"] = (
+            df_lagged[target_col].rolling(window).std()
+        )
+    # Drop NaN values created by lagging
+    df_lagged = df_lagged.dropna()
+    return df_lagged
+class ModelTrainer:
+    def __init__(self, model_name, model, save_dir="./checkpoints_classical"):
+        self.model_name = model_name
+        self.model = model
+        self.save_dir = save_dir
+        self.metrics = {}
+        self.predictions = None
+    def train(self, X_train, y_train):
+        """Train the model"""
+        print(f"\nTraining {self.model_name}...")
+        self.model.fit(X_train, y_train)
+    def predict(self, X):
+        """Make predictions"""
+        return self.model.predict(X)
+    def evaluate(self, X_train, y_train, X_val, y_val):
+        """Evaluate model on train and validation sets"""
+        train_pred = self.predict(X_train)
+        val_pred = self.predict(X_val)
+        self.metrics = {
+            "train_mse": mean_squared_error(y_train, train_pred),
+            "train_rmse": np.sqrt(mean_squared_error(y_train, train_pred)),
+            "train_mae": mean_absolute_error(y_train, train_pred),
+            "train_r2": r2_score(y_train, train_pred),
+            "val_mse": mean_squared_error(y_val, val_pred),
+            "val_rmse": np.sqrt(mean_squared_error(y_val, val_pred)),
+            "val_mae": mean_absolute_error(y_val, val_pred),
+            "val_r2": r2_score(y_val, val_pred),
+        }
+        self.predictions = {"train": train_pred, "val": val_pred}
+        return self.metrics
+    def save_model(self, run_dir):
+        """Save model to disk"""
+        model_path = os.path.join(run_dir, f"{self.model_name}_model.pkl")
+        with open(model_path, "wb") as f:
+            pickle.dump(self.model, f)
+        print(f"✓ Saved {self.model_name} model")
+class ARIMAModel:
+    def __init__(self, order=(1, 1, 1)):
+        self.order = order
+        self.model = None
+        self.model_fit = None
+    def fit(self, X_train, y_train):
+        """Fit ARIMA model - uses only target variable"""
+        # ARIMA works on univariate time series
+        self.model = ARIMA(y_train, order=self.order)
+        self.model_fit = self.model.fit()
+    def predict(self, X):
+        """Make predictions"""
+        n_periods = len(X)
+        forecast = self.model_fit.forecast(steps=n_periods)
+        return np.array(forecast)
+class SARIMAXModel:
+    def __init__(self, order=(1, 1, 1), seasonal_order=(0, 0, 0, 0)):
+        self.order = order
+        self.seasonal_order = seasonal_order
+        self.model = None
+        self.model_fit = None
+    def fit(self, X_train, y_train):
+        """Fit SARIMAX model"""
+        self.model = SARIMAX(
+            y_train, order=self.order, seasonal_order=self.seasonal_order
+        )
+        self.model_fit = self.model.fit(disp=False)
+    def predict(self, X):
+        """Make predictions"""
+        n_periods = len(X)
+        forecast = self.model_fit.forecast(steps=n_periods)
+        return np.array(forecast)
+class ExponentialSmoothingModel:
+    def __init__(self, seasonal_periods=None):
+        self.seasonal_periods = seasonal_periods
+        self.model = None
+        self.model_fit = None
+    def fit(self, X_train, y_train):
+        """Fit Exponential Smoothing model"""
+        self.model = ExponentialSmoothing(
+            y_train,
+            seasonal_periods=self.seasonal_periods,
+            trend="add",
+            seasonal="add" if self.seasonal_periods else None,
+        )
+        self.model_fit = self.model.fit()
+    def predict(self, X):
+        """Make predictions"""
+        n_periods = len(X)
+        forecast = self.model_fit.forecast(steps=n_periods)
+        return np.array(forecast)
+def get_ml_models():
+    """Get dictionary of classical ML models"""
+    models = {
+        # Linear Models
+        "Ridge": Ridge(alpha=1.0),
+        "Lasso": Lasso(alpha=0.1),
+        "ElasticNet": ElasticNet(alpha=0.1, l1_ratio=0.5),
+        "RandomForest": RandomForestRegressor(
+            n_estimators=100,
+            max_depth=10,
+            min_samples_split=5,
+            random_state=42,
+            n_jobs=-1,
+        ),
+        "ExtraTrees": ExtraTreesRegressor(
+            n_estimators=100,
+            max_depth=10,
+            min_samples_split=5,
+            random_state=42,
+            n_jobs=-1,
+        ),
+        "GradientBoosting": GradientBoostingRegressor(
+            n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42
+        ),
+        "XGBoost": xgb.XGBRegressor(
+            n_estimators=100, max_depth=5, learning_rate=0.1, random_state=42, n_jobs=-1
+        ),
+        "LightGBM": lgb.LGBMRegressor(
+            n_estimators=100,
+            max_depth=5,
+            learning_rate=0.1,
+            random_state=42,
+            n_jobs=-1,
+            verbose=-1,
+        ),
+        "SVR": SVR(kernel="rbf", C=1.0, epsilon=0.1),
+    }
+    return models
+def get_time_series_models():
+    """Get dictionary of time series models"""
+    models = {
+        "ARIMA": ARIMAModel(order=(2, 1, 2)),
+        "SARIMAX": SARIMAXModel(order=(1, 1, 1), seasonal_order=(1, 1, 1, 5)),
+        "ExpSmoothing": ExponentialSmoothingModel(seasonal_periods=5),
+    }
+    return models
+def train_ml_models(X_train, y_train, X_val, y_val, save_dir):
+    """Train all classical ML models"""
+    models = get_ml_models()
+    results = {}
+    trained_models = {}
+    print("\n" + "=" * 60)
+    print("TRAINING CLASSICAL ML MODELS")
+    print("=" * 60)
+    for name, model in models.items():
+        try:
+            trainer = ModelTrainer(name, model, save_dir)
+            trainer.train(X_train, y_train)
+            metrics = trainer.evaluate(X_train, y_train, X_val, y_val)
+            trainer.save_model(save_dir)
+            results[name] = metrics
+            trained_models[name] = trainer
+            print(f"\n{name}:")
+            print(
+                f"  Train - RMSE: {metrics['train_rmse']:.6f}, MAE: {metrics['train_mae']:.6f}, R²: {metrics['train_r2']:.4f}"
+            )
+            print(
+                f"  Val   - RMSE: {metrics['val_rmse']:.6f}, MAE: {metrics['val_mae']:.6f}, R²: {metrics['val_r2']:.4f}"
+            )
+        except Exception as e:
+            print(f"\n{name}: FAILED - {str(e)}")
+            results[name] = None
+    return results, trained_models
+def train_time_series_models(y_train, y_val, save_dir):
+    """Train time series models (univariate)"""
+    models = get_time_series_models()
+    results = {}
+    trained_models = {}
+    print("\n" + "=" * 60)
+    print("TRAINING TIME SERIES MODELS")
+    print("=" * 60)
+    for name, model in models.items():
+        try:
+            trainer = ModelTrainer(name, model, save_dir)
+            # Time series models use only target variable
+            trainer.train(None, y_train)
+            # Make predictions
+            train_pred = trainer.predict(np.arange(len(y_train)))
+            val_pred = trainer.predict(np.arange(len(y_val)))
+            # Calculate metrics
+            metrics = {
+                "train_mse": mean_squared_error(y_train, train_pred),
+                "train_rmse": np.sqrt(mean_squared_error(y_train, train_pred)),
+                "train_mae": mean_absolute_error(y_train, train_pred),
+                "train_r2": r2_score(y_train, train_pred),
+                "val_mse": mean_squared_error(y_val, val_pred),
+                "val_rmse": np.sqrt(mean_squared_error(y_val, val_pred)),
+                "val_mae": mean_absolute_error(y_val, val_pred),
+                "val_r2": r2_score(y_val, val_pred),
+            }
+            trainer.metrics = metrics
+            trainer.predictions = {"train": train_pred, "val": val_pred}
+            trainer.save_model(save_dir)
+            results[name] = metrics
+            trained_models[name] = trainer
+            print(f"\n{name}:")
+            print(
+                f"  Train - RMSE: {metrics['train_rmse']:.6f}, MAE: {metrics['train_mae']:.6f}, R²: {metrics['train_r2']:.4f}"
+            )
+            print(
+                f"  Val   - RMSE: {metrics['val_rmse']:.6f}, MAE: {metrics['val_mae']:.6f}, R²: {metrics['val_r2']:.4f}"
+            )
+        except Exception as e:
+            print(f"\n{name}: FAILED - {str(e)}")
+            results[name] = None
+    return results, trained_models
+def plot_model_comparison(results, save_dir):
+    """Plot comparison of all models"""
+    # Filter out failed models
+    results = {k: v for k, v in results.items() if v is not None}
+    if not results:
+        print("No successful models to plot")
+        return
+    models = list(results.keys())
+    # Extract metrics
+    train_rmse = [results[m]["train_rmse"] for m in models]
+    val_rmse = [results[m]["val_rmse"] for m in models]
+    train_mae = [results[m]["train_mae"] for m in models]
+    val_mae = [results[m]["val_mae"] for m in models]
+    train_r2 = [results[m]["train_r2"] for m in models]
+    val_r2 = [results[m]["val_r2"] for m in models]
+    # Create comparison plots
+    fig, axes = plt.subplots(2, 2, figsize=(16, 12))
+    # RMSE comparison
+    ax = axes[0, 0]
+    x = np.arange(len(models))
+    width = 0.35
+    ax.bar(x - width / 2, train_rmse, width, label="Train", alpha=0.8)
+    ax.bar(x + width / 2, val_rmse, width, label="Validation", alpha=0.8)
+    ax.set_xlabel("Model")
+    ax.set_ylabel("RMSE")
+    ax.set_title("Root Mean Squared Error Comparison")
+    ax.set_xticks(x)
+    ax.set_xticklabels(models, rotation=45, ha="right")
+    ax.legend()
+    ax.grid(True, alpha=0.3)
+    # MAE comparison
+    ax = axes[0, 1]
+    ax.bar(x - width / 2, train_mae, width, label="Train", alpha=0.8)
+    ax.bar(x + width / 2, val_mae, width, label="Validation", alpha=0.8)
+    ax.set_xlabel("Model")
+    ax.set_ylabel("MAE")
+    ax.set_title("Mean Absolute Error Comparison")
+    ax.set_xticks(x)
+    ax.set_xticklabels(models, rotation=45, ha="right")
+    ax.legend()
+    ax.grid(True, alpha=0.3)
+    # R² comparison
+    ax = axes[1, 0]
+    ax.bar(x - width / 2, train_r2, width, label="Train", alpha=0.8)
+    ax.bar(x + width / 2, val_r2, width, label="Validation", alpha=0.8)
+    ax.set_xlabel("Model")
+    ax.set_ylabel("R² Score")
+    ax.set_title("R² Score Comparison")
+    ax.set_xticks(x)
+    ax.set_xticklabels(models, rotation=45, ha="right")
+    ax.legend()
+    ax.grid(True, alpha=0.3)
+    # Validation RMSE sorted
+    ax = axes[1, 1]
+    sorted_idx = np.argsort(val_rmse)
+    sorted_models = [models[i] for i in sorted_idx]
+    sorted_rmse = [val_rmse[i] for i in sorted_idx]
+    colors = plt.cm.RdYlGn_r(np.linspace(0.3, 0.9, len(sorted_models)))
+    ax.barh(range(len(sorted_models)), sorted_rmse, color=colors)
+    ax.set_yticks(range(len(sorted_models)))
+    ax.set_yticklabels(sorted_models)
+    ax.set_xlabel("Validation RMSE")
+    ax.set_title("Models Ranked by Validation RMSE")
+    ax.grid(True, alpha=0.3, axis="x")
+    plt.tight_layout()
+    plt.savefig(
+        os.path.join(save_dir, "model_comparison.png"), dpi=300, bbox_inches="tight"
+    )
+    print(f"\n✓ Saved model comparison plot")
+    plt.close()
+def plot_predictions_comparison(trained_models, y_val, save_dir, n_samples=200):
+    """Plot predictions from top models"""
+    # Get top 5 models by validation RMSE
+    model_scores = [
+        (name, trainer.metrics["val_rmse"])
+        for name, trainer in trained_models.items()
+        if trainer.metrics is not None
+    ]
+    model_scores.sort(key=lambda x: x[1])
+    top_models = model_scores[:5]
+    fig, axes = plt.subplots(len(top_models), 1, figsize=(14, 4 * len(top_models)))
+    if len(top_models) == 1:
+        axes = [axes]
+    plot_len = min(n_samples, len(y_val))
+    for i, (name, score) in enumerate(top_models):
+        ax = axes[i]
+        trainer = trained_models[name]
+        val_pred = trainer.predictions["val"]
+        ax.plot(y_val[:plot_len], label="Actual", alpha=0.7, linewidth=1.5)
+        ax.plot(val_pred[:plot_len], label="Predicted", alpha=0.7, linewidth=1.5)
+        ax.set_xlabel("Time Step")
+        ax.set_ylabel("Value")
+        ax.set_title(f"{name} Predictions (Val RMSE: {score:.6f})")
+        ax.legend()
+        ax.grid(True, alpha=0.3)
+    plt.tight_layout()
+    plt.savefig(
+        os.path.join(save_dir, "top_model_predictions.png"),
+        dpi=300,
+        bbox_inches="tight",
+    )
+    print(f"✓ Saved top model predictions plot")
+    plt.close()
+def create_results_table(results, save_dir):
+    """Create and save results table"""
+    # Filter out failed models
+    results = {k: v for k, v in results.items() if v is not None}
+    df = pd.DataFrame(results).T
+    df = df.sort_values("val_rmse")
+    print("\n" + "=" * 80)
+    print("MODEL COMPARISON RESULTS (sorted by validation RMSE)")
+    print("=" * 80)
+    print(df.to_string())
+    print("=" * 80)
+    # Save to CSV
+    df.to_csv(os.path.join(save_dir, "results_comparison.csv"))
+    print(f"\n✓ Saved results table")
+    return df
+# ========================= ABLATION STUDIES =========================
+def run_ablation_study(X_train, y_train, X_val, y_val, save_dir):
+    """Run ablation studies on feature importance and model configurations"""
+    print("\n" + "=" * 60)
+    print("ABLATION STUDY: Feature Importance")
+    print("=" * 60)
+    # Train a Random Forest to get feature importances
+    rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
+    rf_model.fit(X_train, y_train)
+    # Get feature importances
+    importances = rf_model.feature_importances_
+    # Test with different number of features
+    n_features_list = [10, 20, 50, 100, X_train.shape[1]]
+    ablation_results = {}
+    for n_features in n_features_list:
+        if n_features > X_train.shape[1]:
+            continue
+        # Select top n features
+        top_indices = np.argsort(importances)[-n_features:]
+        X_train_subset = X_train[:, top_indices]
+        X_val_subset = X_val[:, top_indices]
+        # Train model with subset
+        model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
+        model.fit(X_train_subset, y_train)
+        val_pred = model.predict(X_val_subset)
+        rmse = np.sqrt(mean_squared_error(y_val, val_pred))
+        mae = mean_absolute_error(y_val, val_pred)
+        r2 = r2_score(y_val, val_pred)
+        ablation_results[f"Top_{n_features}_features"] = {
+            "val_rmse": rmse,
+            "val_mae": mae,
+            "val_r2": r2,
+        }
+        print(
+            f"\nTop {n_features} features: RMSE={rmse:.6f}, MAE={mae:.6f}, R²={r2:.4f}"
+        )
+    # Save ablation results
+    ablation_df = pd.DataFrame(ablation_results).T
+    ablation_df.to_csv(os.path.join(save_dir, "ablation_feature_importance.csv"))
+    # Plot ablation results
+    fig, ax = plt.subplots(figsize=(10, 6))
+    x = range(len(ablation_results))
+    ax.plot(
+        list(ablation_results.keys()),
+        [v["val_rmse"] for v in ablation_results.values()],
+        "o-",
+        linewidth=2,
+        markersize=8,
+    )
+    ax.set_xlabel("Number of Features")
+    ax.set_ylabel("Validation RMSE")
+    ax.set_title("Ablation Study: Impact of Feature Count on Performance")
+    ax.grid(True, alpha=0.3)
+    plt.xticks(rotation=45, ha="right")
+    plt.tight_layout()
+    plt.savefig(os.path.join(save_dir, "ablation_feature_importance.png"), dpi=300)
+    plt.close()
+    print(f"\n✓ Saved ablation study results")
+    return ablation_results
+# ========================= MAIN EXECUTION =========================
+def main():
+    from data_prep.data_clean import clean_indicator
+    from data_prep.data_load import prepare_data
+    # Configuration
+    config = {
+        "data_path": "/home/aman/code/ml_fr/ml_stocks/data/NIFTY_5_years.csv",
+        "seq_length": 20,
+        "train_split": 0.8,
+        "save_dir": "./checkpoints_classical",
+        "target_col": "Daily_Return",
+    }
+    # Create save directory
+    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+    save_dir = os.path.join(config["save_dir"], f"run_{timestamp}")
+    os.makedirs(save_dir, exist_ok=True)
+    print(f"\n{'='*60}")
+    print(f"CLASSICAL ML & TIME SERIES MODEL TRAINING")
+    print(f"{'='*60}")
+    print(f"Save directory: {save_dir}")
+    print(f"{'='*60}\n")
+    # Load and prepare data
+    print("Loading data...")
+    load_df = prepare_data(config["data_path"])
+    df = clean_indicator(load_df)
+    target_col = config["target_col"]
+    feature_cols = [col for col in df.columns if col != target_col]
+    # Split data
+    train_size = int(len(df) * config["train_split"])
+    train_df = df[:train_size]
+    val_df = df[train_size:]
+    print(f"Train samples: {len(train_df)}")
+    print(f"Validation samples: {len(val_df)}")
+    print(f"Number of features: {len(feature_cols)}")
+    # Prepare features for ML models (with sequences)
+    scaler = StandardScaler()
+    train_features = scaler.fit_transform(train_df[feature_cols].values)
+    val_features = scaler.transform(val_df[feature_cols].values)
+    train_targets = train_df[target_col].values
+    val_targets = val_df[target_col].values
+    # Create sequences
+    X_train, y_train = create_sequences(
+        train_features, train_targets, config["seq_length"]
+    )
+    X_val, y_val = create_sequences(val_features, val_targets, config["seq_length"])
+    print(f"\nSequence shape: {X_train.shape}")
+    print(f"Target shape: {y_train.shape}")
+    # Save config
+    with open(os.path.join(save_dir, "config.json"), "w") as f:
+        json.dump(config, f, indent=4)
+    # Train ML models
+    ml_results, ml_models = train_ml_models(X_train, y_train, X_val, y_val, save_dir)
+    # Train time series models (using non-sequenced data)
+    ts_results, ts_models = train_time_series_models(
+        train_targets[config["seq_length"] :],  # Align with ML model targets
+        val_targets[config["seq_length"] :],
+        save_dir,
+    )
+    # Combine results
+    all_results = {**ml_results, **ts_results}
+    all_models = {**ml_models, **ts_models}
+    # Create visualizations
+    print("\n" + "=" * 60)
+    print("CREATING VISUALIZATIONS")
+    print("=" * 60)
+    plot_model_comparison(all_results, save_dir)
+    plot_predictions_comparison(all_models, y_val, save_dir)
+    results_df = create_results_table(all_results, save_dir)
+    # Run ablation study
+    ablation_results = run_ablation_study(X_train, y_train, X_val, y_val, save_dir)
+    print(f"\n{'='*60}")
+    print("TRAINING COMPLETE!")
+    print(f"Results saved to: {save_dir}")
+    print(f"{'='*60}\n")
+    # Print best model
+    best_model = results_df.index[0]
+    best_rmse = results_df.loc[best_model, "val_rmse"]
+    print(f"🏆 Best Model: {best_model}")
+    print(f"   Validation RMSE: {best_rmse:.6f}")
+    print(f"   Validation MAE: {results_df.loc[best_model, 'val_mae']:.6f}")
+    print(f"   Validation R²: {results_df.loc[best_model, 'val_r2']:.4f}")
+    return all_results, all_models, save_dir
+if __name__ == "__main__":
+    results, models, save_dir = main()
+    print("\n" + "=" * 60)
+    print("All models trained successfully!")
+    print("Check the save directory for:")
+    print("  - Model comparison plots")
+    print("  - Results CSV")
+    print("  - Saved model files (.pkl)")
+    print("  - Ablation study results")
+    print("=" * 60)