# File size: 30,063 Bytes
#
# 7f90ea0

"""

Demand Prediction System - Model Training Script



This script trains multiple machine learning and time-series models to predict 

product demand (sales quantity) for an e-commerce platform.



Features:

- Data preprocessing and feature engineering

- Date feature extraction (day, month, day_of_week, weekend)

- Categorical encoding

- Feature scaling

- Multiple ML models (Linear Regression, Random Forest, XGBoost)

- Time-series models (ARIMA, Prophet)

- Model evaluation (MAE, RMSE, R2 Score)

- Automatic best model selection

- Model persistence using joblib

- Visualization of results

- Comparison between ML and time-series approaches

"""

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import joblib
import os
import warnings
warnings.filterwarnings('ignore')

# Machine Learning imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Try to import XGBoost (optional)
# XGBOOST_AVAILABLE records whether the import succeeded; train_models()
# checks it before building an XGBRegressor.
try:
    import xgboost as xgb
    XGBOOST_AVAILABLE = True
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost not available. Install with: pip install xgboost")

# Try to import time-series libraries
# ARIMA_AVAILABLE is checked by train_arima() and main().
# NOTE(review): adfuller is imported here but not referenced anywhere in the
# visible part of this file.
try:
    from statsmodels.tsa.arima.model import ARIMA
    from statsmodels.tsa.stattools import adfuller
    ARIMA_AVAILABLE = True
except ImportError:
    ARIMA_AVAILABLE = False
    print("statsmodels not available. Install with: pip install statsmodels")

# PROPHET_AVAILABLE is checked by train_prophet() and main().
try:
    from prophet import Prophet
    PROPHET_AVAILABLE = True
except ImportError:
    PROPHET_AVAILABLE = False
    print("Prophet not available. Install with: pip install prophet")

# Set random seeds for reproducibility
# This seeds NumPy's global random generator only; the estimators and the
# train/validation split in this file pass random_state=42 explicitly.
np.random.seed(42)

# Configuration
DATA_PATH = 'data/sales.csv'   # CSV read by main() via load_data()
MODEL_DIR = 'models'           # joblib/JSON artifacts are written here
PLOTS_DIR = 'plots'            # PNG figures are written here

# Create directories if they don't exist
# These calls run at import time, so merely importing this module creates
# both directories relative to the current working directory.
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)


def load_data(file_path):
    """
    Read the sales dataset from a CSV file.

    Progress messages (the path being read and the resulting shape) are
    printed to stdout.

    Args:
        file_path: Path to the CSV file

    Returns:
        DataFrame: The parsed contents of the file
    """
    print(f"Loading data from {file_path}...")
    dataset = pd.read_csv(file_path)
    dataset_shape = dataset.shape
    print(f"Data loaded successfully! Shape: {dataset_shape}")
    return dataset


def preprocess_data(df):
    """
    Preprocess the data: convert date, extract features, handle missing values.

    The input frame is copied, so the caller's DataFrame is left untouched.
    Calendar features (day, month, day_of_week, weekend, year, quarter) are
    derived from the ``date`` column. Missing values are then filled:
    numeric columns with their median, and non-numeric, non-datetime columns
    (e.g. a string ``category``) with their most frequent value. Datetime
    columns are left as-is.

    Args:
        df: Raw DataFrame; must contain a ``date`` column parseable by
            ``pd.to_datetime``

    Returns:
        DataFrame: Preprocessed copy of ``df`` with the extra date features
    """
    print("\n" + "="*60)
    print("PREPROCESSING DATA")
    print("="*60)

    # Create a copy to avoid modifying original
    df = df.copy()

    # Convert date column to datetime
    df['date'] = pd.to_datetime(df['date'])

    # Extract date features
    print("Extracting date features...")
    date_parts = df['date'].dt
    df['day'] = date_parts.day
    df['month'] = date_parts.month
    df['day_of_week'] = date_parts.dayofweek  # 0=Monday, 6=Sunday
    df['weekend'] = (df['day_of_week'] >= 5).astype(int)  # 1 if weekend, 0 otherwise
    df['year'] = date_parts.year
    df['quarter'] = date_parts.quarter

    # Check for missing values
    print("\nMissing values:")
    missing = df.isnull().sum()
    print(missing[missing > 0])

    if missing.sum() > 0:
        print("Filling missing values...")
        # Numeric columns: median imputation.
        df = df.fillna(df.median(numeric_only=True))

        # Non-numeric columns have no median, so the call above leaves their
        # NaNs in place; fill those with the most frequent value instead so
        # that downstream label encoding does not see mixed str/NaN data.
        for column in missing[missing > 0].index:
            dtype = df[column].dtype
            if (pd.api.types.is_numeric_dtype(dtype)
                    or pd.api.types.is_datetime64_any_dtype(dtype)):
                continue
            modes = df[column].mode()
            if not modes.empty:  # empty when the whole column is missing
                df[column] = df[column].fillna(modes.iloc[0])

    # Display basic statistics
    print("\nDataset Info:")
    print(f"Shape: {df.shape}")
    print(f"\nColumns: {df.columns.tolist()}")
    print(f"\nData types:\n{df.dtypes}")
    print(f"\nBasic statistics:\n{df.describe()}")

    return df


def feature_engineering(df):
    """
    Perform feature engineering: encode categorical variables, scale features.

    ``category`` and ``product_id`` are label-encoded into
    ``category_encoded`` and ``product_id_encoded`` and the original columns
    are dropped. All resulting columns are then standardized with a
    ``StandardScaler``. The scaled frame keeps the row index of ``df`` so it
    stays label-aligned with the returned target.

    NOTE(review): the encoders and scaler are fitted on every row of ``df``.
    main() splits into train/validation only afterwards, so validation rows
    influence the fitted scaling statistics.

    Args:
        df: Preprocessed DataFrame containing product_id, price, discount,
            category, the extracted date features and sales_quantity

    Returns:
        tuple: (X_features, y_target, feature_names, encoders, scaler)
    """
    print("\n" + "="*60)
    print("FEATURE ENGINEERING")
    print("="*60)

    # Separate features and target
    # The raw date column is left out (its extracted parts are used instead);
    # product_id is kept here and encoded below.
    feature_columns = ['product_id', 'price', 'discount', 'category',
                       'day', 'month', 'day_of_week', 'weekend', 'year', 'quarter']

    X = df[feature_columns].copy()
    y = df['sales_quantity'].copy()

    # Encode categorical variables
    print("Encoding categorical variables...")

    # Label encode category
    category_encoder = LabelEncoder()
    X['category_encoded'] = category_encoder.fit_transform(X['category'])

    # Label encode product_id (treating it as categorical)
    product_encoder = LabelEncoder()
    X['product_id_encoded'] = product_encoder.fit_transform(X['product_id'])

    # Drop original categorical columns
    X = X.drop(['category', 'product_id'], axis=1)

    # Get feature names
    feature_names = X.columns.tolist()

    print(f"Features after encoding: {feature_names}")
    print(f"Number of features: {len(feature_names)}")

    # Scale numerical features
    print("\nScaling numerical features...")
    scaler = StandardScaler()
    # fit_transform returns a bare array; restore both the column names and
    # the original row index so X and y remain aligned by label even when df
    # does not carry a default 0..n-1 index.
    X_scaled = pd.DataFrame(
        scaler.fit_transform(X),
        columns=feature_names,
        index=X.index,
    )

    # Store encoders and scaler for later use
    encoders = {
        'category': category_encoder,
        'product_id': product_encoder,
        'scaler': scaler
    }

    return X_scaled, y, feature_names, encoders, scaler


def _fit_and_score(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split, score it on the validation split,
    print the metrics and return the per-model result dictionary."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)

    mae = mean_absolute_error(y_val, predictions)
    rmse = np.sqrt(mean_squared_error(y_val, predictions))
    r2 = r2_score(y_val, predictions)

    print(f"   MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")

    return {
        'model': model,
        'mae': mae,
        'rmse': rmse,
        'r2': r2,
        'predictions': predictions
    }


def train_models(X_train, y_train, X_val, y_val):
    """
    Train multiple models and return their performance metrics.

    Linear Regression and Random Forest are always trained; XGBoost is
    trained only when XGBOOST_AVAILABLE is true. Every model is fitted on the
    training split and evaluated on the validation split.

    Args:
        X_train: Training features
        y_train: Training target
        X_val: Validation features
        y_val: Validation target

    Returns:
        dict: Maps a model name ('Linear Regression', 'Random Forest',
        'XGBoost') to a dict with keys 'model', 'mae', 'rmse', 'r2' and
        'predictions' (the validation-set predictions)
    """
    print("\n" + "="*60)
    print("TRAINING MODELS")
    print("="*60)

    results = {}

    # 1. Linear Regression
    print("\n1. Training Linear Regression...")
    results['Linear Regression'] = _fit_and_score(
        LinearRegression(), X_train, y_train, X_val, y_val
    )

    # 2. Random Forest Regressor
    print("\n2. Training Random Forest Regressor...")
    rf_model = RandomForestRegressor(
        n_estimators=100,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )
    results['Random Forest'] = _fit_and_score(
        rf_model, X_train, y_train, X_val, y_val
    )

    # 3. XGBoost (if available)
    if XGBOOST_AVAILABLE:
        print("\n3. Training XGBoost Regressor...")
        xgb_model = xgb.XGBRegressor(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1
        )
        results['XGBoost'] = _fit_and_score(
            xgb_model, X_train, y_train, X_val, y_val
        )
    else:
        print("\n3. XGBoost skipped (not available)")

    return results


def prepare_time_series_data(df):
    """
    Prepare time-series data by aggregating daily sales.

    Sales quantities are summed per date and returned in chronological order
    with the column names Prophet expects ('ds' for the date, 'y' for the
    value). The input DataFrame is not modified.

    Args:
        df: DataFrame with date and sales_quantity columns

    Returns:
        tuple: (ts_data, train_size) - the aggregated series and the number
        of leading rows (80%, rounded down) to use for training
    """
    print("\n" + "="*60)
    print("PREPARING TIME-SERIES DATA")
    print("="*60)

    # Work on a two-column copy so the caller's frame keeps its original
    # 'date' column untouched.
    sales = df[['date', 'sales_quantity']].copy()
    sales['date'] = pd.to_datetime(sales['date'])

    # Aggregate by date
    ts_data = sales.groupby('date')['sales_quantity'].sum().reset_index()
    ts_data = ts_data.sort_values('date').reset_index(drop=True)
    ts_data.columns = ['ds', 'y']  # Prophet expects 'ds' and 'y'

    print(f"Time-series data shape: {ts_data.shape}")
    print(f"Date range: {ts_data['ds'].min()} to {ts_data['ds'].max()}")
    print(f"Total days: {len(ts_data)}")

    # Use 80% for training (chronological split for time-series)
    train_size = int(len(ts_data) * 0.8)

    return ts_data, train_size


def train_arima(ts_data, train_size):
    """
    Train ARIMA model on time-series data.

    A fixed list of (p, d, q) orders is fitted on the first ``train_size``
    observations; the fit with the lowest AIC is used to forecast the
    remaining observations. Forecasts are clipped at zero before scoring.

    Args:
        ts_data: Time-series DataFrame with 'ds' and 'y' columns
        train_size: Number of samples for training

    Returns:
        dict: Keys 'model', 'order', 'mae', 'rmse', 'r2', 'predictions',
        'actual' and 'dates'; or None when statsmodels is unavailable, no
        order could be fitted, or any other error occurred
    """
    if not ARIMA_AVAILABLE:
        return None

    rule = "="*60
    print("\n" + rule)
    print("TRAINING ARIMA MODEL")
    print(rule)

    try:
        # Chronological split: head for training, tail for validation
        series = ts_data['y']
        train_data = series.iloc[:train_size].values
        val_data = series.iloc[train_size:].values
        val_dates = ts_data['ds'].iloc[train_size:].values

        print(f"Training on {len(train_data)} samples")
        print(f"Validating on {len(val_data)} samples")

        # Small hand-picked grid of common orders, all with d=1
        candidate_orders = [
            (1, 1, 1),
            (2, 1, 2),
            (1, 1, 0),
            (0, 1, 1),
            (2, 1, 1),
            (1, 1, 2),
        ]

        best_aic, best_order, best_model = np.inf, None, None

        print("Trying different ARIMA orders...")
        for order in candidate_orders:
            try:
                fitted = ARIMA(train_data, order=order).fit()
                aic = fitted.aic

                # Strict comparison: on ties the earlier order is kept
                improved = aic < best_aic
                if improved:
                    best_aic, best_order, best_model = aic, order, fitted

                marker = " (best so far)" if improved else ""
                print(f"   Order {order}: AIC = {aic:.2f}{marker}")
            except Exception as e:
                # One failing order must not abort the whole search
                print(f"   Order {order}: Failed - {str(e)[:50]}")

        if best_model is None:
            print("Failed to fit ARIMA model with any order")
            return None

        print(f"\nBest ARIMA order: {best_order} (AIC: {best_aic:.2f})")

        # Forecast one step per validation sample; demand cannot be negative
        forecast = np.maximum(best_model.forecast(steps=len(val_data)), 0)

        mae = mean_absolute_error(val_data, forecast)
        rmse = np.sqrt(mean_squared_error(val_data, forecast))
        r2 = r2_score(val_data, forecast)

        print(f"   MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")

        return {
            'model': best_model,
            'order': best_order,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'predictions': forecast,
            'actual': val_data,
            'dates': val_dates
        }

    except Exception as e:
        print(f"Error training ARIMA: {e!s}")
        return None


def train_prophet(ts_data, train_size):
    """
    Train Prophet model on time-series data.

    The model is fitted on the first ``train_size`` rows and evaluated on the
    remaining rows. Predictions are requested for the dates actually present
    in ``ts_data`` so that each prediction lines up with its observed value
    even when the series skips calendar days. Predictions are clipped at
    zero before scoring.

    Args:
        ts_data: Time-series DataFrame with 'ds' and 'y' columns, in
            chronological order
        train_size: Number of samples for training

    Returns:
        dict: Keys 'model', 'mae', 'rmse', 'r2', 'predictions', 'actual',
        'dates' and 'full_forecast' (Prophet's forecast frame for every date
        in ``ts_data``); or None when Prophet is unavailable or training
        fails
    """
    if not PROPHET_AVAILABLE:
        return None

    print("\n" + "="*60)
    print("TRAINING PROPHET MODEL")
    print("="*60)

    try:
        # Split data chronologically
        train_data = ts_data.iloc[:train_size].copy()
        val_data = ts_data.iloc[train_size:].copy()

        print(f"Training on {len(train_data)} samples")
        print(f"Validating on {len(val_data)} samples")

        # Initialize and fit Prophet model
        # Weekly and yearly seasonality are enabled; daily (intra-day)
        # seasonality is switched off because the series has one value per day.
        model = Prophet(
            daily_seasonality=False,
            weekly_seasonality=True,
            yearly_seasonality=True,
            seasonality_mode='multiplicative',
            changepoint_prior_scale=0.05
        )

        print("Fitting Prophet model...")
        model.fit(train_data)

        # Predict on the observed dates rather than on a generated daily
        # calendar: make_future_dataframe(freq='D') would emit consecutive
        # days after the last training date, which no longer match the
        # validation rows as soon as the data has a gap.
        future = ts_data[['ds']].copy()

        # Make predictions
        forecast = model.predict(future)

        # Get predictions for validation period
        val_forecast = forecast['yhat'].iloc[train_size:].values
        val_actual = val_data['y'].values

        # Ensure predictions are non-negative
        val_forecast = np.maximum(val_forecast, 0)

        # Calculate metrics
        mae = mean_absolute_error(val_actual, val_forecast)
        rmse = np.sqrt(mean_squared_error(val_actual, val_forecast))
        r2 = r2_score(val_actual, val_forecast)

        print(f"   MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")

        return {
            'model': model,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'predictions': val_forecast,
            'actual': val_actual,
            'dates': val_data['ds'].values,
            'full_forecast': forecast
        }

    except Exception as e:
        print(f"Error training Prophet: {str(e)}")
        import traceback
        traceback.print_exc()
        return None


def select_best_model(results):
    """
    Select the best model based on R2 score (higher is better).

    A comparison table of MAE, RMSE and R2 is printed for every model. A
    model whose R2 is NaN is ranked below every model with a real score, so
    it can only be selected when all scores are NaN.

    Args:
        results: Dictionary mapping model name to a dict with at least the
            keys 'model', 'mae', 'rmse' and 'r2'

    Returns:
        tuple: (best_model_name, best_model, best_metrics)

    Raises:
        ValueError: If ``results`` is empty
    """
    if not results:
        raise ValueError("No trained models to compare: 'results' is empty")

    print("\n" + "="*60)
    print("MODEL COMPARISON")
    print("="*60)

    # Create comparison DataFrame
    comparison_df = pd.DataFrame([
        {
            'Model': model_name,
            'MAE': metrics['mae'],
            'RMSE': metrics['rmse'],
            'R2 Score': metrics['r2']
        }
        for model_name, metrics in results.items()
    ])
    print("\nModel Performance Comparison:")
    print(comparison_df.to_string(index=False))

    def _ranking_score(model_name):
        # Every comparison against NaN is False, so an unguarded max() would
        # keep a NaN entry purely because of its position in the dict.
        r2 = results[model_name]['r2']
        return -np.inf if np.isnan(r2) else r2

    # Select best model based on R2 score
    best_model_name = max(results, key=_ranking_score)
    best_entry = results[best_model_name]
    best_model = best_entry['model']
    best_metrics = {
        'mae': best_entry['mae'],
        'rmse': best_entry['rmse'],
        'r2': best_entry['r2']
    }

    print(f"\n{'='*60}")
    print(f"BEST MODEL: {best_model_name}")
    print(f"MAE: {best_metrics['mae']:.2f}")
    print(f"RMSE: {best_metrics['rmse']:.2f}")
    print(f"R2 Score: {best_metrics['r2']:.4f}")
    print(f"{'='*60}")

    return best_model_name, best_model, best_metrics


def visualize_results(df, results, best_model_name, feature_names):
    """
    Create visualizations: demand trends, feature importance, model comparison.

    Writes up to five PNG files into PLOTS_DIR: demand_trends.png,
    monthly_demand.png, feature_importance.png (only when the best model
    exposes ``feature_importances_``), model_comparison.png and
    timeseries_predictions.png (only when ARIMA and/or Prophet results are
    present). The input DataFrame is not modified.

    Args:
        df: Preprocessed DataFrame with 'date', 'month' and 'sales_quantity'
            columns
        results: Model results dictionary (model name -> dict with 'model',
            'mae', 'rmse', 'r2'; time-series entries also carry 'dates',
            'actual' and 'predictions')
        best_model_name: Name of the best model (a key of ``results``)
        feature_names: List of feature names, in the column order the ML
            models were trained on
    """
    print("\n" + "="*60)
    print("GENERATING VISUALIZATIONS")
    print("="*60)

    # Set style
    sns.set_style("whitegrid")
    plt.rcParams['figure.figsize'] = (12, 6)

    # 1. Demand trends over time
    print("1. Plotting demand trends over time...")
    # Group by a parsed copy of the dates instead of overwriting df['date'],
    # so the caller's frame is left untouched.
    dates = pd.to_datetime(df['date'])
    daily_demand = df['sales_quantity'].groupby(dates).sum().reset_index()

    plt.figure(figsize=(14, 6))
    plt.plot(daily_demand['date'], daily_demand['sales_quantity'], linewidth=1, alpha=0.7)
    plt.title('Total Daily Sales Quantity Over Time', fontsize=16, fontweight='bold')
    plt.xlabel('Date', fontsize=12)
    plt.ylabel('Total Sales Quantity', fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'{PLOTS_DIR}/demand_trends.png', dpi=300, bbox_inches='tight')
    print(f"   Saved: {PLOTS_DIR}/demand_trends.png")
    plt.close()

    # 2. Monthly average demand
    print("2. Plotting monthly average demand...")
    monthly_avg = df.groupby('month')['sales_quantity'].mean().reset_index()
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    # 'month' is 1-based, the list is 0-based
    monthly_avg['month_name'] = monthly_avg['month'].apply(lambda x: month_names[x-1])

    plt.figure(figsize=(12, 6))
    plt.bar(monthly_avg['month_name'], monthly_avg['sales_quantity'], color='steelblue', alpha=0.7)
    plt.title('Average Sales Quantity by Month', fontsize=16, fontweight='bold')
    plt.xlabel('Month', fontsize=12)
    plt.ylabel('Average Sales Quantity', fontsize=12)
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.savefig(f'{PLOTS_DIR}/monthly_demand.png', dpi=300, bbox_inches='tight')
    print(f"   Saved: {PLOTS_DIR}/monthly_demand.png")
    plt.close()

    # 3. Feature importance (for tree-based models)
    print("3. Plotting feature importance...")
    best_model = results[best_model_name]['model']

    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        feature_importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)

        plt.figure(figsize=(10, 6))
        plt.barh(feature_importance_df['feature'], feature_importance_df['importance'], color='coral', alpha=0.7)
        plt.title(f'Feature Importance - {best_model_name}', fontsize=16, fontweight='bold')
        plt.xlabel('Importance', fontsize=12)
        plt.ylabel('Feature', fontsize=12)
        plt.gca().invert_yaxis()  # most important feature at the top
        plt.grid(True, alpha=0.3, axis='x')
        plt.tight_layout()
        plt.savefig(f'{PLOTS_DIR}/feature_importance.png', dpi=300, bbox_inches='tight')
        print(f"   Saved: {PLOTS_DIR}/feature_importance.png")
        plt.close()
    else:
        print("   Feature importance not available for this model type")

    # 4. Model comparison
    print("4. Plotting model comparison...")
    model_names = list(results.keys())
    mae_scores = [results[m]['mae'] for m in model_names]
    rmse_scores = [results[m]['rmse'] for m in model_names]
    r2_scores = [results[m]['r2'] for m in model_names]

    # Time-series models get their own colors and an extra plot below
    ts_models = [m for m in model_names if m in ['ARIMA', 'Prophet']]

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    # Color code: ML models in blue tones, TS models in orange/red tones
    colors = []
    for m in model_names:
        if m in ts_models:
            colors.append('coral' if m == 'ARIMA' else 'salmon')
        else:
            colors.append('skyblue')

    # MAE comparison
    axes[0].bar(model_names, mae_scores, color=colors, alpha=0.7)
    axes[0].set_title('MAE Comparison (Lower is Better)', fontsize=14, fontweight='bold')
    axes[0].set_ylabel('MAE', fontsize=12)
    axes[0].tick_params(axis='x', rotation=45)
    axes[0].grid(True, alpha=0.3, axis='y')
    # Add legend
    from matplotlib.patches import Patch
    legend_elements = [
        Patch(facecolor='skyblue', alpha=0.7, label='ML Models'),
        Patch(facecolor='coral', alpha=0.7, label='Time-Series Models')
    ]
    axes[0].legend(handles=legend_elements, loc='upper right')

    # RMSE comparison
    axes[1].bar(model_names, rmse_scores, color=colors, alpha=0.7)
    axes[1].set_title('RMSE Comparison (Lower is Better)', fontsize=14, fontweight='bold')
    axes[1].set_ylabel('RMSE', fontsize=12)
    axes[1].tick_params(axis='x', rotation=45)
    axes[1].grid(True, alpha=0.3, axis='y')

    # R2 comparison
    axes[2].bar(model_names, r2_scores, color=colors, alpha=0.7)
    axes[2].set_title('R2 Score Comparison (Higher is Better)', fontsize=14, fontweight='bold')
    axes[2].set_ylabel('R2 Score', fontsize=12)
    axes[2].tick_params(axis='x', rotation=45)
    axes[2].grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.savefig(f'{PLOTS_DIR}/model_comparison.png', dpi=300, bbox_inches='tight')
    print(f"   Saved: {PLOTS_DIR}/model_comparison.png")
    plt.close()

    # 5. Time-series predictions plot (if time-series models available)
    if ts_models:
        print("5. Plotting time-series model predictions...")
        fig, axes = plt.subplots(len(ts_models), 1, figsize=(14, 6*len(ts_models)))
        if len(ts_models) == 1:
            # With a single row plt.subplots returns one Axes, not an array
            axes = [axes]

        for idx, model_name in enumerate(ts_models):
            if model_name in results and 'dates' in results[model_name]:
                dates = pd.to_datetime(results[model_name]['dates'])
                actual = results[model_name]['actual']
                predictions = results[model_name]['predictions']

                axes[idx].plot(dates, actual, label='Actual', linewidth=2, alpha=0.7)
                axes[idx].plot(dates, predictions, label='Predicted', linewidth=2, alpha=0.7, linestyle='--')
                axes[idx].set_title(f'{model_name} - Actual vs Predicted', fontsize=14, fontweight='bold')
                axes[idx].set_xlabel('Date', fontsize=12)
                axes[idx].set_ylabel('Sales Quantity', fontsize=12)
                axes[idx].legend()
                axes[idx].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(f'{PLOTS_DIR}/timeseries_predictions.png', dpi=300, bbox_inches='tight')
        print(f"   Saved: {PLOTS_DIR}/timeseries_predictions.png")
        plt.close()

    print("   Visualization complete!")

def save_model(model, encoders, scaler, feature_names, best_model_name, best_metrics):
    """
    Persist the trained model together with its preprocessing objects.

    Three files are written into MODEL_DIR: best_model.joblib (the model),
    preprocessing.joblib (encoders, scaler and feature names) and
    model_metadata.json (model name, metrics, feature names, timestamp).

    Args:
        model: Trained model
        encoders: Dictionary of encoders
        scaler: Fitted scaler
        feature_names: List of feature names
        best_model_name: Name of the best model
        best_metrics: Dictionary of metrics
    """
    import json

    rule = "="*60
    print("\n" + rule)
    print("SAVING MODEL")
    print(rule)

    # Model artifact
    model_path = f'{MODEL_DIR}/best_model.joblib'
    joblib.dump(model, model_path)
    print(f"Model saved to: {model_path}")

    # Everything needed to rebuild the feature matrix at prediction time
    preprocessing_path = f'{MODEL_DIR}/preprocessing.joblib'
    joblib.dump(
        {
            'encoders': encoders,
            'scaler': scaler,
            'feature_names': feature_names
        },
        preprocessing_path,
    )
    print(f"Preprocessing objects saved to: {preprocessing_path}")

    # Human-readable summary of what was saved and when
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    metadata = {
        'model_name': best_model_name,
        'metrics': best_metrics,
        'feature_names': feature_names,
        'saved_at': timestamp
    }

    metadata_path = f'{MODEL_DIR}/model_metadata.json'
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=4)
    print(f"Model metadata saved to: {metadata_path}")

def main():
    """
    Run the full training pipeline end to end.

    Loads and preprocesses the data, trains the ML models on a random 80/20
    split and the time-series models on a chronological split, picks the
    model with the highest R2, writes the plots, and saves the winning model
    plus a JSON summary of every model's metrics.
    """
    import json

    def banner(width, *titles):
        # Blank line, rule, one line per title, rule
        rule = "=" * width
        print("\n" + rule)
        for title in titles:
            print(title)
        print(rule)

    banner(60,
           "DEMAND PREDICTION SYSTEM - MODEL TRAINING",
           "ML Models vs Time-Series Models Comparison")

    # Step 1: Load data
    raw_df = load_data(DATA_PATH)

    # Step 2: Preprocess data
    df_processed = preprocess_data(raw_df)

    # Step 3: Feature engineering for ML models
    X, y, feature_names, encoders, scaler = feature_engineering(df_processed)

    # Step 4: Split data for ML models (random split)
    banner(60, "SPLITTING DATA FOR ML MODELS")
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Validation set: {X_val.shape[0]} samples")

    # Step 5: Train ML models
    banner(70, "TRAINING MACHINE LEARNING MODELS")
    results = train_models(X_train, y_train, X_val, y_val)

    # Step 6: Prepare time-series data
    ts_data, train_size = prepare_time_series_data(df_processed)

    # Step 7: Train time-series models
    banner(70, "TRAINING TIME-SERIES MODELS")

    # ARIMA
    if not ARIMA_AVAILABLE:
        print("\nARIMA skipped (statsmodels not available)")
    else:
        arima_results = train_arima(ts_data, train_size)
        if arima_results:
            results['ARIMA'] = arima_results

    # Prophet
    if not PROPHET_AVAILABLE:
        print("\nProphet skipped (prophet not available)")
    else:
        prophet_results = train_prophet(ts_data, train_size)
        if prophet_results:
            results['Prophet'] = prophet_results

    # Step 8: Select best model (across all model types)
    best_model_name, best_model, best_metrics = select_best_model(results)

    # Step 9: Visualize results
    visualize_results(df_processed, results, best_model_name, feature_names)

    # Step 10: Save the winner. ML models go through save_model(); a
    # time-series winner is dumped on its own, with the ML preprocessing
    # objects stored next to it in case the ML models are used later.
    is_time_series = best_model_name in ['ARIMA', 'Prophet']
    if is_time_series:
        banner(60, "SAVING TIME-SERIES MODEL")
        ts_model_path = f'{MODEL_DIR}/best_timeseries_model.joblib'
        joblib.dump(best_model, ts_model_path)
        print(f"Time-series model saved to: {ts_model_path}")

        preprocessing_path = f'{MODEL_DIR}/preprocessing.joblib'
        joblib.dump(
            {
                'encoders': encoders,
                'scaler': scaler,
                'feature_names': feature_names
            },
            preprocessing_path,
        )
        print(f"ML preprocessing objects saved to: {preprocessing_path}")
    else:
        save_model(best_model, encoders, scaler, feature_names, best_model_name, best_metrics)

    # Save all results metadata (metrics only; models and predictions are
    # left out of the JSON)
    all_models_metadata = {
        'best_model': best_model_name,
        'best_metrics': best_metrics,
        'all_models': {
            model_name: {
                'mae': model_results['mae'],
                'rmse': model_results['rmse'],
                'r2': model_results['r2']
            }
            for model_name, model_results in results.items()
        }
    }
    all_models_metadata['saved_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    metadata_path = f'{MODEL_DIR}/all_models_metadata.json'
    with open(metadata_path, 'w') as f:
        json.dump(all_models_metadata, f, indent=4)
    print(f"All models metadata saved to: {metadata_path}")

    banner(60, "TRAINING COMPLETE!")
    model_type = 'Time-Series' if is_time_series else 'Machine Learning'
    print(f"\nBest model: {best_model_name}")
    print(f"Model type: {model_type}")
    print(f"Model saved to: {MODEL_DIR}/")
    print(f"Visualizations saved to: {PLOTS_DIR}/")
    print("\nYou can now use predict.py to make predictions!")


if __name__ == "__main__":
    main()