""" Demand Prediction System - Model Training Script This script trains multiple machine learning and time-series models to predict product demand (sales quantity) for an e-commerce platform. Features: - Data preprocessing and feature engineering - Date feature extraction (day, month, day_of_week, weekend) - Categorical encoding - Feature scaling - Multiple ML models (Linear Regression, Random Forest, XGBoost) - Time-series models (ARIMA, Prophet) - Model evaluation (MAE, RMSE, R2 Score) - Automatic best model selection - Model persistence using joblib - Visualization of results - Comparison between ML and time-series approaches """ import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from datetime import datetime import joblib import os import warnings warnings.filterwarnings('ignore') # Machine Learning imports from sklearn.model_selection import train_test_split from sklearn.preprocessing import StandardScaler, LabelEncoder from sklearn.linear_model import LinearRegression from sklearn.ensemble import RandomForestRegressor from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score # Try to import XGBoost (optional) try: import xgboost as xgb XGBOOST_AVAILABLE = True except ImportError: XGBOOST_AVAILABLE = False print("XGBoost not available. Install with: pip install xgboost") # Try to import time-series libraries try: from statsmodels.tsa.arima.model import ARIMA from statsmodels.tsa.stattools import adfuller ARIMA_AVAILABLE = True except ImportError: ARIMA_AVAILABLE = False print("statsmodels not available. Install with: pip install statsmodels") try: from prophet import Prophet PROPHET_AVAILABLE = True except ImportError: PROPHET_AVAILABLE = False print("Prophet not available. 
Install with: pip install prophet") # Set random seeds for reproducibility np.random.seed(42) # Configuration DATA_PATH = 'data/sales.csv' MODEL_DIR = 'models' PLOTS_DIR = 'plots' # Create directories if they don't exist os.makedirs(MODEL_DIR, exist_ok=True) os.makedirs(PLOTS_DIR, exist_ok=True) def load_data(file_path): """ Load the sales dataset from CSV file. Args: file_path: Path to the CSV file Returns: DataFrame: Loaded dataset """ print(f"Loading data from {file_path}...") df = pd.read_csv(file_path) print(f"Data loaded successfully! Shape: {df.shape}") return df def preprocess_data(df): """ Preprocess the data: convert date, extract features, handle missing values. Args: df: Raw DataFrame Returns: DataFrame: Preprocessed DataFrame """ print("\n" + "="*60) print("PREPROCESSING DATA") print("="*60) # Create a copy to avoid modifying original df = df.copy() # Convert date column to datetime df['date'] = pd.to_datetime(df['date']) # Extract date features print("Extracting date features...") df['day'] = df['date'].dt.day df['month'] = df['date'].dt.month df['day_of_week'] = df['date'].dt.dayofweek # 0=Monday, 6=Sunday df['weekend'] = (df['day_of_week'] >= 5).astype(int) # 1 if weekend, 0 otherwise df['year'] = df['date'].dt.year df['quarter'] = df['date'].dt.quarter # Check for missing values print("\nMissing values:") missing = df.isnull().sum() print(missing[missing > 0]) if missing.sum() > 0: print("Filling missing values...") df = df.fillna(df.median(numeric_only=True)) # Display basic statistics print("\nDataset Info:") print(f"Shape: {df.shape}") print(f"\nColumns: {df.columns.tolist()}") print(f"\nData types:\n{df.dtypes}") print(f"\nBasic statistics:\n{df.describe()}") return df def feature_engineering(df): """ Perform feature engineering: encode categorical variables, scale features. 
Args: df: Preprocessed DataFrame Returns: tuple: (X_features, y_target, feature_names, encoders, scaler) """ print("\n" + "="*60) print("FEATURE ENGINEERING") print("="*60) # Separate features and target # Drop original date column (we have extracted features from it) # Keep product_id for now (we'll encode it) feature_columns = ['product_id', 'price', 'discount', 'category', 'day', 'month', 'day_of_week', 'weekend', 'year', 'quarter'] X = df[feature_columns].copy() y = df['sales_quantity'].copy() # Encode categorical variables print("Encoding categorical variables...") # Label encode category category_encoder = LabelEncoder() X['category_encoded'] = category_encoder.fit_transform(X['category']) # Label encode product_id (treating it as categorical) product_encoder = LabelEncoder() X['product_id_encoded'] = product_encoder.fit_transform(X['product_id']) # Drop original categorical columns X = X.drop(['category', 'product_id'], axis=1) # Get feature names feature_names = X.columns.tolist() print(f"Features after encoding: {feature_names}") print(f"Number of features: {len(feature_names)}") # Scale numerical features print("\nScaling numerical features...") scaler = StandardScaler() X_scaled = scaler.fit_transform(X) X_scaled = pd.DataFrame(X_scaled, columns=feature_names) # Store encoders and scaler for later use encoders = { 'category': category_encoder, 'product_id': product_encoder, 'scaler': scaler } return X_scaled, y, feature_names, encoders, scaler def train_models(X_train, y_train, X_val, y_val): """ Train multiple models and return their performance metrics. Args: X_train: Training features y_train: Training target X_val: Validation features y_val: Validation target Returns: dict: Dictionary containing models and their metrics """ print("\n" + "="*60) print("TRAINING MODELS") print("="*60) models = {} results = {} # 1. Linear Regression print("\n1. 
Training Linear Regression...") lr_model = LinearRegression() lr_model.fit(X_train, y_train) lr_pred = lr_model.predict(X_val) lr_mae = mean_absolute_error(y_val, lr_pred) lr_rmse = np.sqrt(mean_squared_error(y_val, lr_pred)) lr_r2 = r2_score(y_val, lr_pred) models['Linear Regression'] = lr_model results['Linear Regression'] = { 'model': lr_model, 'mae': lr_mae, 'rmse': lr_rmse, 'r2': lr_r2, 'predictions': lr_pred } print(f" MAE: {lr_mae:.2f}, RMSE: {lr_rmse:.2f}, R2: {lr_r2:.4f}") # 2. Random Forest Regressor print("\n2. Training Random Forest Regressor...") rf_model = RandomForestRegressor( n_estimators=100, max_depth=15, min_samples_split=5, min_samples_leaf=2, random_state=42, n_jobs=-1 ) rf_model.fit(X_train, y_train) rf_pred = rf_model.predict(X_val) rf_mae = mean_absolute_error(y_val, rf_pred) rf_rmse = np.sqrt(mean_squared_error(y_val, rf_pred)) rf_r2 = r2_score(y_val, rf_pred) models['Random Forest'] = rf_model results['Random Forest'] = { 'model': rf_model, 'mae': rf_mae, 'rmse': rf_rmse, 'r2': rf_r2, 'predictions': rf_pred } print(f" MAE: {rf_mae:.2f}, RMSE: {rf_rmse:.2f}, R2: {rf_r2:.4f}") # 3. XGBoost (if available) if XGBOOST_AVAILABLE: print("\n3. Training XGBoost Regressor...") xgb_model = xgb.XGBRegressor( n_estimators=100, max_depth=6, learning_rate=0.1, random_state=42, n_jobs=-1 ) xgb_model.fit(X_train, y_train) xgb_pred = xgb_model.predict(X_val) xgb_mae = mean_absolute_error(y_val, xgb_pred) xgb_rmse = np.sqrt(mean_squared_error(y_val, xgb_pred)) xgb_r2 = r2_score(y_val, xgb_pred) models['XGBoost'] = xgb_model results['XGBoost'] = { 'model': xgb_model, 'mae': xgb_mae, 'rmse': xgb_rmse, 'r2': xgb_r2, 'predictions': xgb_pred } print(f" MAE: {xgb_mae:.2f}, RMSE: {xgb_rmse:.2f}, R2: {xgb_r2:.4f}") else: print("\n3. XGBoost skipped (not available)") return results def prepare_time_series_data(df): """ Prepare time-series data by aggregating daily sales. 
Args: df: DataFrame with date and sales_quantity columns Returns: tuple: (ts_data, train_size) - time series data and training size """ print("\n" + "="*60) print("PREPARING TIME-SERIES DATA") print("="*60) # Aggregate by date df['date'] = pd.to_datetime(df['date']) ts_data = df.groupby('date')['sales_quantity'].sum().reset_index() ts_data = ts_data.sort_values('date').reset_index(drop=True) ts_data.columns = ['ds', 'y'] # Prophet expects 'ds' and 'y' print(f"Time-series data shape: {ts_data.shape}") print(f"Date range: {ts_data['ds'].min()} to {ts_data['ds'].max()}") print(f"Total days: {len(ts_data)}") # Use 80% for training (chronological split for time-series) train_size = int(len(ts_data) * 0.8) return ts_data, train_size def train_arima(ts_data, train_size): """ Train ARIMA model on time-series data. Args: ts_data: Time-series DataFrame with 'ds' and 'y' columns train_size: Number of samples for training Returns: dict: Model results dictionary """ if not ARIMA_AVAILABLE: return None print("\n" + "="*60) print("TRAINING ARIMA MODEL") print("="*60) try: # Split data chronologically train_data = ts_data['y'].iloc[:train_size].values val_data = ts_data['y'].iloc[train_size:].values val_dates = ts_data['ds'].iloc[train_size:].values print(f"Training on {len(train_data)} samples") print(f"Validating on {len(val_data)} samples") # Try different ARIMA orders (p, d, q) # Start with auto_arima-like approach - try common orders best_aic = np.inf best_order = None best_model = None # Common ARIMA orders to try orders_to_try = [ (1, 1, 1), # Standard ARIMA(1,1,1) (2, 1, 2), # ARIMA(2,1,2) (1, 1, 0), # ARIMA(1,1,0) - AR model (0, 1, 1), # ARIMA(0,1,1) - MA model (2, 1, 1), # ARIMA(2,1,1) (1, 1, 2), # ARIMA(1,1,2) ] print("Trying different ARIMA orders...") for order in orders_to_try: try: model = ARIMA(train_data, order=order) fitted_model = model.fit() aic = fitted_model.aic if aic < best_aic: best_aic = aic best_order = order best_model = fitted_model print(f" Order 
{order}: AIC = {aic:.2f} (best so far)") else: print(f" Order {order}: AIC = {aic:.2f}") except Exception as e: print(f" Order {order}: Failed - {str(e)[:50]}") continue if best_model is None: print("Failed to fit ARIMA model with any order") return None print(f"\nBest ARIMA order: {best_order} (AIC: {best_aic:.2f})") # Make predictions forecast_steps = len(val_data) forecast = best_model.forecast(steps=forecast_steps) # Ensure predictions are non-negative forecast = np.maximum(forecast, 0) # Calculate metrics mae = mean_absolute_error(val_data, forecast) rmse = np.sqrt(mean_squared_error(val_data, forecast)) r2 = r2_score(val_data, forecast) print(f" MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}") return { 'model': best_model, 'order': best_order, 'mae': mae, 'rmse': rmse, 'r2': r2, 'predictions': forecast, 'actual': val_data, 'dates': val_dates } except Exception as e: print(f"Error training ARIMA: {str(e)}") return None def train_prophet(ts_data, train_size): """ Train Prophet model on time-series data. 
Args: ts_data: Time-series DataFrame with 'ds' and 'y' columns train_size: Number of samples for training Returns: dict: Model results dictionary """ if not PROPHET_AVAILABLE: return None print("\n" + "="*60) print("TRAINING PROPHET MODEL") print("="*60) try: # Split data chronologically train_data = ts_data.iloc[:train_size].copy() val_data = ts_data.iloc[train_size:].copy() print(f"Training on {len(train_data)} samples") print(f"Validating on {len(val_data)} samples") # Initialize and fit Prophet model # Enable daily seasonality and weekly/yearly seasonality model = Prophet( daily_seasonality=False, # Disable daily for daily data weekly_seasonality=True, yearly_seasonality=True, seasonality_mode='multiplicative', changepoint_prior_scale=0.05 ) print("Fitting Prophet model...") model.fit(train_data) # Create future dataframe for validation period future = model.make_future_dataframe(periods=len(val_data), freq='D') # Make predictions forecast = model.predict(future) # Get predictions for validation period val_forecast = forecast.iloc[train_size:]['yhat'].values val_actual = val_data['y'].values # Ensure predictions are non-negative val_forecast = np.maximum(val_forecast, 0) # Calculate metrics mae = mean_absolute_error(val_actual, val_forecast) rmse = np.sqrt(mean_squared_error(val_actual, val_forecast)) r2 = r2_score(val_actual, val_forecast) print(f" MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}") return { 'model': model, 'mae': mae, 'rmse': rmse, 'r2': r2, 'predictions': val_forecast, 'actual': val_actual, 'dates': val_data['ds'].values, 'full_forecast': forecast } except Exception as e: print(f"Error training Prophet: {str(e)}") import traceback traceback.print_exc() return None def select_best_model(results): """ Select the best model based on R2 score (higher is better). 
Args: results: Dictionary containing model results Returns: tuple: (best_model_name, best_model, best_metrics) """ print("\n" + "="*60) print("MODEL COMPARISON") print("="*60) # Create comparison DataFrame comparison_data = [] for model_name, metrics in results.items(): comparison_data.append({ 'Model': model_name, 'MAE': metrics['mae'], 'RMSE': metrics['rmse'], 'R2 Score': metrics['r2'] }) comparison_df = pd.DataFrame(comparison_data) print("\nModel Performance Comparison:") print(comparison_df.to_string(index=False)) # Select best model based on R2 score best_model_name = max(results.keys(), key=lambda x: results[x]['r2']) best_model = results[best_model_name]['model'] best_metrics = { 'mae': results[best_model_name]['mae'], 'rmse': results[best_model_name]['rmse'], 'r2': results[best_model_name]['r2'] } print(f"\n{'='*60}") print(f"BEST MODEL: {best_model_name}") print(f"MAE: {best_metrics['mae']:.2f}") print(f"RMSE: {best_metrics['rmse']:.2f}") print(f"R2 Score: {best_metrics['r2']:.4f}") print(f"{'='*60}") return best_model_name, best_model, best_metrics def visualize_results(df, results, best_model_name, feature_names): """ Create visualizations: demand trends, feature importance, model comparison. Args: df: Original DataFrame results: Model results dictionary best_model_name: Name of the best model feature_names: List of feature names """ print("\n" + "="*60) print("GENERATING VISUALIZATIONS") print("="*60) # Set style sns.set_style("whitegrid") plt.rcParams['figure.figsize'] = (12, 6) # 1. Demand trends over time print("1. 
Plotting demand trends over time...") df['date'] = pd.to_datetime(df['date']) daily_demand = df.groupby('date')['sales_quantity'].sum().reset_index() plt.figure(figsize=(14, 6)) plt.plot(daily_demand['date'], daily_demand['sales_quantity'], linewidth=1, alpha=0.7) plt.title('Total Daily Sales Quantity Over Time', fontsize=16, fontweight='bold') plt.xlabel('Date', fontsize=12) plt.ylabel('Total Sales Quantity', fontsize=12) plt.grid(True, alpha=0.3) plt.tight_layout() plt.savefig(f'{PLOTS_DIR}/demand_trends.png', dpi=300, bbox_inches='tight') print(f" Saved: {PLOTS_DIR}/demand_trends.png") plt.close() # 2. Monthly average demand print("2. Plotting monthly average demand...") df['month_name'] = pd.to_datetime(df['date']).dt.strftime('%B') monthly_avg = df.groupby('month')['sales_quantity'].mean().reset_index() month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'] monthly_avg['month_name'] = monthly_avg['month'].apply(lambda x: month_names[x-1]) plt.figure(figsize=(12, 6)) plt.bar(monthly_avg['month_name'], monthly_avg['sales_quantity'], color='steelblue', alpha=0.7) plt.title('Average Sales Quantity by Month', fontsize=16, fontweight='bold') plt.xlabel('Month', fontsize=12) plt.ylabel('Average Sales Quantity', fontsize=12) plt.xticks(rotation=45) plt.grid(True, alpha=0.3, axis='y') plt.tight_layout() plt.savefig(f'{PLOTS_DIR}/monthly_demand.png', dpi=300, bbox_inches='tight') print(f" Saved: {PLOTS_DIR}/monthly_demand.png") plt.close() # 3. Feature importance (for tree-based models) print("3. 
Plotting feature importance...") best_model = results[best_model_name]['model'] if hasattr(best_model, 'feature_importances_'): importances = best_model.feature_importances_ feature_importance_df = pd.DataFrame({ 'feature': feature_names, 'importance': importances }).sort_values('importance', ascending=False) plt.figure(figsize=(10, 6)) plt.barh(feature_importance_df['feature'], feature_importance_df['importance'], color='coral', alpha=0.7) plt.title(f'Feature Importance - {best_model_name}', fontsize=16, fontweight='bold') plt.xlabel('Importance', fontsize=12) plt.ylabel('Feature', fontsize=12) plt.gca().invert_yaxis() plt.grid(True, alpha=0.3, axis='x') plt.tight_layout() plt.savefig(f'{PLOTS_DIR}/feature_importance.png', dpi=300, bbox_inches='tight') print(f" Saved: {PLOTS_DIR}/feature_importance.png") plt.close() else: print(" Feature importance not available for this model type") # 4. Model comparison print("4. Plotting model comparison...") model_names = list(results.keys()) mae_scores = [results[m]['mae'] for m in model_names] rmse_scores = [results[m]['rmse'] for m in model_names] r2_scores = [results[m]['r2'] for m in model_names] # Separate ML and time-series models for visualization ml_models = [m for m in model_names if m not in ['ARIMA', 'Prophet']] ts_models = [m for m in model_names if m in ['ARIMA', 'Prophet']] fig, axes = plt.subplots(1, 3, figsize=(18, 5)) # Color code: ML models in blue tones, TS models in orange/red tones colors = [] for m in model_names: if m in ts_models: colors.append('coral' if m == 'ARIMA' else 'salmon') else: colors.append('skyblue') # MAE comparison axes[0].bar(model_names, mae_scores, color=colors, alpha=0.7) axes[0].set_title('MAE Comparison (Lower is Better)', fontsize=14, fontweight='bold') axes[0].set_ylabel('MAE', fontsize=12) axes[0].tick_params(axis='x', rotation=45) axes[0].grid(True, alpha=0.3, axis='y') # Add legend from matplotlib.patches import Patch legend_elements = [ Patch(facecolor='skyblue', alpha=0.7, 
label='ML Models'), Patch(facecolor='coral', alpha=0.7, label='Time-Series Models') ] axes[0].legend(handles=legend_elements, loc='upper right') # RMSE comparison axes[1].bar(model_names, rmse_scores, color=colors, alpha=0.7) axes[1].set_title('RMSE Comparison (Lower is Better)', fontsize=14, fontweight='bold') axes[1].set_ylabel('RMSE', fontsize=12) axes[1].tick_params(axis='x', rotation=45) axes[1].grid(True, alpha=0.3, axis='y') # R2 comparison axes[2].bar(model_names, r2_scores, color=colors, alpha=0.7) axes[2].set_title('R2 Score Comparison (Higher is Better)', fontsize=14, fontweight='bold') axes[2].set_ylabel('R2 Score', fontsize=12) axes[2].tick_params(axis='x', rotation=45) axes[2].grid(True, alpha=0.3, axis='y') plt.tight_layout() plt.savefig(f'{PLOTS_DIR}/model_comparison.png', dpi=300, bbox_inches='tight') print(f" Saved: {PLOTS_DIR}/model_comparison.png") plt.close() # 5. Time-series predictions plot (if time-series models available) if ts_models: print("5. Plotting time-series model predictions...") fig, axes = plt.subplots(len(ts_models), 1, figsize=(14, 6*len(ts_models))) if len(ts_models) == 1: axes = [axes] for idx, model_name in enumerate(ts_models): if model_name in results and 'dates' in results[model_name]: dates = pd.to_datetime(results[model_name]['dates']) actual = results[model_name]['actual'] predictions = results[model_name]['predictions'] axes[idx].plot(dates, actual, label='Actual', linewidth=2, alpha=0.7) axes[idx].plot(dates, predictions, label='Predicted', linewidth=2, alpha=0.7, linestyle='--') axes[idx].set_title(f'{model_name} - Actual vs Predicted', fontsize=14, fontweight='bold') axes[idx].set_xlabel('Date', fontsize=12) axes[idx].set_ylabel('Sales Quantity', fontsize=12) axes[idx].legend() axes[idx].grid(True, alpha=0.3) plt.tight_layout() plt.savefig(f'{PLOTS_DIR}/timeseries_predictions.png', dpi=300, bbox_inches='tight') print(f" Saved: {PLOTS_DIR}/timeseries_predictions.png") plt.close() print(" Visualization complete!") 
def _save_preprocessing(encoders, scaler, feature_names):
    """Persist encoders, scaler and feature names for reuse at prediction time.

    Shared by save_model() and the time-series branch of main(), which
    previously duplicated this code verbatim.

    Returns:
        str: Path of the saved preprocessing artifact.
    """
    preprocessing_path = f'{MODEL_DIR}/preprocessing.joblib'
    preprocessing_data = {
        'encoders': encoders,
        'scaler': scaler,
        'feature_names': feature_names
    }
    joblib.dump(preprocessing_data, preprocessing_path)
    return preprocessing_path


def save_model(model, encoders, scaler, feature_names, best_model_name, best_metrics):
    """
    Save the trained model and preprocessing objects.

    Args:
        model: Trained model
        encoders: Dictionary of encoders
        scaler: Fitted scaler
        feature_names: List of feature names
        best_model_name: Name of the best model
        best_metrics: Dictionary of metrics
    """
    import json

    print("\n" + "="*60)
    print("SAVING MODEL")
    print("="*60)

    # Save model
    model_path = f'{MODEL_DIR}/best_model.joblib'
    joblib.dump(model, model_path)
    print(f"Model saved to: {model_path}")

    # Save encoders and scaler
    preprocessing_path = _save_preprocessing(encoders, scaler, feature_names)
    print(f"Preprocessing objects saved to: {preprocessing_path}")

    # Save model metadata.
    # Metric values may be numpy scalar types (e.g. np.float32) depending on
    # the model backend; cast to plain float so json.dump can never raise
    # "Object of type ... is not JSON serializable".
    metadata = {
        'model_name': best_model_name,
        'metrics': {k: float(v) for k, v in best_metrics.items()},
        'feature_names': feature_names,
        'saved_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    }
    metadata_path = f'{MODEL_DIR}/model_metadata.json'
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=4)
    print(f"Model metadata saved to: {metadata_path}")


def main():
    """
    Main function to orchestrate the training pipeline.

    Steps: load data -> preprocess -> feature engineering -> ML train/eval ->
    time-series train/eval -> pick best by R2 -> visualize -> persist model,
    preprocessing objects and metadata under MODEL_DIR / PLOTS_DIR.
    """
    import json

    print("\n" + "="*60)
    print("DEMAND PREDICTION SYSTEM - MODEL TRAINING")
    print("ML Models vs Time-Series Models Comparison")
    print("="*60)

    # Step 1: Load data
    df = load_data(DATA_PATH)

    # Step 2: Preprocess data
    df_processed = preprocess_data(df)

    # Step 3: Feature engineering for ML models
    X, y, feature_names, encoders, scaler = feature_engineering(df_processed)

    # Step 4: Split data for ML models (random split)
    print("\n" + "="*60)
    print("SPLITTING DATA FOR ML MODELS")
    print("="*60)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Validation set: {X_val.shape[0]} samples")

    # Step 5: Train ML models
    print("\n" + "="*70)
    print("TRAINING MACHINE LEARNING MODELS")
    print("="*70)
    results = train_models(X_train, y_train, X_val, y_val)

    # Step 6: Prepare time-series data
    ts_data, train_size = prepare_time_series_data(df_processed)

    # Step 7: Train time-series models
    print("\n" + "="*70)
    print("TRAINING TIME-SERIES MODELS")
    print("="*70)

    # Train ARIMA
    if ARIMA_AVAILABLE:
        arima_results = train_arima(ts_data, train_size)
        if arima_results:
            results['ARIMA'] = arima_results
    else:
        print("\nARIMA skipped (statsmodels not available)")

    # Train Prophet
    if PROPHET_AVAILABLE:
        prophet_results = train_prophet(ts_data, train_size)
        if prophet_results:
            results['Prophet'] = prophet_results
    else:
        print("\nProphet skipped (prophet not available)")

    # Step 8: Select best model (across all model types)
    best_model_name, best_model, best_metrics = select_best_model(results)

    # Step 9: Visualize results
    visualize_results(df_processed, results, best_model_name, feature_names)

    # Step 10: Save model (only ML models can be saved with preprocessing)
    # For time-series models, save separately
    if best_model_name not in ['ARIMA', 'Prophet']:
        save_model(best_model, encoders, scaler, feature_names,
                   best_model_name, best_metrics)
    else:
        # Save time-series model separately
        print("\n" + "="*60)
        print("SAVING TIME-SERIES MODEL")
        print("="*60)
        # NOTE(review): joblib-pickling a Prophet model is fragile across
        # versions/platforms; Prophet's own JSON serialization is the
        # recommended mechanism — consider switching.
        ts_model_path = f'{MODEL_DIR}/best_timeseries_model.joblib'
        joblib.dump(best_model, ts_model_path)
        print(f"Time-series model saved to: {ts_model_path}")

        # Also save preprocessing for ML models (in case user wants to use them)
        preprocessing_path = _save_preprocessing(encoders, scaler, feature_names)
        print(f"ML preprocessing objects saved to: {preprocessing_path}")

    # Save all results metadata; cast every metric to plain float so numpy
    # scalar types cannot break JSON serialization.
    all_models_metadata = {
        'best_model': best_model_name,
        'best_metrics': {k: float(v) for k, v in best_metrics.items()},
        'all_models': {}
    }
    for model_name, model_results in results.items():
        all_models_metadata['all_models'][model_name] = {
            'mae': float(model_results['mae']),
            'rmse': float(model_results['rmse']),
            'r2': float(model_results['r2'])
        }
    all_models_metadata['saved_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    metadata_path = f'{MODEL_DIR}/all_models_metadata.json'
    with open(metadata_path, 'w') as f:
        json.dump(all_models_metadata, f, indent=4)
    print(f"All models metadata saved to: {metadata_path}")

    print("\n" + "="*60)
    print("TRAINING COMPLETE!")
    print("="*60)
    print(f"\nBest model: {best_model_name}")
    print(f"Model type: {'Time-Series' if best_model_name in ['ARIMA', 'Prophet'] else 'Machine Learning'}")
    print(f"Model saved to: {MODEL_DIR}/")
    print(f"Visualizations saved to: {PLOTS_DIR}/")
    print("\nYou can now use predict.py to make predictions!")


if __name__ == "__main__":
    main()