|
|
"""
|
|
|
Demand Prediction System - Model Training Script
|
|
|
|
|
|
This script trains multiple machine learning and time-series models to predict
|
|
|
product demand (sales quantity) for an e-commerce platform.
|
|
|
|
|
|
Features:
|
|
|
- Data preprocessing and feature engineering
|
|
|
- Date feature extraction (day, month, day_of_week, weekend)
|
|
|
- Categorical encoding
|
|
|
- Feature scaling
|
|
|
- Multiple ML models (Linear Regression, Random Forest, XGBoost)
|
|
|
- Time-series models (ARIMA, Prophet)
|
|
|
- Model evaluation (MAE, RMSE, R2 Score)
|
|
|
- Automatic best model selection
|
|
|
- Model persistence using joblib
|
|
|
- Visualization of results
|
|
|
- Comparison between ML and time-series approaches
|
|
|
"""
|
|
|
|
|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
import matplotlib.pyplot as plt
|
|
|
import seaborn as sns
|
|
|
from datetime import datetime
|
|
|
import joblib
|
|
|
import os
|
|
|
import warnings
|
|
|
warnings.filterwarnings('ignore')
|
|
|
|
|
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
|
|
from sklearn.linear_model import LinearRegression
|
|
|
from sklearn.ensemble import RandomForestRegressor
|
|
|
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
|
|
|
|
|
|
|
|
try:
|
|
|
import xgboost as xgb
|
|
|
XGBOOST_AVAILABLE = True
|
|
|
except ImportError:
|
|
|
XGBOOST_AVAILABLE = False
|
|
|
print("XGBoost not available. Install with: pip install xgboost")
|
|
|
|
|
|
|
|
|
try:
|
|
|
from statsmodels.tsa.arima.model import ARIMA
|
|
|
from statsmodels.tsa.stattools import adfuller
|
|
|
ARIMA_AVAILABLE = True
|
|
|
except ImportError:
|
|
|
ARIMA_AVAILABLE = False
|
|
|
print("statsmodels not available. Install with: pip install statsmodels")
|
|
|
|
|
|
try:
|
|
|
from prophet import Prophet
|
|
|
PROPHET_AVAILABLE = True
|
|
|
except ImportError:
|
|
|
PROPHET_AVAILABLE = False
|
|
|
print("Prophet not available. Install with: pip install prophet")
|
|
|
|
|
|
|
|
|
# Seed NumPy's global RNG so training runs are reproducible.
np.random.seed(42)

# Input dataset and output directories (relative to the working directory).
# The CSV is expected to contain at least: date, product_id, price, discount,
# category, sales_quantity (see preprocess_data / feature_engineering).
DATA_PATH = 'data/sales.csv'
MODEL_DIR = 'models'   # trained models + metadata
PLOTS_DIR = 'plots'    # generated figures

# Create output directories up front; exist_ok=True makes re-runs safe.
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)
|
|
|
|
|
|
|
|
|
def load_data(file_path):
    """
    Read the sales dataset from a CSV file.

    Args:
        file_path: Path to the CSV file on disk.

    Returns:
        DataFrame: The dataset exactly as parsed by pandas.
    """
    print(f"Loading data from {file_path}...")
    frame = pd.read_csv(file_path)
    print(f"Data loaded successfully! Shape: {frame.shape}")
    return frame
|
|
|
|
|
|
|
|
|
def preprocess_data(df):
    """
    Clean the raw sales data and derive calendar features.

    Converts 'date' to datetime, extracts day / month / day-of-week /
    weekend / year / quarter columns, and fills missing numeric values
    with the column median when any are present.

    Args:
        df: Raw DataFrame containing at least a 'date' column.

    Returns:
        DataFrame: A new preprocessed DataFrame (the input is not modified).
    """
    print("\n" + "="*60)
    print("PREPROCESSING DATA")
    print("="*60)

    # Work on a copy so the caller's frame is left untouched.
    out = df.copy()
    out['date'] = pd.to_datetime(out['date'])

    print("Extracting date features...")
    dt = out['date'].dt
    out['day'] = dt.day
    out['month'] = dt.month
    out['day_of_week'] = dt.dayofweek
    # dayofweek is 0=Monday .. 6=Sunday, so >= 5 marks Sat/Sun.
    out['weekend'] = (out['day_of_week'] >= 5).astype(int)
    out['year'] = dt.year
    out['quarter'] = dt.quarter

    print("\nMissing values:")
    missing_counts = out.isnull().sum()
    print(missing_counts[missing_counts > 0])

    if missing_counts.sum() > 0:
        print("Filling missing values...")
        # Median imputation, numeric columns only.
        out = out.fillna(out.median(numeric_only=True))

    print("\nDataset Info:")
    print(f"Shape: {out.shape}")
    print(f"\nColumns: {out.columns.tolist()}")
    print(f"\nData types:\n{out.dtypes}")
    print(f"\nBasic statistics:\n{out.describe()}")

    return out
|
|
|
|
|
|
|
|
|
def feature_engineering(df):
    """
    Encode categorical columns and standardize the model features.

    Label-encodes 'category' and 'product_id' (raw columns are dropped
    afterwards) and applies a StandardScaler to the full feature matrix.

    Args:
        df: Preprocessed DataFrame (output of preprocess_data).

    Returns:
        tuple: (X_scaled, y, feature_names, encoders, scaler) where
            X_scaled is the standardized feature DataFrame, y is the
            'sales_quantity' target Series, feature_names lists the final
            feature columns, encoders maps names to the fitted
            LabelEncoders (plus the scaler), and scaler is the fitted
            StandardScaler.
    """
    print("\n" + "="*60)
    print("FEATURE ENGINEERING")
    print("="*60)

    feature_columns = ['product_id', 'price', 'discount', 'category',
                       'day', 'month', 'day_of_week', 'weekend', 'year', 'quarter']

    features = df[feature_columns].copy()
    target = df['sales_quantity'].copy()

    print("Encoding categorical variables...")

    # Fit one LabelEncoder per categorical column; encoders are returned
    # so prediction code can apply the identical mapping.
    category_encoder = LabelEncoder()
    features['category_encoded'] = category_encoder.fit_transform(features['category'])

    product_encoder = LabelEncoder()
    features['product_id_encoded'] = product_encoder.fit_transform(features['product_id'])

    # Drop the raw categorical columns, keeping only numeric features.
    features = features.drop(['category', 'product_id'], axis=1)
    feature_names = features.columns.tolist()

    print(f"Features after encoding: {feature_names}")
    print(f"Number of features: {len(feature_names)}")

    print("\nScaling numerical features...")
    scaler = StandardScaler()
    scaled = pd.DataFrame(scaler.fit_transform(features), columns=feature_names)

    # Bundle every fitted preprocessing object for persistence.
    encoders = {
        'category': category_encoder,
        'product_id': product_encoder,
        'scaler': scaler
    }

    return scaled, target, feature_names, encoders, scaler
|
|
|
|
|
|
|
|
|
def _evaluate_model(name, model, X_train, y_train, X_val, y_val, models, results):
    """Fit *model*, score it on the validation split, and record the outcome.

    Stores the fitted estimator in *models* and a result dict (model, mae,
    rmse, r2, predictions) in *results*, then prints a one-line summary.
    Shared by every ML model so the evaluate/record/print logic lives in
    exactly one place.
    """
    model.fit(X_train, y_train)
    pred = model.predict(X_val)

    mae = mean_absolute_error(y_val, pred)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    r2 = r2_score(y_val, pred)

    models[name] = model
    results[name] = {
        'model': model,
        'mae': mae,
        'rmse': rmse,
        'r2': r2,
        'predictions': pred
    }

    print(f"  MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")


def train_models(X_train, y_train, X_val, y_val):
    """
    Train multiple ML models and return their validation performance.

    Trains Linear Regression, Random Forest, and (when installed) XGBoost
    on the same split and evaluates each with MAE, RMSE, and R2.

    Args:
        X_train: Training features.
        y_train: Training target.
        X_val: Validation features.
        y_val: Validation target.

    Returns:
        dict: Mapping of model name -> dict with 'model', 'mae', 'rmse',
            'r2', and 'predictions' entries.
    """
    print("\n" + "="*60)
    print("TRAINING MODELS")
    print("="*60)

    models = {}
    results = {}

    print("\n1. Training Linear Regression...")
    _evaluate_model('Linear Regression', LinearRegression(),
                    X_train, y_train, X_val, y_val, models, results)

    print("\n2. Training Random Forest Regressor...")
    rf_model = RandomForestRegressor(
        n_estimators=100,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )
    _evaluate_model('Random Forest', rf_model,
                    X_train, y_train, X_val, y_val, models, results)

    # XGBoost is optional; XGBOOST_AVAILABLE is set by the import guard at
    # the top of the module.
    if XGBOOST_AVAILABLE:
        print("\n3. Training XGBoost Regressor...")
        xgb_model = xgb.XGBRegressor(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1
        )
        _evaluate_model('XGBoost', xgb_model,
                        X_train, y_train, X_val, y_val, models, results)
    else:
        print("\n3. XGBoost skipped (not available)")

    return results
|
|
|
|
|
|
|
|
|
def prepare_time_series_data(df):
    """
    Aggregate per-row sales into a daily total time series.

    Sums 'sales_quantity' per calendar date, sorts chronologically, and
    renames the columns to the ('ds', 'y') convention expected by Prophet.
    Unlike the original version, the caller's DataFrame is NOT mutated
    (the old code overwrote df['date'] in place).

    Args:
        df: DataFrame with 'date' and 'sales_quantity' columns.

    Returns:
        tuple: (ts_data, train_size) where ts_data is a DataFrame with
            columns 'ds' (datetime) and 'y' (total daily quantity), and
            train_size is the number of leading rows to use for training
            (an 80/20 chronological split).
    """
    print("\n" + "="*60)
    print("PREPARING TIME-SERIES DATA")
    print("="*60)

    # Build the aggregate on a derived frame (df.assign returns a copy),
    # so the caller's 'date' column keeps its original dtype.
    ts_data = (
        df.assign(date=pd.to_datetime(df['date']))
          .groupby('date')['sales_quantity'].sum()
          .reset_index()
          .sort_values('date')
          .reset_index(drop=True)
    )
    ts_data.columns = ['ds', 'y']

    print(f"Time-series data shape: {ts_data.shape}")
    print(f"Date range: {ts_data['ds'].min()} to {ts_data['ds'].max()}")
    print(f"Total days: {len(ts_data)}")

    # Chronological 80/20 split: time-series validation must follow the
    # training window, so no shuffling.
    train_size = int(len(ts_data) * 0.8)

    return ts_data, train_size
|
|
|
|
|
|
|
|
|
def train_arima(ts_data, train_size):
    """
    Train ARIMA model on time-series data.

    Runs a small grid search over candidate (p, d, q) orders, keeps the
    fit with the lowest AIC, then forecasts over the held-out validation
    window and reports MAE / RMSE / R2.

    Args:
        ts_data: Time-series DataFrame with 'ds' (date) and 'y' (quantity)
            columns, sorted chronologically.
        train_size: Number of leading samples for training; the remainder
            becomes the validation window.

    Returns:
        dict: Results with keys 'model', 'order', 'mae', 'rmse', 'r2',
            'predictions', 'actual', and 'dates'; or None when statsmodels
            is unavailable, every candidate order fails, or fitting raises.
    """
    # Bail out early when the optional statsmodels dependency is missing.
    if not ARIMA_AVAILABLE:
        return None

    print("\n" + "="*60)
    print("TRAINING ARIMA MODEL")
    print("="*60)

    try:
        # Chronological split: train on the first train_size points,
        # validate on everything after.
        train_data = ts_data['y'].iloc[:train_size].values
        val_data = ts_data['y'].iloc[train_size:].values
        val_dates = ts_data['ds'].iloc[train_size:].values

        print(f"Training on {len(train_data)} samples")
        print(f"Validating on {len(val_data)} samples")

        # Track the best fit seen so far; lower AIC is better.
        best_aic = np.inf
        best_order = None
        best_model = None

        # Small hand-picked grid of (p, d, q) orders, all using
        # first-order differencing (d=1).
        orders_to_try = [
            (1, 1, 1),
            (2, 1, 2),
            (1, 1, 0),
            (0, 1, 1),
            (2, 1, 1),
            (1, 1, 2),
        ]

        print("Trying different ARIMA orders...")
        for order in orders_to_try:
            try:
                model = ARIMA(train_data, order=order)
                fitted_model = model.fit()
                aic = fitted_model.aic

                if aic < best_aic:
                    best_aic = aic
                    best_order = order
                    best_model = fitted_model
                    print(f"  Order {order}: AIC = {aic:.2f} (best so far)")
                else:
                    print(f"  Order {order}: AIC = {aic:.2f}")
            except Exception as e:
                # Some orders fail to converge on a given series; skip
                # them and try the next candidate.
                print(f"  Order {order}: Failed - {str(e)[:50]}")
                continue

        if best_model is None:
            print("Failed to fit ARIMA model with any order")
            return None

        print(f"\nBest ARIMA order: {best_order} (AIC: {best_aic:.2f})")

        # Forecast exactly as many steps as there are validation points.
        forecast_steps = len(val_data)
        forecast = best_model.forecast(steps=forecast_steps)

        # Demand cannot be negative; clip the forecast at zero.
        forecast = np.maximum(forecast, 0)

        mae = mean_absolute_error(val_data, forecast)
        rmse = np.sqrt(mean_squared_error(val_data, forecast))
        r2 = r2_score(val_data, forecast)

        print(f"  MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")

        return {
            'model': best_model,
            'order': best_order,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'predictions': forecast,
            'actual': val_data,
            'dates': val_dates
        }

    except Exception as e:
        # Best-effort: a failed ARIMA run should not abort the pipeline.
        print(f"Error training ARIMA: {str(e)}")
        return None
|
|
|
|
|
|
|
|
|
def train_prophet(ts_data, train_size):
    """
    Train Prophet model on time-series data.

    Fits Prophet with weekly and yearly seasonality on the first
    train_size rows, forecasts over the validation horizon, and reports
    MAE / RMSE / R2 on the held-out window.

    Args:
        ts_data: Time-series DataFrame with 'ds' (date) and 'y' (quantity)
            columns, sorted chronologically.
        train_size: Number of leading samples for training.

    Returns:
        dict: Results with keys 'model', 'mae', 'rmse', 'r2',
            'predictions', 'actual', 'dates', and 'full_forecast';
            or None when Prophet is unavailable or fitting fails.
    """
    # Bail out early when the optional prophet dependency is missing.
    if not PROPHET_AVAILABLE:
        return None

    print("\n" + "="*60)
    print("TRAINING PROPHET MODEL")
    print("="*60)

    try:
        # Chronological split; Prophet expects the 'ds'/'y' column names.
        train_data = ts_data.iloc[:train_size].copy()
        val_data = ts_data.iloc[train_size:].copy()

        print(f"Training on {len(train_data)} samples")
        print(f"Validating on {len(val_data)} samples")

        # daily_seasonality is off because the series is already a daily
        # aggregate; multiplicative mode lets seasonal swings scale with
        # the trend level.
        model = Prophet(
            daily_seasonality=False,
            weekly_seasonality=True,
            yearly_seasonality=True,
            seasonality_mode='multiplicative',
            changepoint_prior_scale=0.05
        )

        print("Fitting Prophet model...")
        model.fit(train_data)

        # Extend the training dates by len(val_data) daily periods.
        future = model.make_future_dataframe(periods=len(val_data), freq='D')

        forecast = model.predict(future)

        # NOTE(review): slicing iloc[train_size:] assumes the generated
        # future dates align row-for-row with the validation dates, which
        # holds only if the training dates are contiguous daily — confirm
        # against the aggregated series.
        val_forecast = forecast.iloc[train_size:]['yhat'].values
        val_actual = val_data['y'].values

        # Demand cannot be negative; clip the forecast at zero.
        val_forecast = np.maximum(val_forecast, 0)

        mae = mean_absolute_error(val_actual, val_forecast)
        rmse = np.sqrt(mean_squared_error(val_actual, val_forecast))
        r2 = r2_score(val_actual, val_forecast)

        print(f"  MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")

        return {
            'model': model,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'predictions': val_forecast,
            'actual': val_actual,
            'dates': val_data['ds'].values,
            'full_forecast': forecast
        }

    except Exception as e:
        # Best-effort: a failed Prophet run should not abort the pipeline;
        # print the traceback for diagnosis.
        print(f"Error training Prophet: {str(e)}")
        import traceback
        traceback.print_exc()
        return None
|
|
|
|
|
|
|
|
|
def select_best_model(results):
    """
    Pick the model with the highest validation R2 score.

    Prints a MAE/RMSE/R2 comparison table for every trained model, then
    reports and returns the winner.

    Args:
        results: Mapping of model name -> dict with at least 'model',
            'mae', 'rmse', and 'r2' entries.

    Returns:
        tuple: (best_model_name, best_model, best_metrics) where
            best_metrics is a dict with 'mae', 'rmse', and 'r2' keys.
    """
    print("\n" + "="*60)
    print("MODEL COMPARISON")
    print("="*60)

    # Flatten the results into a table for display.
    comparison_df = pd.DataFrame([
        {
            'Model': name,
            'MAE': info['mae'],
            'RMSE': info['rmse'],
            'R2 Score': info['r2'],
        }
        for name, info in results.items()
    ])
    print("\nModel Performance Comparison:")
    print(comparison_df.to_string(index=False))

    # Higher R2 wins; ties resolve to the first-seen model, per max().
    best_model_name = max(results, key=lambda name: results[name]['r2'])
    winner = results[best_model_name]
    best_model = winner['model']
    best_metrics = {key: winner[key] for key in ('mae', 'rmse', 'r2')}

    print(f"\n{'='*60}")
    print(f"BEST MODEL: {best_model_name}")
    print(f"MAE: {best_metrics['mae']:.2f}")
    print(f"RMSE: {best_metrics['rmse']:.2f}")
    print(f"R2 Score: {best_metrics['r2']:.4f}")
    print(f"{'='*60}")

    return best_model_name, best_model, best_metrics
|
|
|
|
|
|
|
|
|
def visualize_results(df, results, best_model_name, feature_names):
    """
    Create visualizations: demand trends, feature importance, model comparison.

    Saves up to five PNG figures under PLOTS_DIR:
      1. demand_trends.png          - total daily sales over time
      2. monthly_demand.png         - average sales per calendar month
      3. feature_importance.png     - only when the best model exposes
                                      feature_importances_ (tree ensembles)
      4. model_comparison.png       - MAE/RMSE/R2 bars for every model
      5. timeseries_predictions.png - actual vs predicted for ARIMA/Prophet

    Args:
        df: Preprocessed DataFrame; must contain 'date', 'month', and
            'sales_quantity'. NOTE(review): this function mutates the
            passed frame (re-parses 'date', adds 'month_name').
        results: Model results dictionary keyed by model name.
        best_model_name: Name of the best model (for feature importance).
        feature_names: Feature names matching the trained ML models.
    """
    print("\n" + "="*60)
    print("GENERATING VISUALIZATIONS")
    print("="*60)

    # Global styling applied to all subsequent figures.
    sns.set_style("whitegrid")
    plt.rcParams['figure.figsize'] = (12, 6)

    # 1. Total demand per day across all products.
    print("1. Plotting demand trends over time...")
    df['date'] = pd.to_datetime(df['date'])
    daily_demand = df.groupby('date')['sales_quantity'].sum().reset_index()

    plt.figure(figsize=(14, 6))
    plt.plot(daily_demand['date'], daily_demand['sales_quantity'], linewidth=1, alpha=0.7)
    plt.title('Total Daily Sales Quantity Over Time', fontsize=16, fontweight='bold')
    plt.xlabel('Date', fontsize=12)
    plt.ylabel('Total Sales Quantity', fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'{PLOTS_DIR}/demand_trends.png', dpi=300, bbox_inches='tight')
    print(f"  Saved: {PLOTS_DIR}/demand_trends.png")
    plt.close()

    # 2. Seasonality view: average demand by calendar month.
    print("2. Plotting monthly average demand...")
    df['month_name'] = pd.to_datetime(df['date']).dt.strftime('%B')
    monthly_avg = df.groupby('month')['sales_quantity'].mean().reset_index()
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    # 'month' is 1-based, so subtract 1 to index the abbreviation list.
    monthly_avg['month_name'] = monthly_avg['month'].apply(lambda x: month_names[x-1])

    plt.figure(figsize=(12, 6))
    plt.bar(monthly_avg['month_name'], monthly_avg['sales_quantity'], color='steelblue', alpha=0.7)
    plt.title('Average Sales Quantity by Month', fontsize=16, fontweight='bold')
    plt.xlabel('Month', fontsize=12)
    plt.ylabel('Average Sales Quantity', fontsize=12)
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.savefig(f'{PLOTS_DIR}/monthly_demand.png', dpi=300, bbox_inches='tight')
    print(f"  Saved: {PLOTS_DIR}/monthly_demand.png")
    plt.close()

    # 3. Feature importance - only defined for models exposing
    #    feature_importances_ (e.g. Random Forest, XGBoost).
    print("3. Plotting feature importance...")
    best_model = results[best_model_name]['model']

    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        feature_importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)

        plt.figure(figsize=(10, 6))
        plt.barh(feature_importance_df['feature'], feature_importance_df['importance'], color='coral', alpha=0.7)
        plt.title(f'Feature Importance - {best_model_name}', fontsize=16, fontweight='bold')
        plt.xlabel('Importance', fontsize=12)
        plt.ylabel('Feature', fontsize=12)
        # Most important feature at the top of the horizontal bars.
        plt.gca().invert_yaxis()
        plt.grid(True, alpha=0.3, axis='x')
        plt.tight_layout()
        plt.savefig(f'{PLOTS_DIR}/feature_importance.png', dpi=300, bbox_inches='tight')
        print(f"  Saved: {PLOTS_DIR}/feature_importance.png")
        plt.close()
    else:
        print("  Feature importance not available for this model type")

    # 4. Side-by-side MAE / RMSE / R2 bars for every trained model.
    print("4. Plotting model comparison...")
    model_names = list(results.keys())
    mae_scores = [results[m]['mae'] for m in model_names]
    rmse_scores = [results[m]['rmse'] for m in model_names]
    r2_scores = [results[m]['r2'] for m in model_names]

    # Split names so time-series models get warm colors, ML models cool.
    ml_models = [m for m in model_names if m not in ['ARIMA', 'Prophet']]
    ts_models = [m for m in model_names if m in ['ARIMA', 'Prophet']]

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    colors = []
    for m in model_names:
        if m in ts_models:
            colors.append('coral' if m == 'ARIMA' else 'salmon')
        else:
            colors.append('skyblue')

    axes[0].bar(model_names, mae_scores, color=colors, alpha=0.7)
    axes[0].set_title('MAE Comparison (Lower is Better)', fontsize=14, fontweight='bold')
    axes[0].set_ylabel('MAE', fontsize=12)
    axes[0].tick_params(axis='x', rotation=45)
    axes[0].grid(True, alpha=0.3, axis='y')

    # Manual legend since bar colors encode model family, not data series.
    from matplotlib.patches import Patch
    legend_elements = [
        Patch(facecolor='skyblue', alpha=0.7, label='ML Models'),
        Patch(facecolor='coral', alpha=0.7, label='Time-Series Models')
    ]
    axes[0].legend(handles=legend_elements, loc='upper right')

    axes[1].bar(model_names, rmse_scores, color=colors, alpha=0.7)
    axes[1].set_title('RMSE Comparison (Lower is Better)', fontsize=14, fontweight='bold')
    axes[1].set_ylabel('RMSE', fontsize=12)
    axes[1].tick_params(axis='x', rotation=45)
    axes[1].grid(True, alpha=0.3, axis='y')

    axes[2].bar(model_names, r2_scores, color=colors, alpha=0.7)
    axes[2].set_title('R2 Score Comparison (Higher is Better)', fontsize=14, fontweight='bold')
    axes[2].set_ylabel('R2 Score', fontsize=12)
    axes[2].tick_params(axis='x', rotation=45)
    axes[2].grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.savefig(f'{PLOTS_DIR}/model_comparison.png', dpi=300, bbox_inches='tight')
    print(f"  Saved: {PLOTS_DIR}/model_comparison.png")
    plt.close()

    # 5. Actual-vs-predicted overlay for each time-series model, if any.
    if ts_models:
        print("5. Plotting time-series model predictions...")
        fig, axes = plt.subplots(len(ts_models), 1, figsize=(14, 6*len(ts_models)))
        # subplots returns a bare Axes (not an array) for a single row;
        # normalize to a list so indexing below works either way.
        if len(ts_models) == 1:
            axes = [axes]

        for idx, model_name in enumerate(ts_models):
            if model_name in results and 'dates' in results[model_name]:
                dates = pd.to_datetime(results[model_name]['dates'])
                actual = results[model_name]['actual']
                predictions = results[model_name]['predictions']

                axes[idx].plot(dates, actual, label='Actual', linewidth=2, alpha=0.7)
                axes[idx].plot(dates, predictions, label='Predicted', linewidth=2, alpha=0.7, linestyle='--')
                axes[idx].set_title(f'{model_name} - Actual vs Predicted', fontsize=14, fontweight='bold')
                axes[idx].set_xlabel('Date', fontsize=12)
                axes[idx].set_ylabel('Sales Quantity', fontsize=12)
                axes[idx].legend()
                axes[idx].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(f'{PLOTS_DIR}/timeseries_predictions.png', dpi=300, bbox_inches='tight')
        print(f"  Saved: {PLOTS_DIR}/timeseries_predictions.png")
        plt.close()

    print("  Visualization complete!")
|
|
|
|
|
|
|
|
|
def save_model(model, encoders, scaler, feature_names, best_model_name, best_metrics):
    """
    Persist the winning model plus everything needed to reuse it.

    Writes three artifacts under MODEL_DIR:
      - best_model.joblib: the fitted estimator
      - preprocessing.joblib: encoders, scaler, and feature name order
      - model_metadata.json: model name, metrics, and save timestamp

    Args:
        model: Trained model to persist.
        encoders: Dictionary of fitted encoders (plus the scaler).
        scaler: Fitted scaler.
        feature_names: Ordered list of feature column names.
        best_model_name: Name of the best model.
        best_metrics: Dictionary of metrics ('mae', 'rmse', 'r2').
    """
    print("\n" + "="*60)
    print("SAVING MODEL")
    print("="*60)

    # Fitted estimator.
    model_path = f'{MODEL_DIR}/best_model.joblib'
    joblib.dump(model, model_path)
    print(f"Model saved to: {model_path}")

    # Preprocessing objects, so prediction code can replay the exact
    # encoding/scaling used at training time.
    preprocessing_path = f'{MODEL_DIR}/preprocessing.joblib'
    joblib.dump(
        {
            'encoders': encoders,
            'scaler': scaler,
            'feature_names': feature_names,
        },
        preprocessing_path,
    )
    print(f"Preprocessing objects saved to: {preprocessing_path}")

    # Human-readable metadata alongside the binary artifacts.
    import json
    metadata_path = f'{MODEL_DIR}/model_metadata.json'
    metadata = {
        'model_name': best_model_name,
        'metrics': best_metrics,
        'feature_names': feature_names,
        'saved_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=4)
    print(f"Model metadata saved to: {metadata_path}")
|
|
|
|
|
|
|
|
|
def main():
    """
    Main function to orchestrate the training pipeline.

    Stages:
      1. Load and preprocess the sales data.
      2. Engineer, encode, and scale the ML features.
      3. Train ML models (Linear Regression, Random Forest, XGBoost).
      4. Train time-series models (ARIMA, Prophet) on daily aggregates.
      5. Pick the best model by R2, generate plots, persist artifacts.
    """
    print("\n" + "="*60)
    print("DEMAND PREDICTION SYSTEM - MODEL TRAINING")
    print("ML Models vs Time-Series Models Comparison")
    print("="*60)

    # Stage 1: load and preprocess.
    df = load_data(DATA_PATH)
    df_processed = preprocess_data(df)

    # Stage 2: feature engineering for the ML models.
    X, y, feature_names, encoders, scaler = feature_engineering(df_processed)

    print("\n" + "="*60)
    print("SPLITTING DATA FOR ML MODELS")
    print("="*60)
    # NOTE(review): the ML models use a random (shuffled) split while the
    # time-series models below use a chronological split, so their metrics
    # are not computed on identical validation sets.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Validation set: {X_val.shape[0]} samples")

    # Stage 3: machine-learning models.
    print("\n" + "="*70)
    print("TRAINING MACHINE LEARNING MODELS")
    print("="*70)
    results = train_models(X_train, y_train, X_val, y_val)

    # Stage 4: time-series models on the daily aggregate series.
    ts_data, train_size = prepare_time_series_data(df_processed)

    print("\n" + "="*70)
    print("TRAINING TIME-SERIES MODELS")
    print("="*70)

    # Each optional model is only attempted when its import succeeded;
    # a None result (fit failure) is silently omitted from the comparison.
    if ARIMA_AVAILABLE:
        arima_results = train_arima(ts_data, train_size)
        if arima_results:
            results['ARIMA'] = arima_results
    else:
        print("\nARIMA skipped (statsmodels not available)")

    if PROPHET_AVAILABLE:
        prophet_results = train_prophet(ts_data, train_size)
        if prophet_results:
            results['Prophet'] = prophet_results
    else:
        print("\nProphet skipped (prophet not available)")

    # Stage 5: compare, visualize, persist.
    best_model_name, best_model, best_metrics = select_best_model(results)

    visualize_results(df_processed, results, best_model_name, feature_names)

    # ML winners are saved with their preprocessing objects via save_model;
    # time-series winners get a separate artifact since they do not use
    # the scaled feature matrix.
    if best_model_name not in ['ARIMA', 'Prophet']:
        save_model(best_model, encoders, scaler, feature_names, best_model_name, best_metrics)
    else:
        print("\n" + "="*60)
        print("SAVING TIME-SERIES MODEL")
        print("="*60)
        ts_model_path = f'{MODEL_DIR}/best_timeseries_model.joblib'
        joblib.dump(best_model, ts_model_path)
        print(f"Time-series model saved to: {ts_model_path}")

        # Still persist the ML preprocessing objects so downstream code
        # can use the ML feature pipeline regardless of the winner.
        preprocessing_path = f'{MODEL_DIR}/preprocessing.joblib'
        preprocessing_data = {
            'encoders': encoders,
            'scaler': scaler,
            'feature_names': feature_names
        }
        joblib.dump(preprocessing_data, preprocessing_path)
        print(f"ML preprocessing objects saved to: {preprocessing_path}")

    # Write a summary of every model's metrics for later inspection.
    import json
    all_models_metadata = {
        'best_model': best_model_name,
        'best_metrics': best_metrics,
        'all_models': {}
    }
    for model_name, model_results in results.items():
        all_models_metadata['all_models'][model_name] = {
            'mae': model_results['mae'],
            'rmse': model_results['rmse'],
            'r2': model_results['r2']
        }
    all_models_metadata['saved_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    metadata_path = f'{MODEL_DIR}/all_models_metadata.json'
    with open(metadata_path, 'w') as f:
        json.dump(all_models_metadata, f, indent=4)
    print(f"All models metadata saved to: {metadata_path}")

    print("\n" + "="*60)
    print("TRAINING COMPLETE!")
    print("="*60)
    print(f"\nBest model: {best_model_name}")
    print(f"Model type: {'Time-Series' if best_model_name in ['ARIMA', 'Prophet'] else 'Machine Learning'}")
    print(f"Model saved to: {MODEL_DIR}/")
    print(f"Visualizations saved to: {PLOTS_DIR}/")
    print("\nYou can now use predict.py to make predictions!")
|
|
|
|
|
|
|
|
|
# Standard script entry point: run the full training pipeline only when
# executed directly, not when imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|