|
|
"""
|
|
|
Demand Prediction System - Model Training Script
|
|
|
|
|
|
This script trains multiple machine learning and time-series models to predict
|
|
|
product demand (sales quantity) for an e-commerce platform.
|
|
|
|
|
|
Features:
|
|
|
- Data preprocessing and feature engineering
|
|
|
- Date feature extraction (day, month, day_of_week, weekend)
|
|
|
- Categorical encoding
|
|
|
- Feature scaling
|
|
|
- Multiple ML models (Linear Regression, Random Forest, XGBoost)
|
|
|
- Time-series models (ARIMA, Prophet)
|
|
|
- Model evaluation (MAE, RMSE, R2 Score)
|
|
|
- Automatic best model selection
|
|
|
- Model persistence using joblib
|
|
|
- Visualization of results
|
|
|
- Comparison between ML and time-series approaches
|
|
|
"""
|
|
|
|
|
|
import pandas as pd
|
|
|
import numpy as np
|
|
|
import matplotlib.pyplot as plt
|
|
|
import seaborn as sns
|
|
|
from datetime import datetime
|
|
|
import joblib
|
|
|
import os
|
|
|
import warnings
|
|
|
warnings.filterwarnings('ignore')
|
|
|
|
|
|
|
|
|
from sklearn.model_selection import train_test_split
|
|
|
from sklearn.preprocessing import StandardScaler, LabelEncoder
|
|
|
from sklearn.linear_model import LinearRegression
|
|
|
from sklearn.ensemble import RandomForestRegressor
|
|
|
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
|
|
|
|
|
|
|
|
|
try:
|
|
|
import xgboost as xgb
|
|
|
XGBOOST_AVAILABLE = True
|
|
|
except ImportError:
|
|
|
XGBOOST_AVAILABLE = False
|
|
|
print("XGBoost not available. Install with: pip install xgboost")
|
|
|
|
|
|
|
|
|
try:
|
|
|
from statsmodels.tsa.arima.model import ARIMA
|
|
|
from statsmodels.tsa.stattools import adfuller
|
|
|
ARIMA_AVAILABLE = True
|
|
|
except ImportError:
|
|
|
ARIMA_AVAILABLE = False
|
|
|
print("statsmodels not available. Install with: pip install statsmodels")
|
|
|
|
|
|
try:
|
|
|
from prophet import Prophet
|
|
|
PROPHET_AVAILABLE = True
|
|
|
except ImportError:
|
|
|
PROPHET_AVAILABLE = False
|
|
|
print("Prophet not available. Install with: pip install prophet")
|
|
|
|
|
|
|
|
|
# Seed NumPy's global RNG so training runs are reproducible.
np.random.seed(42)

# Input dataset and output directories (relative to the working directory).
# The CSV is expected to contain at least: date, product_id, price, discount,
# category, sales_quantity (see preprocess_data / feature_engineering).
DATA_PATH = 'data/sales.csv'
MODEL_DIR = 'models'   # trained models + metadata
PLOTS_DIR = 'plots'    # generated figures

# Create output directories up front; exist_ok=True makes re-runs safe.
os.makedirs(MODEL_DIR, exist_ok=True)
os.makedirs(PLOTS_DIR, exist_ok=True)
|
|
|
|
|
|
|
|
|
def load_data(file_path):
    """
    Read the sales dataset from a CSV file.

    Args:
        file_path: Path to the CSV file on disk.

    Returns:
        DataFrame: The dataset exactly as parsed by pandas.
    """
    print(f"Loading data from {file_path}...")
    frame = pd.read_csv(file_path)
    print(f"Data loaded successfully! Shape: {frame.shape}")
    return frame
|
|
|
|
|
|
|
|
|
def preprocess_data(df):
    """
    Clean the raw sales data and derive calendar features.

    Converts 'date' to datetime, extracts day / month / day-of-week /
    weekend / year / quarter columns, and fills missing numeric values
    with the column median when any are present.

    Args:
        df: Raw DataFrame containing at least a 'date' column.

    Returns:
        DataFrame: A new preprocessed DataFrame (the input is not modified).
    """
    print("\n" + "="*60)
    print("PREPROCESSING DATA")
    print("="*60)

    # Work on a copy so the caller's frame is left untouched.
    out = df.copy()
    out['date'] = pd.to_datetime(out['date'])

    print("Extracting date features...")
    dt = out['date'].dt
    out['day'] = dt.day
    out['month'] = dt.month
    out['day_of_week'] = dt.dayofweek
    # dayofweek is 0=Monday .. 6=Sunday, so >= 5 marks Sat/Sun.
    out['weekend'] = (out['day_of_week'] >= 5).astype(int)
    out['year'] = dt.year
    out['quarter'] = dt.quarter

    print("\nMissing values:")
    missing_counts = out.isnull().sum()
    print(missing_counts[missing_counts > 0])

    if missing_counts.sum() > 0:
        print("Filling missing values...")
        # Median imputation, numeric columns only.
        out = out.fillna(out.median(numeric_only=True))

    print("\nDataset Info:")
    print(f"Shape: {out.shape}")
    print(f"\nColumns: {out.columns.tolist()}")
    print(f"\nData types:\n{out.dtypes}")
    print(f"\nBasic statistics:\n{out.describe()}")

    return out
|
|
|
|
|
|
|
|
|
def feature_engineering(df):
    """
    Encode categorical columns and standardize the model features.

    Label-encodes 'category' and 'product_id' (raw columns are dropped
    afterwards) and applies a StandardScaler to the full feature matrix.

    Args:
        df: Preprocessed DataFrame (output of preprocess_data).

    Returns:
        tuple: (X_scaled, y, feature_names, encoders, scaler) where
            X_scaled is the standardized feature DataFrame, y is the
            'sales_quantity' target Series, feature_names lists the final
            feature columns, encoders maps names to the fitted
            LabelEncoders (plus the scaler), and scaler is the fitted
            StandardScaler.
    """
    print("\n" + "="*60)
    print("FEATURE ENGINEERING")
    print("="*60)

    feature_columns = ['product_id', 'price', 'discount', 'category',
                       'day', 'month', 'day_of_week', 'weekend', 'year', 'quarter']

    features = df[feature_columns].copy()
    target = df['sales_quantity'].copy()

    print("Encoding categorical variables...")

    # Fit one LabelEncoder per categorical column; encoders are returned
    # so prediction code can apply the identical mapping.
    category_encoder = LabelEncoder()
    features['category_encoded'] = category_encoder.fit_transform(features['category'])

    product_encoder = LabelEncoder()
    features['product_id_encoded'] = product_encoder.fit_transform(features['product_id'])

    # Drop the raw categorical columns, keeping only numeric features.
    features = features.drop(['category', 'product_id'], axis=1)
    feature_names = features.columns.tolist()

    print(f"Features after encoding: {feature_names}")
    print(f"Number of features: {len(feature_names)}")

    print("\nScaling numerical features...")
    scaler = StandardScaler()
    scaled = pd.DataFrame(scaler.fit_transform(features), columns=feature_names)

    # Bundle every fitted preprocessing object for persistence.
    encoders = {
        'category': category_encoder,
        'product_id': product_encoder,
        'scaler': scaler
    }

    return scaled, target, feature_names, encoders, scaler
|
|
|
|
|
|
|
|
|
def _evaluate_model(name, model, X_train, y_train, X_val, y_val, models, results):
    """Fit *model*, score it on the validation split, and record the outcome.

    Stores the fitted estimator in *models* and a result dict (model, mae,
    rmse, r2, predictions) in *results*, then prints a one-line summary.
    Shared by every ML model so the evaluate/record/print logic lives in
    exactly one place.
    """
    model.fit(X_train, y_train)
    pred = model.predict(X_val)

    mae = mean_absolute_error(y_val, pred)
    rmse = np.sqrt(mean_squared_error(y_val, pred))
    r2 = r2_score(y_val, pred)

    models[name] = model
    results[name] = {
        'model': model,
        'mae': mae,
        'rmse': rmse,
        'r2': r2,
        'predictions': pred
    }

    print(f"  MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")


def train_models(X_train, y_train, X_val, y_val):
    """
    Train multiple ML models and return their validation performance.

    Trains Linear Regression, Random Forest, and (when installed) XGBoost
    on the same split and evaluates each with MAE, RMSE, and R2.

    Args:
        X_train: Training features.
        y_train: Training target.
        X_val: Validation features.
        y_val: Validation target.

    Returns:
        dict: Mapping of model name -> dict with 'model', 'mae', 'rmse',
            'r2', and 'predictions' entries.
    """
    print("\n" + "="*60)
    print("TRAINING MODELS")
    print("="*60)

    models = {}
    results = {}

    print("\n1. Training Linear Regression...")
    _evaluate_model('Linear Regression', LinearRegression(),
                    X_train, y_train, X_val, y_val, models, results)

    print("\n2. Training Random Forest Regressor...")
    rf_model = RandomForestRegressor(
        n_estimators=100,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )
    _evaluate_model('Random Forest', rf_model,
                    X_train, y_train, X_val, y_val, models, results)

    # XGBoost is optional; XGBOOST_AVAILABLE is set by the import guard at
    # the top of the module.
    if XGBOOST_AVAILABLE:
        print("\n3. Training XGBoost Regressor...")
        xgb_model = xgb.XGBRegressor(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1
        )
        _evaluate_model('XGBoost', xgb_model,
                        X_train, y_train, X_val, y_val, models, results)
    else:
        print("\n3. XGBoost skipped (not available)")

    return results
|
|
|
|
|
|
|
|
|
def prepare_time_series_data(df):
    """
    Aggregate per-row sales into a daily total time series.

    Sums 'sales_quantity' per calendar date, sorts chronologically, and
    renames the columns to the ('ds', 'y') convention expected by Prophet.
    Unlike the original version, the caller's DataFrame is NOT mutated
    (the old code overwrote df['date'] in place).

    Args:
        df: DataFrame with 'date' and 'sales_quantity' columns.

    Returns:
        tuple: (ts_data, train_size) where ts_data is a DataFrame with
            columns 'ds' (datetime) and 'y' (total daily quantity), and
            train_size is the number of leading rows to use for training
            (an 80/20 chronological split).
    """
    print("\n" + "="*60)
    print("PREPARING TIME-SERIES DATA")
    print("="*60)

    # Build the aggregate on a derived frame (df.assign returns a copy),
    # so the caller's 'date' column keeps its original dtype.
    ts_data = (
        df.assign(date=pd.to_datetime(df['date']))
          .groupby('date')['sales_quantity'].sum()
          .reset_index()
          .sort_values('date')
          .reset_index(drop=True)
    )
    ts_data.columns = ['ds', 'y']

    print(f"Time-series data shape: {ts_data.shape}")
    print(f"Date range: {ts_data['ds'].min()} to {ts_data['ds'].max()}")
    print(f"Total days: {len(ts_data)}")

    # Chronological 80/20 split: time-series validation must follow the
    # training window, so no shuffling.
    train_size = int(len(ts_data) * 0.8)

    return ts_data, train_size
|
|
|
|
|
|
|
|
|
def train_arima(ts_data, train_size):
    """
    Train ARIMA model on time-series data.

    Runs a small grid search over candidate (p, d, q) orders, keeps the
    fit with the lowest AIC, then forecasts over the held-out validation
    window and reports MAE / RMSE / R2.

    Args:
        ts_data: Time-series DataFrame with 'ds' (date) and 'y' (quantity)
            columns, sorted chronologically.
        train_size: Number of leading samples for training; the remainder
            becomes the validation window.

    Returns:
        dict: Results with keys 'model', 'order', 'mae', 'rmse', 'r2',
            'predictions', 'actual', and 'dates'; or None when statsmodels
            is unavailable, every candidate order fails, or fitting raises.
    """
    # Bail out early when the optional statsmodels dependency is missing.
    if not ARIMA_AVAILABLE:
        return None

    print("\n" + "="*60)
    print("TRAINING ARIMA MODEL")
    print("="*60)

    try:
        # Chronological split: train on the first train_size points,
        # validate on everything after.
        train_data = ts_data['y'].iloc[:train_size].values
        val_data = ts_data['y'].iloc[train_size:].values
        val_dates = ts_data['ds'].iloc[train_size:].values

        print(f"Training on {len(train_data)} samples")
        print(f"Validating on {len(val_data)} samples")

        # Track the best fit seen so far; lower AIC is better.
        best_aic = np.inf
        best_order = None
        best_model = None

        # Small hand-picked grid of (p, d, q) orders, all using
        # first-order differencing (d=1).
        orders_to_try = [
            (1, 1, 1),
            (2, 1, 2),
            (1, 1, 0),
            (0, 1, 1),
            (2, 1, 1),
            (1, 1, 2),
        ]

        print("Trying different ARIMA orders...")
        for order in orders_to_try:
            try:
                model = ARIMA(train_data, order=order)
                fitted_model = model.fit()
                aic = fitted_model.aic

                if aic < best_aic:
                    best_aic = aic
                    best_order = order
                    best_model = fitted_model
                    print(f"  Order {order}: AIC = {aic:.2f} (best so far)")
                else:
                    print(f"  Order {order}: AIC = {aic:.2f}")
            except Exception as e:
                # Some orders fail to converge on a given series; skip
                # them and try the next candidate.
                print(f"  Order {order}: Failed - {str(e)[:50]}")
                continue

        if best_model is None:
            print("Failed to fit ARIMA model with any order")
            return None

        print(f"\nBest ARIMA order: {best_order} (AIC: {best_aic:.2f})")

        # Forecast exactly as many steps as there are validation points.
        forecast_steps = len(val_data)
        forecast = best_model.forecast(steps=forecast_steps)

        # Demand cannot be negative; clip the forecast at zero.
        forecast = np.maximum(forecast, 0)

        mae = mean_absolute_error(val_data, forecast)
        rmse = np.sqrt(mean_squared_error(val_data, forecast))
        r2 = r2_score(val_data, forecast)

        print(f"  MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")

        return {
            'model': best_model,
            'order': best_order,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'predictions': forecast,
            'actual': val_data,
            'dates': val_dates
        }

    except Exception as e:
        # Best-effort: a failed ARIMA run should not abort the pipeline.
        print(f"Error training ARIMA: {str(e)}")
        return None
|
|
|
|
|
|
|
|
|
def train_prophet(ts_data, train_size):
    """
    Train Prophet model on time-series data.

    Fits Prophet with weekly and yearly seasonality on the first
    train_size rows, forecasts over the validation horizon, and reports
    MAE / RMSE / R2 on the held-out window.

    Args:
        ts_data: Time-series DataFrame with 'ds' (date) and 'y' (quantity)
            columns, sorted chronologically.
        train_size: Number of leading samples for training.

    Returns:
        dict: Results with keys 'model', 'mae', 'rmse', 'r2',
            'predictions', 'actual', 'dates', and 'full_forecast';
            or None when Prophet is unavailable or fitting fails.
    """
    # Bail out early when the optional prophet dependency is missing.
    if not PROPHET_AVAILABLE:
        return None

    print("\n" + "="*60)
    print("TRAINING PROPHET MODEL")
    print("="*60)

    try:
        # Chronological split; Prophet expects the 'ds'/'y' column names.
        train_data = ts_data.iloc[:train_size].copy()
        val_data = ts_data.iloc[train_size:].copy()

        print(f"Training on {len(train_data)} samples")
        print(f"Validating on {len(val_data)} samples")

        # daily_seasonality is off because the series is already a daily
        # aggregate; multiplicative mode lets seasonal swings scale with
        # the trend level.
        model = Prophet(
            daily_seasonality=False,
            weekly_seasonality=True,
            yearly_seasonality=True,
            seasonality_mode='multiplicative',
            changepoint_prior_scale=0.05
        )

        print("Fitting Prophet model...")
        model.fit(train_data)

        # Extend the training dates by len(val_data) daily periods.
        future = model.make_future_dataframe(periods=len(val_data), freq='D')

        forecast = model.predict(future)

        # NOTE(review): slicing iloc[train_size:] assumes the generated
        # future dates align row-for-row with the validation dates, which
        # holds only if the training dates are contiguous daily — confirm
        # against the aggregated series.
        val_forecast = forecast.iloc[train_size:]['yhat'].values
        val_actual = val_data['y'].values

        # Demand cannot be negative; clip the forecast at zero.
        val_forecast = np.maximum(val_forecast, 0)

        mae = mean_absolute_error(val_actual, val_forecast)
        rmse = np.sqrt(mean_squared_error(val_actual, val_forecast))
        r2 = r2_score(val_actual, val_forecast)

        print(f"  MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")

        return {
            'model': model,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'predictions': val_forecast,
            'actual': val_actual,
            'dates': val_data['ds'].values,
            'full_forecast': forecast
        }

    except Exception as e:
        # Best-effort: a failed Prophet run should not abort the pipeline;
        # print the traceback for diagnosis.
        print(f"Error training Prophet: {str(e)}")
        import traceback
        traceback.print_exc()
        return None
|
|
|
|
|
|
|
|
|
def select_best_model(results):
    """
    Pick the model with the highest validation R2 score.

    Prints a MAE/RMSE/R2 comparison table for every trained model, then
    reports and returns the winner.

    Args:
        results: Mapping of model name -> dict with at least 'model',
            'mae', 'rmse', and 'r2' entries.

    Returns:
        tuple: (best_model_name, best_model, best_metrics) where
            best_metrics is a dict with 'mae', 'rmse', and 'r2' keys.
    """
    print("\n" + "="*60)
    print("MODEL COMPARISON")
    print("="*60)

    # Flatten the results into a table for display.
    comparison_df = pd.DataFrame([
        {
            'Model': name,
            'MAE': info['mae'],
            'RMSE': info['rmse'],
            'R2 Score': info['r2'],
        }
        for name, info in results.items()
    ])
    print("\nModel Performance Comparison:")
    print(comparison_df.to_string(index=False))

    # Higher R2 wins; ties resolve to the first-seen model, per max().
    best_model_name = max(results, key=lambda name: results[name]['r2'])
    winner = results[best_model_name]
    best_model = winner['model']
    best_metrics = {key: winner[key] for key in ('mae', 'rmse', 'r2')}

    print(f"\n{'='*60}")
    print(f"BEST MODEL: {best_model_name}")
    print(f"MAE: {best_metrics['mae']:.2f}")
    print(f"RMSE: {best_metrics['rmse']:.2f}")
    print(f"R2 Score: {best_metrics['r2']:.4f}")
    print(f"{'='*60}")

    return best_model_name, best_model, best_metrics
|
|
|
|
|
|
|
|
|
def visualize_results(df, results, best_model_name, feature_names):
    """
    Create visualizations: demand trends, feature importance, model comparison.

    Saves up to five PNG figures under PLOTS_DIR:
      1. demand_trends.png          - total daily sales over time
      2. monthly_demand.png         - average sales per calendar month
      3. feature_importance.png     - only when the best model exposes
                                      feature_importances_ (tree ensembles)
      4. model_comparison.png       - MAE/RMSE/R2 bars for every model
      5. timeseries_predictions.png - actual vs predicted for ARIMA/Prophet

    Args:
        df: Preprocessed DataFrame; must contain 'date', 'month', and
            'sales_quantity'. NOTE(review): this function mutates the
            passed frame (re-parses 'date', adds 'month_name').
        results: Model results dictionary keyed by model name.
        best_model_name: Name of the best model (for feature importance).
        feature_names: Feature names matching the trained ML models.
    """
    print("\n" + "="*60)
    print("GENERATING VISUALIZATIONS")
    print("="*60)

    # Global styling applied to all subsequent figures.
    sns.set_style("whitegrid")
    plt.rcParams['figure.figsize'] = (12, 6)

    # 1. Total demand per day across all products.
    print("1. Plotting demand trends over time...")
    df['date'] = pd.to_datetime(df['date'])
    daily_demand = df.groupby('date')['sales_quantity'].sum().reset_index()

    plt.figure(figsize=(14, 6))
    plt.plot(daily_demand['date'], daily_demand['sales_quantity'], linewidth=1, alpha=0.7)
    plt.title('Total Daily Sales Quantity Over Time', fontsize=16, fontweight='bold')
    plt.xlabel('Date', fontsize=12)
    plt.ylabel('Total Sales Quantity', fontsize=12)
    plt.grid(True, alpha=0.3)
    plt.tight_layout()
    plt.savefig(f'{PLOTS_DIR}/demand_trends.png', dpi=300, bbox_inches='tight')
    print(f"  Saved: {PLOTS_DIR}/demand_trends.png")
    plt.close()

    # 2. Seasonality view: average demand by calendar month.
    print("2. Plotting monthly average demand...")
    df['month_name'] = pd.to_datetime(df['date']).dt.strftime('%B')
    monthly_avg = df.groupby('month')['sales_quantity'].mean().reset_index()
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    # 'month' is 1-based, so subtract 1 to index the abbreviation list.
    monthly_avg['month_name'] = monthly_avg['month'].apply(lambda x: month_names[x-1])

    plt.figure(figsize=(12, 6))
    plt.bar(monthly_avg['month_name'], monthly_avg['sales_quantity'], color='steelblue', alpha=0.7)
    plt.title('Average Sales Quantity by Month', fontsize=16, fontweight='bold')
    plt.xlabel('Month', fontsize=12)
    plt.ylabel('Average Sales Quantity', fontsize=12)
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3, axis='y')
    plt.tight_layout()
    plt.savefig(f'{PLOTS_DIR}/monthly_demand.png', dpi=300, bbox_inches='tight')
    print(f"  Saved: {PLOTS_DIR}/monthly_demand.png")
    plt.close()

    # 3. Feature importance - only defined for models exposing
    #    feature_importances_ (e.g. Random Forest, XGBoost).
    print("3. Plotting feature importance...")
    best_model = results[best_model_name]['model']

    if hasattr(best_model, 'feature_importances_'):
        importances = best_model.feature_importances_
        feature_importance_df = pd.DataFrame({
            'feature': feature_names,
            'importance': importances
        }).sort_values('importance', ascending=False)

        plt.figure(figsize=(10, 6))
        plt.barh(feature_importance_df['feature'], feature_importance_df['importance'], color='coral', alpha=0.7)
        plt.title(f'Feature Importance - {best_model_name}', fontsize=16, fontweight='bold')
        plt.xlabel('Importance', fontsize=12)
        plt.ylabel('Feature', fontsize=12)
        # Most important feature at the top of the horizontal bars.
        plt.gca().invert_yaxis()
        plt.grid(True, alpha=0.3, axis='x')
        plt.tight_layout()
        plt.savefig(f'{PLOTS_DIR}/feature_importance.png', dpi=300, bbox_inches='tight')
        print(f"  Saved: {PLOTS_DIR}/feature_importance.png")
        plt.close()
    else:
        print("  Feature importance not available for this model type")

    # 4. Side-by-side MAE / RMSE / R2 bars for every trained model.
    print("4. Plotting model comparison...")
    model_names = list(results.keys())
    mae_scores = [results[m]['mae'] for m in model_names]
    rmse_scores = [results[m]['rmse'] for m in model_names]
    r2_scores = [results[m]['r2'] for m in model_names]

    # Split names so time-series models get warm colors, ML models cool.
    ml_models = [m for m in model_names if m not in ['ARIMA', 'Prophet']]
    ts_models = [m for m in model_names if m in ['ARIMA', 'Prophet']]

    fig, axes = plt.subplots(1, 3, figsize=(18, 5))

    colors = []
    for m in model_names:
        if m in ts_models:
            colors.append('coral' if m == 'ARIMA' else 'salmon')
        else:
            colors.append('skyblue')

    axes[0].bar(model_names, mae_scores, color=colors, alpha=0.7)
    axes[0].set_title('MAE Comparison (Lower is Better)', fontsize=14, fontweight='bold')
    axes[0].set_ylabel('MAE', fontsize=12)
    axes[0].tick_params(axis='x', rotation=45)
    axes[0].grid(True, alpha=0.3, axis='y')

    # Manual legend since bar colors encode model family, not data series.
    from matplotlib.patches import Patch
    legend_elements = [
        Patch(facecolor='skyblue', alpha=0.7, label='ML Models'),
        Patch(facecolor='coral', alpha=0.7, label='Time-Series Models')
    ]
    axes[0].legend(handles=legend_elements, loc='upper right')

    axes[1].bar(model_names, rmse_scores, color=colors, alpha=0.7)
    axes[1].set_title('RMSE Comparison (Lower is Better)', fontsize=14, fontweight='bold')
    axes[1].set_ylabel('RMSE', fontsize=12)
    axes[1].tick_params(axis='x', rotation=45)
    axes[1].grid(True, alpha=0.3, axis='y')

    axes[2].bar(model_names, r2_scores, color=colors, alpha=0.7)
    axes[2].set_title('R2 Score Comparison (Higher is Better)', fontsize=14, fontweight='bold')
    axes[2].set_ylabel('R2 Score', fontsize=12)
    axes[2].tick_params(axis='x', rotation=45)
    axes[2].grid(True, alpha=0.3, axis='y')

    plt.tight_layout()
    plt.savefig(f'{PLOTS_DIR}/model_comparison.png', dpi=300, bbox_inches='tight')
    print(f"  Saved: {PLOTS_DIR}/model_comparison.png")
    plt.close()

    # 5. Actual-vs-predicted overlay for each time-series model, if any.
    if ts_models:
        print("5. Plotting time-series model predictions...")
        fig, axes = plt.subplots(len(ts_models), 1, figsize=(14, 6*len(ts_models)))
        # subplots returns a bare Axes (not an array) for a single row;
        # normalize to a list so indexing below works either way.
        if len(ts_models) == 1:
            axes = [axes]

        for idx, model_name in enumerate(ts_models):
            if model_name in results and 'dates' in results[model_name]:
                dates = pd.to_datetime(results[model_name]['dates'])
                actual = results[model_name]['actual']
                predictions = results[model_name]['predictions']

                axes[idx].plot(dates, actual, label='Actual', linewidth=2, alpha=0.7)
                axes[idx].plot(dates, predictions, label='Predicted', linewidth=2, alpha=0.7, linestyle='--')
                axes[idx].set_title(f'{model_name} - Actual vs Predicted', fontsize=14, fontweight='bold')
                axes[idx].set_xlabel('Date', fontsize=12)
                axes[idx].set_ylabel('Sales Quantity', fontsize=12)
                axes[idx].legend()
                axes[idx].grid(True, alpha=0.3)

        plt.tight_layout()
        plt.savefig(f'{PLOTS_DIR}/timeseries_predictions.png', dpi=300, bbox_inches='tight')
        print(f"  Saved: {PLOTS_DIR}/timeseries_predictions.png")
        plt.close()

    print("  Visualization complete!")
|
|
|
|
|
|
|
|
|
def save_model(model, encoders, scaler, feature_names, best_model_name, best_metrics):
    """
    Persist the winning model plus everything needed to reuse it.

    Writes three artifacts under MODEL_DIR:
      - best_model.joblib: the fitted estimator
      - preprocessing.joblib: encoders, scaler, and feature name order
      - model_metadata.json: model name, metrics, and save timestamp

    Args:
        model: Trained model to persist.
        encoders: Dictionary of fitted encoders (plus the scaler).
        scaler: Fitted scaler.
        feature_names: Ordered list of feature column names.
        best_model_name: Name of the best model.
        best_metrics: Dictionary of metrics ('mae', 'rmse', 'r2').
    """
    print("\n" + "="*60)
    print("SAVING MODEL")
    print("="*60)

    # Fitted estimator.
    model_path = f'{MODEL_DIR}/best_model.joblib'
    joblib.dump(model, model_path)
    print(f"Model saved to: {model_path}")

    # Preprocessing objects, so prediction code can replay the exact
    # encoding/scaling used at training time.
    preprocessing_path = f'{MODEL_DIR}/preprocessing.joblib'
    joblib.dump(
        {
            'encoders': encoders,
            'scaler': scaler,
            'feature_names': feature_names,
        },
        preprocessing_path,
    )
    print(f"Preprocessing objects saved to: {preprocessing_path}")

    # Human-readable metadata alongside the binary artifacts.
    import json
    metadata_path = f'{MODEL_DIR}/model_metadata.json'
    metadata = {
        'model_name': best_model_name,
        'metrics': best_metrics,
        'feature_names': feature_names,
        'saved_at': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
    }
    with open(metadata_path, 'w') as f:
        json.dump(metadata, f, indent=4)
    print(f"Model metadata saved to: {metadata_path}")
|
|
|
|
|
|
|
|
|
def main():
    """
    Main function to orchestrate the training pipeline.

    Stages:
      1. Load and preprocess the sales data.
      2. Engineer, encode, and scale the ML features.
      3. Train ML models (Linear Regression, Random Forest, XGBoost).
      4. Train time-series models (ARIMA, Prophet) on daily aggregates.
      5. Pick the best model by R2, generate plots, persist artifacts.
    """
    print("\n" + "="*60)
    print("DEMAND PREDICTION SYSTEM - MODEL TRAINING")
    print("ML Models vs Time-Series Models Comparison")
    print("="*60)

    # Stage 1: load and preprocess.
    df = load_data(DATA_PATH)
    df_processed = preprocess_data(df)

    # Stage 2: feature engineering for the ML models.
    X, y, feature_names, encoders, scaler = feature_engineering(df_processed)

    print("\n" + "="*60)
    print("SPLITTING DATA FOR ML MODELS")
    print("="*60)
    # NOTE(review): the ML models use a random (shuffled) split while the
    # time-series models below use a chronological split, so their metrics
    # are not computed on identical validation sets.
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Validation set: {X_val.shape[0]} samples")

    # Stage 3: machine-learning models.
    print("\n" + "="*70)
    print("TRAINING MACHINE LEARNING MODELS")
    print("="*70)
    results = train_models(X_train, y_train, X_val, y_val)

    # Stage 4: time-series models on the daily aggregate series.
    ts_data, train_size = prepare_time_series_data(df_processed)

    print("\n" + "="*70)
    print("TRAINING TIME-SERIES MODELS")
    print("="*70)

    # Each optional model is only attempted when its import succeeded;
    # a None result (fit failure) is silently omitted from the comparison.
    if ARIMA_AVAILABLE:
        arima_results = train_arima(ts_data, train_size)
        if arima_results:
            results['ARIMA'] = arima_results
    else:
        print("\nARIMA skipped (statsmodels not available)")

    if PROPHET_AVAILABLE:
        prophet_results = train_prophet(ts_data, train_size)
        if prophet_results:
            results['Prophet'] = prophet_results
    else:
        print("\nProphet skipped (prophet not available)")

    # Stage 5: compare, visualize, persist.
    best_model_name, best_model, best_metrics = select_best_model(results)

    visualize_results(df_processed, results, best_model_name, feature_names)

    # ML winners are saved with their preprocessing objects via save_model;
    # time-series winners get a separate artifact since they do not use
    # the scaled feature matrix.
    if best_model_name not in ['ARIMA', 'Prophet']:
        save_model(best_model, encoders, scaler, feature_names, best_model_name, best_metrics)
    else:
        print("\n" + "="*60)
        print("SAVING TIME-SERIES MODEL")
        print("="*60)
        ts_model_path = f'{MODEL_DIR}/best_timeseries_model.joblib'
        joblib.dump(best_model, ts_model_path)
        print(f"Time-series model saved to: {ts_model_path}")

        # Still persist the ML preprocessing objects so downstream code
        # can use the ML feature pipeline regardless of the winner.
        preprocessing_path = f'{MODEL_DIR}/preprocessing.joblib'
        preprocessing_data = {
            'encoders': encoders,
            'scaler': scaler,
            'feature_names': feature_names
        }
        joblib.dump(preprocessing_data, preprocessing_path)
        print(f"ML preprocessing objects saved to: {preprocessing_path}")

    # Write a summary of every model's metrics for later inspection.
    import json
    all_models_metadata = {
        'best_model': best_model_name,
        'best_metrics': best_metrics,
        'all_models': {}
    }
    for model_name, model_results in results.items():
        all_models_metadata['all_models'][model_name] = {
            'mae': model_results['mae'],
            'rmse': model_results['rmse'],
            'r2': model_results['r2']
        }
    all_models_metadata['saved_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    metadata_path = f'{MODEL_DIR}/all_models_metadata.json'
    with open(metadata_path, 'w') as f:
        json.dump(all_models_metadata, f, indent=4)
    print(f"All models metadata saved to: {metadata_path}")

    print("\n" + "="*60)
    print("TRAINING COMPLETE!")
    print("="*60)
    print(f"\nBest model: {best_model_name}")
    print(f"Model type: {'Time-Series' if best_model_name in ['ARIMA', 'Prophet'] else 'Machine Learning'}")
    print(f"Model saved to: {MODEL_DIR}/")
    print(f"Visualizations saved to: {PLOTS_DIR}/")
    print("\nYou can now use predict.py to make predictions!")
|
|
|
|
|
|
|
|
|
# Standard script entry point: run the full training pipeline only when
# executed directly, not when imported as a module.
if __name__ == "__main__":
    main()
|
|
|
|