# vikaswebdev's picture
# Upload 17 files
# 7f90ea0 verified
"""
Demand Prediction System - Model Training Script
This script trains multiple machine learning and time-series models to predict
product demand (sales quantity) for an e-commerce platform.
Features:
- Data preprocessing and feature engineering
- Date feature extraction (day, month, day_of_week, weekend)
- Categorical encoding
- Feature scaling
- Multiple ML models (Linear Regression, Random Forest, XGBoost)
- Time-series models (ARIMA, Prophet)
- Model evaluation (MAE, RMSE, R2 Score)
- Automatic best model selection
- Model persistence using joblib
- Visualization of results
- Comparison between ML and time-series approaches
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import joblib
import os
import warnings
warnings.filterwarnings('ignore')
# Machine Learning imports
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
# Optional dependency: XGBoost (the ML comparison skips it when missing)
try:
    import xgboost as xgb
except ImportError:
    XGBOOST_AVAILABLE = False
    print("XGBoost not available. Install with: pip install xgboost")
else:
    XGBOOST_AVAILABLE = True

# Optional dependency: statsmodels (needed for the ARIMA model)
try:
    from statsmodels.tsa.arima.model import ARIMA
    from statsmodels.tsa.stattools import adfuller
except ImportError:
    ARIMA_AVAILABLE = False
    print("statsmodels not available. Install with: pip install statsmodels")
else:
    ARIMA_AVAILABLE = True

# Optional dependency: Prophet
try:
    from prophet import Prophet
except ImportError:
    PROPHET_AVAILABLE = False
    print("Prophet not available. Install with: pip install prophet")
else:
    PROPHET_AVAILABLE = True
# Seed NumPy's global random generator so repeated runs behave the same
np.random.seed(42)

# Input file and output folders used throughout the script
DATA_PATH = 'data/sales.csv'
MODEL_DIR = 'models'
PLOTS_DIR = 'plots'

# Make sure both output folders exist before anything is written to them
for _output_dir in (MODEL_DIR, PLOTS_DIR):
    os.makedirs(_output_dir, exist_ok=True)
def load_data(file_path):
    """
    Read the sales dataset from a CSV file and report its shape.

    Args:
        file_path: Location of the CSV file (passed straight to pandas.read_csv)

    Returns:
        DataFrame: The parsed CSV contents
    """
    print(f"Loading data from {file_path}...")
    sales_frame = pd.read_csv(file_path)
    frame_shape = sales_frame.shape
    print(f"Data loaded successfully! Shape: {frame_shape}")
    return sales_frame
def preprocess_data(df):
    """
    Parse the date column, derive calendar features and fill numeric gaps.

    The input frame is copied first, so the caller's DataFrame is not modified.
    Missing values are filled with the per-column median of the numeric
    columns only; non-numeric columns are left as they are.

    Args:
        df: Raw DataFrame containing at least a 'date' column

    Returns:
        DataFrame: Copy of the input with 'date' as datetime and the added
        columns day, month, day_of_week, weekend, year and quarter
    """
    rule = "=" * 60
    print("\n" + rule)
    print("PREPROCESSING DATA")
    print(rule)

    # Work on a private copy; the raw frame stays as the caller passed it
    df = df.copy()
    df['date'] = pd.to_datetime(df['date'])

    # Calendar features derived from the parsed date
    print("Extracting date features...")
    stamps = df['date'].dt
    df['day'] = stamps.day
    df['month'] = stamps.month
    df['day_of_week'] = stamps.dayofweek  # Monday is 0, Sunday is 6
    df['weekend'] = (df['day_of_week'] >= 5).astype(int)  # Saturday/Sunday -> 1
    df['year'] = stamps.year
    df['quarter'] = stamps.quarter

    # Report columns with gaps, then fill numeric gaps with the column median
    print("\nMissing values:")
    null_counts = df.isnull().sum()
    print(null_counts[null_counts > 0])
    if null_counts.sum() > 0:
        print("Filling missing values...")
        numeric_medians = df.median(numeric_only=True)
        df = df.fillna(numeric_medians)

    # Summary of the resulting frame
    print("\nDataset Info:")
    print(f"Shape: {df.shape}")
    print(f"\nColumns: {df.columns.tolist()}")
    print(f"\nData types:\n{df.dtypes}")
    print(f"\nBasic statistics:\n{df.describe()}")
    return df
def feature_engineering(df):
    """
    Build the model input matrix: label-encode the categoricals, then
    standardise every feature column.

    NOTE(review): the scaler is fitted on every row passed in. main() only
    splits into training and validation sets afterwards, so validation rows
    contribute to the scaling statistics.

    Args:
        df: Preprocessed DataFrame holding the raw feature columns and
            'sales_quantity'

    Returns:
        tuple: (X_features, y_target, feature_names, encoders, scaler).
        The same scaler object is also stored under encoders['scaler'].
    """
    rule = "=" * 60
    print("\n" + rule)
    print("FEATURE ENGINEERING")
    print(rule)

    # The raw date column is left out; its derived calendar columns are used instead
    selected_columns = [
        'product_id', 'price', 'discount', 'category',
        'day', 'month', 'day_of_week', 'weekend', 'year', 'quarter',
    ]
    X = df[selected_columns].copy()
    y = df['sales_quantity'].copy()

    # Integer-code both categorical columns (product_id is treated as a category)
    print("Encoding categorical variables...")
    category_encoder = LabelEncoder()
    product_encoder = LabelEncoder()
    X['category_encoded'] = category_encoder.fit_transform(X['category'])
    X['product_id_encoded'] = product_encoder.fit_transform(X['product_id'])

    # Only the encoded versions are kept as features
    X = X.drop(columns=['category', 'product_id'])

    feature_names = X.columns.tolist()
    print(f"Features after encoding: {feature_names}")
    print(f"Number of features: {len(feature_names)}")

    # Standardise all remaining columns and wrap the array back into a frame
    print("\nScaling numerical features...")
    scaler = StandardScaler()
    scaled_values = scaler.fit_transform(X)
    X_scaled = pd.DataFrame(scaled_values, columns=feature_names)

    # Bundle the fitted transformers so they can be persisted later
    encoders = {
        'category': category_encoder,
        'product_id': product_encoder,
        'scaler': scaler
    }
    return X_scaled, y, feature_names, encoders, scaler
def _fit_and_score(model, X_train, y_train, X_val, y_val):
    """Fit one regressor, score it on the validation split and print the metrics."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_val)
    mae = mean_absolute_error(y_val, predictions)
    rmse = np.sqrt(mean_squared_error(y_val, predictions))
    r2 = r2_score(y_val, predictions)
    print(f" MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")
    return {
        'model': model,
        'mae': mae,
        'rmse': rmse,
        'r2': r2,
        'predictions': predictions
    }


def train_models(X_train, y_train, X_val, y_val):
    """
    Train the ML regressors and collect their validation metrics.

    Linear Regression and Random Forest are always trained; XGBoost is
    trained only when the optional xgboost import succeeded.

    Args:
        X_train: Training features
        y_train: Training target
        X_val: Validation features
        y_val: Validation target

    Returns:
        dict: Maps model name to a dict with the keys 'model', 'mae', 'rmse',
        'r2' and 'predictions'
    """
    print("\n" + "="*60)
    print("TRAINING MODELS")
    print("="*60)
    results = {}

    # 1. Linear Regression
    print("\n1. Training Linear Regression...")
    results['Linear Regression'] = _fit_and_score(
        LinearRegression(), X_train, y_train, X_val, y_val
    )

    # 2. Random Forest Regressor
    print("\n2. Training Random Forest Regressor...")
    rf_model = RandomForestRegressor(
        n_estimators=100,
        max_depth=15,
        min_samples_split=5,
        min_samples_leaf=2,
        random_state=42,
        n_jobs=-1
    )
    results['Random Forest'] = _fit_and_score(
        rf_model, X_train, y_train, X_val, y_val
    )

    # 3. XGBoost (if available)
    if XGBOOST_AVAILABLE:
        print("\n3. Training XGBoost Regressor...")
        xgb_model = xgb.XGBRegressor(
            n_estimators=100,
            max_depth=6,
            learning_rate=0.1,
            random_state=42,
            n_jobs=-1
        )
        results['XGBoost'] = _fit_and_score(
            xgb_model, X_train, y_train, X_val, y_val
        )
    else:
        print("\n3. XGBoost skipped (not available)")

    return results
def prepare_time_series_data(df, train_fraction=0.8):
    """
    Prepare time-series data by aggregating sales per date.

    The caller's DataFrame is not modified: the date conversion is applied
    to a two-column copy rather than written back into df.

    Args:
        df: DataFrame with 'date' and 'sales_quantity' columns
        train_fraction: Share of the chronologically ordered rows used for
            training; must be strictly between 0 and 1 (default 0.8)

    Returns:
        tuple: (ts_data, train_size) - ts_data has the columns 'ds' (date)
        and 'y' (summed sales) sorted by date; train_size is the number of
        leading rows that belong to the training period

    Raises:
        ValueError: If train_fraction is not strictly between 0 and 1
    """
    if not 0 < train_fraction < 1:
        raise ValueError(
            f"train_fraction must be between 0 and 1 (exclusive), got {train_fraction}"
        )

    print("\n" + "="*60)
    print("PREPARING TIME-SERIES DATA")
    print("="*60)

    # Convert dates on a copy so the input frame keeps its original column
    dated = df[['date', 'sales_quantity']].copy()
    dated['date'] = pd.to_datetime(dated['date'])

    # Aggregate by date
    ts_data = dated.groupby('date')['sales_quantity'].sum().reset_index()
    ts_data = ts_data.sort_values('date').reset_index(drop=True)
    ts_data.columns = ['ds', 'y']  # Prophet expects 'ds' and 'y'

    print(f"Time-series data shape: {ts_data.shape}")
    print(f"Date range: {ts_data['ds'].min()} to {ts_data['ds'].max()}")
    print(f"Total days: {len(ts_data)}")

    # Chronological split: the first train_fraction of the dates is for training
    train_size = int(len(ts_data) * train_fraction)
    return ts_data, train_size
def train_arima(ts_data, train_size):
    """
    Fit several ARIMA orders on the training period, keep the one with the
    lowest AIC and score its forecast on the validation period.

    Args:
        ts_data: Time-series DataFrame with 'ds' and 'y' columns
        train_size: Number of leading rows used for training

    Returns:
        dict: Keys 'model', 'order', 'mae', 'rmse', 'r2', 'predictions',
        'actual' and 'dates'. None is returned when statsmodels is not
        available, when no candidate order could be fitted, or when any
        other error occurs.
    """
    if not ARIMA_AVAILABLE:
        return None

    rule = "=" * 60
    print("\n" + rule)
    print("TRAINING ARIMA MODEL")
    print(rule)

    try:
        # Chronological split: the head is for fitting, the tail for validation
        target = ts_data['y']
        train_data = target.iloc[:train_size].values
        val_data = target.iloc[train_size:].values
        val_dates = ts_data['ds'].iloc[train_size:].values
        print(f"Training on {len(train_data)} samples")
        print(f"Validating on {len(val_data)} samples")

        # Small fixed grid of (p, d, q) orders, all with first-order differencing
        candidate_orders = [
            (1, 1, 1),
            (2, 1, 2),
            (1, 1, 0),  # AR terms only
            (0, 1, 1),  # MA terms only
            (2, 1, 1),
            (1, 1, 2),
        ]
        best_aic, best_order, best_model = np.inf, None, None

        print("Trying different ARIMA orders...")
        for order in candidate_orders:
            try:
                candidate = ARIMA(train_data, order=order).fit()
                aic = candidate.aic
                if not aic < best_aic:
                    print(f" Order {order}: AIC = {aic:.2f}")
                    continue
                best_aic, best_order, best_model = aic, order, candidate
                print(f" Order {order}: AIC = {aic:.2f} (best so far)")
            except Exception as e:
                # A failing order is reported and skipped; the others still run
                print(f" Order {order}: Failed - {str(e)[:50]}")

        if best_model is None:
            print("Failed to fit ARIMA model with any order")
            return None
        print(f"\nBest ARIMA order: {best_order} (AIC: {best_aic:.2f})")

        # Forecast one step per validation row, clipping negatives to zero
        forecast = best_model.forecast(steps=len(val_data))
        forecast = np.maximum(forecast, 0)

        mae = mean_absolute_error(val_data, forecast)
        rmse = np.sqrt(mean_squared_error(val_data, forecast))
        r2 = r2_score(val_data, forecast)
        print(f" MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")

        return {
            'model': best_model,
            'order': best_order,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'predictions': forecast,
            'actual': val_data,
            'dates': val_dates
        }
    except Exception as e:
        print(f"Error training ARIMA: {str(e)}")
        return None
def train_prophet(ts_data, train_size):
    """
    Train a Prophet model on the training period and score it on the rest.

    Predictions are requested for the dates that actually occur in ts_data,
    so each validation prediction lines up with its actual value even when
    the series has missing calendar days.

    Args:
        ts_data: Time-series DataFrame with 'ds' and 'y' columns, ordered by
            'ds' (the split below takes the first train_size rows)
        train_size: Number of leading rows used for training

    Returns:
        dict: Keys 'model', 'mae', 'rmse', 'r2', 'predictions', 'actual',
        'dates' and 'full_forecast' (Prophet's forecast frame for every date
        in ts_data). None is returned when Prophet is not available or when
        any error occurs during training or evaluation.
    """
    if not PROPHET_AVAILABLE:
        return None
    print("\n" + "="*60)
    print("TRAINING PROPHET MODEL")
    print("="*60)
    try:
        # Split data chronologically
        train_data = ts_data.iloc[:train_size].copy()
        val_data = ts_data.iloc[train_size:].copy()
        print(f"Training on {len(train_data)} samples")
        print(f"Validating on {len(val_data)} samples")

        # Weekly and yearly seasonality are enabled; daily (intra-day)
        # seasonality is switched off because the series has one row per date
        model = Prophet(
            daily_seasonality=False,
            weekly_seasonality=True,
            yearly_seasonality=True,
            seasonality_mode='multiplicative',
            changepoint_prior_scale=0.05
        )
        print("Fitting Prophet model...")
        model.fit(train_data)

        # Predict on the series' own dates instead of a synthetic daily range:
        # make_future_dataframe(periods=..., freq='D') would drift away from
        # the validation dates as soon as one calendar day is missing.
        forecast = model.predict(ts_data[['ds']])

        # Rows from train_size onwards correspond one-to-one to val_data
        val_forecast = forecast['yhat'].iloc[train_size:].values
        val_actual = val_data['y'].values

        # Ensure predictions are non-negative
        val_forecast = np.maximum(val_forecast, 0)

        # Calculate metrics
        mae = mean_absolute_error(val_actual, val_forecast)
        rmse = np.sqrt(mean_squared_error(val_actual, val_forecast))
        r2 = r2_score(val_actual, val_forecast)
        print(f" MAE: {mae:.2f}, RMSE: {rmse:.2f}, R2: {r2:.4f}")
        return {
            'model': model,
            'mae': mae,
            'rmse': rmse,
            'r2': r2,
            'predictions': val_forecast,
            'actual': val_actual,
            'dates': val_data['ds'].values,
            'full_forecast': forecast
        }
    except Exception as e:
        # Best effort: report the failure and let the rest of the pipeline run
        print(f"Error training Prophet: {str(e)}")
        import traceback
        traceback.print_exc()
        return None
def select_best_model(results):
    """
    Print a metrics table for all models and pick the one with the highest
    R2 score.

    Args:
        results: Dictionary mapping model name to a dict that holds at least
            'model', 'mae', 'rmse' and 'r2'

    Returns:
        tuple: (best_model_name, best_model, best_metrics), where
        best_metrics contains the keys 'mae', 'rmse' and 'r2'
    """
    rule = "=" * 60
    print("\n" + rule)
    print("MODEL COMPARISON")
    print(rule)

    # One table row per model
    comparison_df = pd.DataFrame([
        {
            'Model': name,
            'MAE': entry['mae'],
            'RMSE': entry['rmse'],
            'R2 Score': entry['r2'],
        }
        for name, entry in results.items()
    ])
    print("\nModel Performance Comparison:")
    print(comparison_df.to_string(index=False))

    # Highest R2 wins; on a tie the earliest entry in the dict is kept
    best_model_name = max(results, key=lambda name: results[name]['r2'])
    winner = results[best_model_name]
    best_model = winner['model']
    best_metrics = {metric: winner[metric] for metric in ('mae', 'rmse', 'r2')}

    print(f"\n{rule}")
    print(f"BEST MODEL: {best_model_name}")
    print(f"MAE: {best_metrics['mae']:.2f}")
    print(f"RMSE: {best_metrics['rmse']:.2f}")
    print(f"R2 Score: {best_metrics['r2']:.4f}")
    print(f"{rule}")
    return best_model_name, best_model, best_metrics
def _save_current_figure(filename):
    """Lay out, save under PLOTS_DIR and close the current matplotlib figure."""
    path = f'{PLOTS_DIR}/{filename}'
    plt.tight_layout()
    plt.savefig(path, dpi=300, bbox_inches='tight')
    print(f" Saved: {path}")
    plt.close()


def _plot_demand_trends(df):
    """Plot the total sales quantity per date as a line chart."""
    # Group on a converted copy of the dates; df itself is left untouched
    dates = pd.to_datetime(df['date'])
    daily_demand = df['sales_quantity'].groupby(dates).sum()
    plt.figure(figsize=(14, 6))
    plt.plot(daily_demand.index, daily_demand.values, linewidth=1, alpha=0.7)
    plt.title('Total Daily Sales Quantity Over Time', fontsize=16, fontweight='bold')
    plt.xlabel('Date', fontsize=12)
    plt.ylabel('Total Sales Quantity', fontsize=12)
    plt.grid(True, alpha=0.3)
    _save_current_figure('demand_trends.png')


def _plot_monthly_demand(df):
    """Plot the mean sales quantity per value of the 'month' column as bars."""
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    monthly_avg = df.groupby('month')['sales_quantity'].mean()
    labels = [month_names[month - 1] for month in monthly_avg.index]
    plt.figure(figsize=(12, 6))
    plt.bar(labels, monthly_avg.values, color='steelblue', alpha=0.7)
    plt.title('Average Sales Quantity by Month', fontsize=16, fontweight='bold')
    plt.xlabel('Month', fontsize=12)
    plt.ylabel('Average Sales Quantity', fontsize=12)
    plt.xticks(rotation=45)
    plt.grid(True, alpha=0.3, axis='y')
    _save_current_figure('monthly_demand.png')


def _plot_feature_importance(best_model, best_model_name, feature_names):
    """Plot feature importances when the model exposes feature_importances_."""
    if not hasattr(best_model, 'feature_importances_'):
        print(" Feature importance not available for this model type")
        return
    feature_importance_df = pd.DataFrame({
        'feature': feature_names,
        'importance': best_model.feature_importances_
    }).sort_values('importance', ascending=False)
    plt.figure(figsize=(10, 6))
    plt.barh(feature_importance_df['feature'], feature_importance_df['importance'],
             color='coral', alpha=0.7)
    plt.title(f'Feature Importance - {best_model_name}', fontsize=16, fontweight='bold')
    plt.xlabel('Importance', fontsize=12)
    plt.ylabel('Feature', fontsize=12)
    plt.gca().invert_yaxis()  # most important feature at the top
    plt.grid(True, alpha=0.3, axis='x')
    _save_current_figure('feature_importance.png')


def _plot_model_comparison(results, ts_model_names):
    """Plot MAE, RMSE and R2 of every model side by side."""
    from matplotlib.patches import Patch

    model_names = list(results.keys())
    # ML models in blue, time-series models in orange/red tones
    colors = []
    for name in model_names:
        if name in ts_model_names:
            colors.append('coral' if name == 'ARIMA' else 'salmon')
        else:
            colors.append('skyblue')

    panels = [
        ('mae', 'MAE Comparison (Lower is Better)', 'MAE'),
        ('rmse', 'RMSE Comparison (Lower is Better)', 'RMSE'),
        ('r2', 'R2 Score Comparison (Higher is Better)', 'R2 Score'),
    ]
    _, axes = plt.subplots(1, 3, figsize=(18, 5))
    for ax, (metric, title, ylabel) in zip(axes, panels):
        scores = [results[name][metric] for name in model_names]
        ax.bar(model_names, scores, color=colors, alpha=0.7)
        ax.set_title(title, fontsize=14, fontweight='bold')
        ax.set_ylabel(ylabel, fontsize=12)
        ax.tick_params(axis='x', rotation=45)
        ax.grid(True, alpha=0.3, axis='y')

    # The legend is attached to the first (MAE) panel only
    legend_elements = [
        Patch(facecolor='skyblue', alpha=0.7, label='ML Models'),
        Patch(facecolor='coral', alpha=0.7, label='Time-Series Models')
    ]
    axes[0].legend(handles=legend_elements, loc='upper right')
    _save_current_figure('model_comparison.png')


def _plot_timeseries_predictions(results, ts_models):
    """Plot actual vs predicted values, one subplot per time-series model."""
    # squeeze=False always yields a 2-D axes array, even for a single model
    _, axes = plt.subplots(len(ts_models), 1, figsize=(14, 6*len(ts_models)),
                           squeeze=False)
    for ax, model_name in zip(axes[:, 0], ts_models):
        entry = results[model_name]
        if 'dates' not in entry:
            continue
        dates = pd.to_datetime(entry['dates'])
        ax.plot(dates, entry['actual'], label='Actual', linewidth=2, alpha=0.7)
        ax.plot(dates, entry['predictions'], label='Predicted', linewidth=2,
                alpha=0.7, linestyle='--')
        ax.set_title(f'{model_name} - Actual vs Predicted', fontsize=14, fontweight='bold')
        ax.set_xlabel('Date', fontsize=12)
        ax.set_ylabel('Sales Quantity', fontsize=12)
        ax.legend()
        ax.grid(True, alpha=0.3)
    _save_current_figure('timeseries_predictions.png')


def visualize_results(df, results, best_model_name, feature_names):
    """
    Create visualizations: demand trends, monthly demand, feature importance,
    model comparison and (when present) time-series predictions.

    All figures are written as PNG files into PLOTS_DIR. The passed DataFrame
    is only read, never modified.

    Args:
        df: DataFrame with 'date', 'month' and 'sales_quantity' columns
        results: Model results dictionary (name -> dict with 'model', 'mae',
            'rmse', 'r2'; time-series entries also carry 'dates', 'actual'
            and 'predictions')
        best_model_name: Name of the best model (a key of results)
        feature_names: List of feature names, in the order the ML models
            were trained on
    """
    print("\n" + "="*60)
    print("GENERATING VISUALIZATIONS")
    print("="*60)

    # Set style
    sns.set_style("whitegrid")
    plt.rcParams['figure.figsize'] = (12, 6)

    print("1. Plotting demand trends over time...")
    _plot_demand_trends(df)

    print("2. Plotting monthly average demand...")
    _plot_monthly_demand(df)

    print("3. Plotting feature importance...")
    _plot_feature_importance(results[best_model_name]['model'], best_model_name,
                             feature_names)

    print("4. Plotting model comparison...")
    ts_model_names = ('ARIMA', 'Prophet')
    _plot_model_comparison(results, ts_model_names)

    # 5. Only drawn when at least one time-series model produced results
    ts_models = [name for name in results if name in ts_model_names]
    if ts_models:
        print("5. Plotting time-series model predictions...")
        _plot_timeseries_predictions(results, ts_models)

    print(" Visualization complete!")
def save_model(model, encoders, scaler, feature_names, best_model_name, best_metrics):
    """
    Persist the chosen model, its preprocessing objects and a JSON summary
    into MODEL_DIR.

    Three files are written: best_model.joblib, preprocessing.joblib and
    model_metadata.json.

    Args:
        model: Trained model
        encoders: Dictionary of encoders
        scaler: Fitted scaler
        feature_names: List of feature names
        best_model_name: Name of the best model
        best_metrics: Dictionary of metrics
    """
    import json

    rule = "=" * 60
    print("\n" + rule)
    print("SAVING MODEL")
    print(rule)

    # The estimator itself
    model_path = f'{MODEL_DIR}/best_model.joblib'
    joblib.dump(model, model_path)
    print(f"Model saved to: {model_path}")

    # Everything needed to rebuild the feature matrix at prediction time
    preprocessing_path = f'{MODEL_DIR}/preprocessing.joblib'
    preprocessing_data = dict(
        encoders=encoders,
        scaler=scaler,
        feature_names=feature_names,
    )
    joblib.dump(preprocessing_data, preprocessing_path)
    print(f"Preprocessing objects saved to: {preprocessing_path}")

    # Human-readable summary of what was saved and when
    saved_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    metadata = dict(
        model_name=best_model_name,
        metrics=best_metrics,
        feature_names=feature_names,
        saved_at=saved_at,
    )
    metadata_path = f'{MODEL_DIR}/model_metadata.json'
    with open(metadata_path, 'w') as handle:
        json.dump(metadata, handle, indent=4)
    print(f"Model metadata saved to: {metadata_path}")
def main():
    """
    Run the whole training pipeline from loading the data to saving the
    winning model.

    NOTE(review): the ML models are scored on individual rows of
    'sales_quantity' after a random split, while ARIMA and Prophet are
    scored on per-date totals after a chronological split, so the metrics
    compared by select_best_model() are computed on different targets.
    """
    import json

    ts_model_names = ['ARIMA', 'Prophet']
    rule_60 = "=" * 60
    rule_70 = "=" * 70

    print("\n" + rule_60)
    print("DEMAND PREDICTION SYSTEM - MODEL TRAINING")
    print("ML Models vs Time-Series Models Comparison")
    print(rule_60)

    # Steps 1-3: load, preprocess, build the ML feature matrix
    df = load_data(DATA_PATH)
    df_processed = preprocess_data(df)
    X, y, feature_names, encoders, scaler = feature_engineering(df_processed)

    # Step 4: random 80/20 split for the ML models
    print("\n" + rule_60)
    print("SPLITTING DATA FOR ML MODELS")
    print(rule_60)
    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=42
    )
    print(f"Training set: {X_train.shape[0]} samples")
    print(f"Validation set: {X_val.shape[0]} samples")

    # Step 5: ML models
    print("\n" + rule_70)
    print("TRAINING MACHINE LEARNING MODELS")
    print(rule_70)
    results = train_models(X_train, y_train, X_val, y_val)

    # Step 6: aggregated series for the time-series models
    ts_data, train_size = prepare_time_series_data(df_processed)

    # Step 7: time-series models, each only when its library is installed
    print("\n" + rule_70)
    print("TRAINING TIME-SERIES MODELS")
    print(rule_70)
    ts_trainers = (
        ('ARIMA', ARIMA_AVAILABLE, train_arima,
         "\nARIMA skipped (statsmodels not available)"),
        ('Prophet', PROPHET_AVAILABLE, train_prophet,
         "\nProphet skipped (prophet not available)"),
    )
    for label, available, trainer, skip_message in ts_trainers:
        if not available:
            print(skip_message)
            continue
        outcome = trainer(ts_data, train_size)
        if outcome:
            results[label] = outcome

    # Step 8: pick the winner across both model families
    best_model_name, best_model, best_metrics = select_best_model(results)
    is_time_series = best_model_name in ts_model_names

    # Step 9: plots
    visualize_results(df_processed, results, best_model_name, feature_names)

    # Step 10: persist the winner; a time-series winner goes to its own file
    if is_time_series:
        print("\n" + rule_60)
        print("SAVING TIME-SERIES MODEL")
        print(rule_60)
        ts_model_path = f'{MODEL_DIR}/best_timeseries_model.joblib'
        joblib.dump(best_model, ts_model_path)
        print(f"Time-series model saved to: {ts_model_path}")

        # The ML preprocessing objects are stored as well, in case the ML
        # models are used later
        preprocessing_path = f'{MODEL_DIR}/preprocessing.joblib'
        preprocessing_data = dict(
            encoders=encoders,
            scaler=scaler,
            feature_names=feature_names,
        )
        joblib.dump(preprocessing_data, preprocessing_path)
        print(f"ML preprocessing objects saved to: {preprocessing_path}")
    else:
        save_model(best_model, encoders, scaler, feature_names,
                   best_model_name, best_metrics)

    # Metrics of every trained model, written next to the saved model
    all_models_metadata = {
        'best_model': best_model_name,
        'best_metrics': best_metrics,
        'all_models': {
            name: {
                'mae': entry['mae'],
                'rmse': entry['rmse'],
                'r2': entry['r2'],
            }
            for name, entry in results.items()
        },
    }
    all_models_metadata['saved_at'] = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    metadata_path = f'{MODEL_DIR}/all_models_metadata.json'
    with open(metadata_path, 'w') as handle:
        json.dump(all_models_metadata, handle, indent=4)
    print(f"All models metadata saved to: {metadata_path}")

    model_type = 'Time-Series' if is_time_series else 'Machine Learning'
    print("\n" + rule_60)
    print("TRAINING COMPLETE!")
    print(rule_60)
    print(f"\nBest model: {best_model_name}")
    print(f"Model type: {model_type}")
    print(f"Model saved to: {MODEL_DIR}/")
    print(f"Visualizations saved to: {PLOTS_DIR}/")
    print("\nYou can now use predict.py to make predictions!")


if __name__ == "__main__":
    main()